diff --git a/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp b/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp index 33ece7f1b4e07..1e5012b9891bf 100644 --- a/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp +++ b/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp @@ -143,18 +143,26 @@ groupReplacements(const TUReplacements &TUs, const TUDiagnostics &TUDs, llvm::DenseMap> GroupedReplacements; - // Deduplicate identical replacements in diagnostics. + // Deduplicate identical replacements in diagnostics unless they are from the + // same TU. // FIXME: Find an efficient way to deduplicate on diagnostics level. - llvm::DenseMap> + llvm::DenseMap> DiagReplacements; - auto AddToGroup = [&](const tooling::Replacement &R, bool FromDiag) { + auto AddToGroup = [&](const tooling::Replacement &R, + const tooling::TranslationUnitDiagnostics *SourceTU) { // Use the file manager to deduplicate paths. FileEntries are // automatically canonicalized. if (auto Entry = SM.getFileManager().getFile(R.getFilePath())) { - if (FromDiag) { + if (SourceTU) { auto &Replaces = DiagReplacements[*Entry]; - if (!Replaces.insert(R).second) + auto It = Replaces.find(R); + if (It == Replaces.end()) + Replaces.emplace(R, SourceTU); + else if (It->second != SourceTU) + // This replacement is a duplicate of one suggested by another TU. return; } GroupedReplacements[*Entry].push_back(R); @@ -166,14 +174,14 @@ groupReplacements(const TUReplacements &TUs, const TUDiagnostics &TUDs, for (const auto &TU : TUs) for (const tooling::Replacement &R : TU.Replacements) - AddToGroup(R, false); + AddToGroup(R, nullptr); for (const auto &TU : TUDs) for (const auto &D : TU.Diagnostics) if (const auto *ChoosenFix = tooling::selectFirstFix(D)) { for (const auto &Fix : *ChoosenFix) for (const tooling::Replacement &R : Fix.second) - AddToGroup(R, true); + AddToGroup(R, &TU); } // Sort replacements per file to keep consistent behavior when diff --git a/clang-tools-extra/clang-tidy/ClangTidyForceLinker.h b/clang-tools-extra/clang-tidy/ClangTidyForceLinker.h index 8d29e62c44087..1d6bd2a4fd621 100644 --- a/clang-tools-extra/clang-tidy/ClangTidyForceLinker.h +++ b/clang-tools-extra/clang-tidy/ClangTidyForceLinker.h @@ -15,16 +15,16 @@ namespace clang { namespace tidy { -// This anchor is used to force the linker to link the CERTModule. -extern volatile int CERTModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED CERTModuleAnchorDestination = - CERTModuleAnchorSource; - // This anchor is used to force the linker to link the AbseilModule. extern volatile int AbseilModuleAnchorSource; static int LLVM_ATTRIBUTE_UNUSED AbseilModuleAnchorDestination = AbseilModuleAnchorSource; +// This anchor is used to force the linker to link the AndroidModule. +extern volatile int AndroidModuleAnchorSource; +static int LLVM_ATTRIBUTE_UNUSED AndroidModuleAnchorDestination = + AndroidModuleAnchorSource; + // This anchor is used to force the linker to link the BoostModule. extern volatile int BoostModuleAnchorSource; static int LLVM_ATTRIBUTE_UNUSED BoostModuleAnchorDestination = @@ -35,20 +35,10 @@ extern volatile int BugproneModuleAnchorSource; static int LLVM_ATTRIBUTE_UNUSED BugproneModuleAnchorDestination = BugproneModuleAnchorSource; -// This anchor is used to force the linker to link the LinuxKernelModule. 
-extern volatile int LinuxKernelModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED LinuxKernelModuleAnchorDestination = - LinuxKernelModuleAnchorSource; - -// This anchor is used to force the linker to link the LLVMModule. -extern volatile int LLVMModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED LLVMModuleAnchorDestination = - LLVMModuleAnchorSource; - -// This anchor is used to force the linker to link the LLVMLibcModule. -extern volatile int LLVMLibcModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED LLVMLibcModuleAnchorDestination = - LLVMLibcModuleAnchorSource; +// This anchor is used to force the linker to link the CERTModule. +extern volatile int CERTModuleAnchorSource; +static int LLVM_ATTRIBUTE_UNUSED CERTModuleAnchorDestination = + CERTModuleAnchorSource; // This anchor is used to force the linker to link the CppCoreGuidelinesModule. extern volatile int CppCoreGuidelinesModuleAnchorSource; @@ -70,10 +60,25 @@ extern volatile int GoogleModuleAnchorSource; static int LLVM_ATTRIBUTE_UNUSED GoogleModuleAnchorDestination = GoogleModuleAnchorSource; -// This anchor is used to force the linker to link the AndroidModule. -extern volatile int AndroidModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED AndroidModuleAnchorDestination = - AndroidModuleAnchorSource; +// This anchor is used to force the linker to link the HICPPModule. +extern volatile int HICPPModuleAnchorSource; +static int LLVM_ATTRIBUTE_UNUSED HICPPModuleAnchorDestination = + HICPPModuleAnchorSource; + +// This anchor is used to force the linker to link the LinuxKernelModule. +extern volatile int LinuxKernelModuleAnchorSource; +static int LLVM_ATTRIBUTE_UNUSED LinuxKernelModuleAnchorDestination = + LinuxKernelModuleAnchorSource; + +// This anchor is used to force the linker to link the LLVMModule. +extern volatile int LLVMModuleAnchorSource; +static int LLVM_ATTRIBUTE_UNUSED LLVMModuleAnchorDestination = + LLVMModuleAnchorSource; + +// This anchor is used to force the linker to link the LLVMLibcModule. +extern volatile int LLVMLibcModuleAnchorSource; +static int LLVM_ATTRIBUTE_UNUSED LLVMLibcModuleAnchorDestination = + LLVMLibcModuleAnchorSource; // This anchor is used to force the linker to link the MiscModule. extern volatile int MiscModuleAnchorSource; @@ -93,6 +98,11 @@ static int LLVM_ATTRIBUTE_UNUSED MPIModuleAnchorDestination = MPIModuleAnchorSource; #endif +// This anchor is used to force the linker to link the ObjCModule. +extern volatile int ObjCModuleAnchorSource; +static int LLVM_ATTRIBUTE_UNUSED ObjCModuleAnchorDestination = + ObjCModuleAnchorSource; + // This anchor is used to force the linker to link the OpenMPModule. extern volatile int OpenMPModuleAnchorSource; static int LLVM_ATTRIBUTE_UNUSED OpenMPModuleAnchorDestination = @@ -113,16 +123,6 @@ extern volatile int ReadabilityModuleAnchorSource; static int LLVM_ATTRIBUTE_UNUSED ReadabilityModuleAnchorDestination = ReadabilityModuleAnchorSource; -// This anchor is used to force the linker to link the ObjCModule. -extern volatile int ObjCModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED ObjCModuleAnchorDestination = - ObjCModuleAnchorSource; - -// This anchor is used to force the linker to link the HICPPModule. -extern volatile int HICPPModuleAnchorSource; -static int LLVM_ATTRIBUTE_UNUSED HICPPModuleAnchorDestination = - HICPPModuleAnchorSource; - // This anchor is used to force the linker to link the ZirconModule. 
extern volatile int ZirconModuleAnchorSource; static int LLVM_ATTRIBUTE_UNUSED ZirconModuleAnchorDestination = diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp index 9dcb315a257a4..d010c3ce7e522 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp @@ -41,6 +41,7 @@ #include "SignedCharMisuseCheck.h" #include "SizeofContainerCheck.h" #include "SizeofExpressionCheck.h" +#include "SpuriouslyWakeUpFunctionsCheck.h" #include "StringConstructorCheck.h" #include "StringIntegerAssignmentCheck.h" #include "StringLiteralWithEmbeddedNulCheck.h" @@ -133,6 +134,8 @@ class BugproneModule : public ClangTidyModule { "bugprone-sizeof-container"); CheckFactories.registerCheck( "bugprone-sizeof-expression"); + CheckFactories.registerCheck( + "bugprone-spuriously-wake-up-functions"); CheckFactories.registerCheck( "bugprone-string-constructor"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt index a24f3bc7eb0d0..4aa3b325ce247 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt @@ -33,6 +33,7 @@ add_clang_library(clangTidyBugproneModule SignedCharMisuseCheck.cpp SizeofContainerCheck.cpp SizeofExpressionCheck.cpp + SpuriouslyWakeUpFunctionsCheck.cpp StringConstructorCheck.cpp StringIntegerAssignmentCheck.cpp StringLiteralWithEmbeddedNulCheck.cpp diff --git a/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp new file mode 100644 index 0000000000000..844d672f121fb --- /dev/null +++ b/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.cpp @@ -0,0 +1,108 @@ +//===--- SpuriouslyWakeUpFunctionsCheck.cpp - clang-tidy ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SpuriouslyWakeUpFunctionsCheck.h" +#include "clang/AST/ASTContext.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" + +using namespace clang::ast_matchers; + +namespace clang { +namespace tidy { +namespace bugprone { + +void SpuriouslyWakeUpFunctionsCheck::registerMatchers(MatchFinder *Finder) { + + auto hasUniqueLock = hasDescendant(declRefExpr( + hasDeclaration(varDecl(hasType(recordDecl(classTemplateSpecializationDecl( + hasName("::std::unique_lock"), + hasTemplateArgument( + 0, templateArgument(refersToType(qualType(hasDeclaration( + cxxRecordDecl(hasName("::std::mutex")))))))))))))); + + auto hasWaitDescendantCPP = hasDescendant( + cxxMemberCallExpr( + anyOf( + allOf(hasDescendant(memberExpr(hasDeclaration(functionDecl( + allOf(hasName("::std::condition_variable::wait"), + parameterCountIs(1)))))), + onImplicitObjectArgument( + declRefExpr(to(varDecl(hasType(references(recordDecl( + hasName("::std::condition_variable")))))))), + hasUniqueLock), + allOf(hasDescendant(memberExpr(hasDeclaration(functionDecl( + allOf(hasName("::std::condition_variable::wait_for"), + parameterCountIs(2)))))), + onImplicitObjectArgument( + declRefExpr(to(varDecl(hasType(references(recordDecl( + hasName("::std::condition_variable")))))))), + hasUniqueLock), + allOf(hasDescendant(memberExpr(hasDeclaration(functionDecl( + allOf(hasName("::std::condition_variable::wait_until"), + parameterCountIs(2)))))), + onImplicitObjectArgument( + declRefExpr(to(varDecl(hasType(references(recordDecl( + hasName("::std::condition_variable")))))))), + hasUniqueLock) + + )) + .bind("wait")); + + auto hasWaitDescendantC = hasDescendant( + callExpr(callee(functionDecl( + anyOf(hasName("cnd_wait"), hasName("cnd_timedwait"))))) + .bind("wait")); + if (getLangOpts().CPlusPlus) { + // Check for `CON54-CPP` + Finder->addMatcher( + ifStmt( + + allOf(hasWaitDescendantCPP, + unless(anyOf(hasDescendant(ifStmt(hasWaitDescendantCPP)), + hasDescendant(whileStmt(hasWaitDescendantCPP)), + hasDescendant(forStmt(hasWaitDescendantCPP)), + hasDescendant(doStmt(hasWaitDescendantCPP))))) + + ), + this); + } else { + // Check for `CON36-C` + Finder->addMatcher( + + ifStmt( + allOf(hasWaitDescendantC, + unless(anyOf(hasDescendant(ifStmt(hasWaitDescendantC)), + hasDescendant(whileStmt(hasWaitDescendantC)), + hasDescendant(forStmt(hasWaitDescendantC)), + hasDescendant(doStmt(hasWaitDescendantC)), + hasParent(whileStmt()), + hasParent(compoundStmt(hasParent(whileStmt()))), + hasParent(forStmt()), + hasParent(compoundStmt(hasParent(forStmt()))), + hasParent(doStmt()), + hasParent(compoundStmt(hasParent(doStmt()))))) + + )) + + , + this); + } +} + +void SpuriouslyWakeUpFunctionsCheck::check( + const MatchFinder::MatchResult &Result) { + const auto *MatchedWait = Result.Nodes.getNodeAs("wait"); + StringRef WaitName = MatchedWait->getDirectCallee()->getName(); + diag(MatchedWait->getExprLoc(), + "'%0' should be placed inside a while statement %select{|or used with a " + "conditional parameter}1") + << WaitName << (WaitName != "cnd_wait" && WaitName != "cnd_timedwait"); +} +} // namespace bugprone +} // namespace tidy +} // namespace clang diff --git a/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.h b/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.h new file mode 100644 index 0000000000000..d2d3745769f75 --- /dev/null +++ 
b/clang-tools-extra/clang-tidy/bugprone/SpuriouslyWakeUpFunctionsCheck.h @@ -0,0 +1,37 @@ +//===--- SpuriouslyWakeUpFunctionsCheck.h - clang-tidy ----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SPURIOUSLYWAKEUPFUNCTIONSCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SPURIOUSLYWAKEUPFUNCTIONSCHECK_H + +#include "../ClangTidyCheck.h" + +namespace clang { +namespace tidy { +namespace bugprone { + +/// Finds ``cnd_wait``, ``cnd_timedwait``, ``wait``, ``wait_for``, or +/// ``wait_until`` function calls when the function is not invoked from a loop +/// that checks whether a condition predicate holds or the function has a +/// condition parameter. +/// +/// For the user-facing documentation see: +/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone-spuriously-wake-up-functions.html +class SpuriouslyWakeUpFunctionsCheck : public ClangTidyCheck { +public: + SpuriouslyWakeUpFunctionsCheck(StringRef Name, ClangTidyContext *Context) + : ClangTidyCheck(Name, Context) {} + void registerMatchers(ast_matchers::MatchFinder *Finder) override; + void check(const ast_matchers::MatchFinder::MatchResult &Result) override; +}; + +} // namespace bugprone +} // namespace tidy +} // namespace clang + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_SPURIOUSLYWAKEUPFUNCTIONSCHECK_H diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp index 226526d319701..6459dcf5627d9 100644 --- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp @@ -11,6 +11,7 @@ #include "../ClangTidyModuleRegistry.h" #include "../bugprone/BadSignalToKillThreadCheck.h" #include "../bugprone/ReservedIdentifierCheck.h" +#include "../bugprone/SpuriouslyWakeUpFunctionsCheck.h" #include "../bugprone/UnhandledSelfAssignmentCheck.h" #include "../google/UnnamedNamespaceInHeaderCheck.h" #include "../misc/NewDeleteOverloadsCheck.h" @@ -42,6 +43,9 @@ class CERTModule : public ClangTidyModule { public: void addCheckFactories(ClangTidyCheckFactories &CheckFactories) override { // C++ checkers + // CON + CheckFactories.registerCheck( + "cert-con54-cpp"); // DCL CheckFactories.registerCheck( "cert-dcl21-cpp"); @@ -80,6 +84,9 @@ class CERTModule : public ClangTidyModule { "cert-oop58-cpp"); // C checkers + // CON + CheckFactories.registerCheck( + "cert-con36-c"); // DCL CheckFactories.registerCheck("cert-dcl03-c"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/llvmlibc/CMakeLists.txt b/clang-tools-extra/clang-tidy/llvmlibc/CMakeLists.txt index c03d8b677f263..cc213d35a5728 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/llvmlibc/CMakeLists.txt @@ -10,6 +10,7 @@ add_clang_library(clangTidyLLVMLibcModule clangBasic clangLex clangTidy + clangTidyPortabilityModule clangTidyUtils clangTooling ) diff --git a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp index 8a597c0b2a246..4a19b5359d4f9 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.cpp @@ -18,12 +18,14 @@ namespace llvm_libc { namespace { -class RestrictedIncludesPPCallbacks : public PPCallbacks { +class RestrictedIncludesPPCallbacks + : public portability::RestrictedIncludesPPCallbacks { public: explicit RestrictedIncludesPPCallbacks( RestrictSystemLibcHeadersCheck &Check, const SourceManager &SM, const SmallString<128> CompilerIncudeDir) - : Check(Check), SM(SM), CompilerIncudeDir(CompilerIncudeDir) {} + : portability::RestrictedIncludesPPCallbacks(Check, SM), + CompilerIncudeDir(CompilerIncudeDir) {} void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, @@ -33,8 +35,6 @@ class RestrictedIncludesPPCallbacks : public PPCallbacks { SrcMgr::CharacteristicKind FileType) override; private: - RestrictSystemLibcHeadersCheck &Check; - const SourceManager &SM; const SmallString<128> CompilerIncudeDir; }; @@ -45,18 +45,12 @@ void RestrictedIncludesPPCallbacks::InclusionDirective( bool IsAngled, CharSourceRange FilenameRange, const FileEntry *File, StringRef SearchPath, StringRef RelativePath, const Module *Imported, SrcMgr::CharacteristicKind FileType) { - if (SrcMgr::isSystem(FileType)) { - // Compiler provided headers are allowed (e.g stddef.h). - if (SearchPath == CompilerIncudeDir) return; - if (!SM.isInMainFile(HashLoc)) { - Check.diag( - HashLoc, - "system libc header %0 not allowed, transitively included from %1") - << FileName << SM.getFilename(HashLoc); - } else { - Check.diag(HashLoc, "system libc header %0 not allowed") << FileName; - } - } + // Compiler provided headers are allowed (e.g stddef.h). + if (SrcMgr::isSystem(FileType) && SearchPath == CompilerIncudeDir) + return; + portability::RestrictedIncludesPPCallbacks::InclusionDirective( + HashLoc, IncludeTok, FileName, IsAngled, FilenameRange, File, SearchPath, + RelativePath, Imported, FileType); } void RestrictSystemLibcHeadersCheck::registerPPCallbacks( diff --git a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.h b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.h index 3910a29a28e4f..9eead7a228828 100644 --- a/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.h +++ b/clang-tools-extra/clang-tidy/llvmlibc/RestrictSystemLibcHeadersCheck.h @@ -10,6 +10,7 @@ #define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_LLVMLIBC_RESTRICTSYSTEMLIBCHEADERSCHECK_H #include "../ClangTidyCheck.h" +#include "../portability/RestrictSystemIncludesCheck.h" namespace clang { namespace tidy { @@ -20,10 +21,11 @@ namespace llvm_libc { /// /// For the user-facing documentation see: /// http://clang.llvm.org/extra/clang-tidy/checks/llvmlibc-restrict-system-libc-headers.html -class RestrictSystemLibcHeadersCheck : public ClangTidyCheck { +class RestrictSystemLibcHeadersCheck + : public portability::RestrictSystemIncludesCheck { public: RestrictSystemLibcHeadersCheck(StringRef Name, ClangTidyContext *Context) - : ClangTidyCheck(Name, Context) {} + : portability::RestrictSystemIncludesCheck(Name, Context, "-*") {} void registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) override; }; diff --git a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp index 93ccd5492af76..5fc973223ea3b 100644 --- a/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp +++ 
b/clang-tools-extra/clang-tidy/misc/UnconventionalAssignOperatorCheck.cpp @@ -75,7 +75,10 @@ void UnconventionalAssignOperatorCheck::check( } else { static const char *const Messages[][2] = { {"ReturnType", "operator=() should return '%0&'"}, - {"ArgumentType", "operator=() should take '%0 const&', '%0&&' or '%0'"}, + {"ArgumentType", + getLangOpts().CPlusPlus11 + ? "operator=() should take '%0 const&', '%0&&' or '%0'" + : "operator=() should take '%0 const&' or '%0'"}, {"cv", "operator=() should not be marked '%1'"}}; const auto *Method = Result.Nodes.getNodeAs("method"); diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp index 15076d01a7713..f6163989a461a 100644 --- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp +++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.cpp @@ -20,42 +20,6 @@ namespace clang { namespace tidy { namespace portability { -class RestrictedIncludesPPCallbacks : public PPCallbacks { -public: - explicit RestrictedIncludesPPCallbacks(RestrictSystemIncludesCheck &Check, - const SourceManager &SM) - : Check(Check), SM(SM) {} - - void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok, - StringRef FileName, bool IsAngled, - CharSourceRange FilenameRange, const FileEntry *File, - StringRef SearchPath, StringRef RelativePath, - const Module *Imported, - SrcMgr::CharacteristicKind FileType) override; - void EndOfMainFile() override; - -private: - struct IncludeDirective { - IncludeDirective() = default; - IncludeDirective(SourceLocation Loc, CharSourceRange Range, - StringRef Filename, StringRef FullPath, bool IsInMainFile) - : Loc(Loc), Range(Range), IncludeFile(Filename), IncludePath(FullPath), - IsInMainFile(IsInMainFile) {} - - SourceLocation Loc; // '#' location in the include directive - CharSourceRange Range; // SourceRange for the file name - std::string IncludeFile; // Filename as a string - std::string IncludePath; // Full file path as a string - bool IsInMainFile; // Whether or not the include is in the main file - }; - - using FileIncludes = llvm::SmallVector; - llvm::SmallDenseMap IncludeDirectives; - - RestrictSystemIncludesCheck &Check; - const SourceManager &SM; -}; - void RestrictedIncludesPPCallbacks::InclusionDirective( SourceLocation HashLoc, const Token &IncludeTok, StringRef FileName, bool IsAngled, CharSourceRange FilenameRange, const FileEntry *File, diff --git a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h index db2f9935534b4..c34f054fba2e0 100644 --- a/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h +++ b/clang-tools-extra/clang-tidy/portability/RestrictSystemIncludesCheck.h @@ -23,9 +23,10 @@ namespace portability { /// http://clang.llvm.org/extra/clang-tidy/checks/portability-restrict-system-includes.html class RestrictSystemIncludesCheck : public ClangTidyCheck { public: - RestrictSystemIncludesCheck(StringRef Name, ClangTidyContext *Context) + RestrictSystemIncludesCheck(StringRef Name, ClangTidyContext *Context, + std::string DefaultAllowedIncludes = "*") : ClangTidyCheck(Name, Context), - AllowedIncludes(Options.get("Includes", "*")), + AllowedIncludes(Options.get("Includes", DefaultAllowedIncludes)), AllowedIncludesGlobList(AllowedIncludes) {} void registerPPCallbacks(const SourceManager &SM, Preprocessor *PP, @@ -40,8 +41,44 @@ class 
RestrictSystemIncludesCheck : public ClangTidyCheck { GlobList AllowedIncludesGlobList; }; +class RestrictedIncludesPPCallbacks : public PPCallbacks { +public: + explicit RestrictedIncludesPPCallbacks(RestrictSystemIncludesCheck &Check, + const SourceManager &SM) + : Check(Check), SM(SM) {} + + void InclusionDirective(SourceLocation HashLoc, const Token &IncludeTok, + StringRef FileName, bool IsAngled, + CharSourceRange FilenameRange, const FileEntry *File, + StringRef SearchPath, StringRef RelativePath, + const Module *Imported, + SrcMgr::CharacteristicKind FileType) override; + void EndOfMainFile() override; + +private: + struct IncludeDirective { + IncludeDirective() = default; + IncludeDirective(SourceLocation Loc, CharSourceRange Range, + StringRef Filename, StringRef FullPath, bool IsInMainFile) + : Loc(Loc), Range(Range), IncludeFile(Filename), IncludePath(FullPath), + IsInMainFile(IsInMainFile) {} + + SourceLocation Loc; // '#' location in the include directive + CharSourceRange Range; // SourceRange for the file name + std::string IncludeFile; // Filename as a string + std::string IncludePath; // Full file path as a string + bool IsInMainFile; // Whether or not the include is in the main file + }; + + using FileIncludes = llvm::SmallVector; + llvm::SmallDenseMap IncludeDirectives; + + RestrictSystemIncludesCheck &Check; + const SourceManager &SM; +}; + } // namespace portability } // namespace tidy } // namespace clang -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_RESTRICTINCLUDESSCHECK_H \ No newline at end of file +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_PORTABILITY_RESTRICTINCLUDESSCHECK_H diff --git a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp index c1145802aaa41..3523cf5dcf164 100644 --- a/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp +++ b/clang-tools-extra/clang-tidy/utils/RenamerClangTidyCheck.cpp @@ -203,14 +203,15 @@ void RenamerClangTidyCheck::check(const MatchFinder::MatchResult &Result) { } if (const auto *Loc = Result.Nodes.getNodeAs("typeLoc")) { + UnqualTypeLoc Unqual = Loc->getUnqualifiedLoc(); NamedDecl *Decl = nullptr; - if (const auto &Ref = Loc->getAs()) + if (const auto &Ref = Unqual.getAs()) Decl = Ref.getDecl(); - else if (const auto &Ref = Loc->getAs()) + else if (const auto &Ref = Unqual.getAs()) Decl = Ref.getDecl(); - else if (const auto &Ref = Loc->getAs()) + else if (const auto &Ref = Unqual.getAs()) Decl = Ref.getDecl(); - else if (const auto &Ref = Loc->getAs()) + else if (const auto &Ref = Unqual.getAs()) Decl = Ref.getDecl(); // further TypeLocs handled below @@ -272,6 +273,11 @@ void RenamerClangTidyCheck::check(const MatchFinder::MatchResult &Result) { } if (const auto *Decl = Result.Nodes.getNodeAs("decl")) { + // Fix using namespace declarations. 
+ if (const auto *UsingNS = dyn_cast(Decl)) + addUsage(NamingCheckFailures, UsingNS->getNominatedNamespaceAsWritten(), + UsingNS->getIdentLocation()); + if (!Decl->getIdentifier() || Decl->getName().empty() || Decl->isImplicit()) return; diff --git a/clang-tools-extra/clangd/ClangdLSPServer.cpp b/clang-tools-extra/clangd/ClangdLSPServer.cpp index 748269d5aef4b..d0e8d139a40e8 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.cpp +++ b/clang-tools-extra/clangd/ClangdLSPServer.cpp @@ -485,8 +485,8 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params, } } - ClangdServerOpts.SemanticHighlighting = - Params.capabilities.SemanticHighlighting; + ClangdServerOpts.TheiaSemanticHighlighting = + Params.capabilities.TheiaSemanticHighlighting; if (Params.rootUri && *Params.rootUri) ClangdServerOpts.WorkspaceRoot = std::string(Params.rootUri->file()); else if (Params.rootPath && !Params.rootPath->empty()) @@ -611,7 +611,7 @@ void ClangdLSPServer::onInitialize(const InitializeParams &Params, }}}}; if (NegotiatedOffsetEncoding) Result["offsetEncoding"] = *NegotiatedOffsetEncoding; - if (Params.capabilities.SemanticHighlighting) + if (Params.capabilities.TheiaSemanticHighlighting) Result.getObject("capabilities") ->insert( {"semanticHighlighting", @@ -1169,8 +1169,8 @@ void ClangdLSPServer::applyConfiguration( reparseOpenedFiles(ModifiedFiles); } -void ClangdLSPServer::publishSemanticHighlighting( - const SemanticHighlightingParams &Params) { +void ClangdLSPServer::publishTheiaSemanticHighlighting( + const TheiaSemanticHighlightingParams &Params) { notify("textDocument/semanticHighlighting", Params); } @@ -1376,12 +1376,12 @@ void ClangdLSPServer::onHighlightingsReady( // LSP allows us to send incremental edits of highlightings. Also need to diff // to remove highlightings from tokens that should no longer have them. std::vector Diffed = diffHighlightings(Highlightings, Old); - SemanticHighlightingParams Notification; + TheiaSemanticHighlightingParams Notification; Notification.TextDocument.uri = URIForFile::canonicalize(File, /*TUPath=*/File); Notification.TextDocument.version = decodeVersion(Version); - Notification.Lines = toSemanticHighlightingInformation(Diffed); - publishSemanticHighlighting(Notification); + Notification.Lines = toTheiaSemanticHighlightingInformation(Diffed); + publishTheiaSemanticHighlighting(Notification); } void ClangdLSPServer::onDiagnosticsReady(PathRef File, llvm::StringRef Version, diff --git a/clang-tools-extra/clangd/ClangdLSPServer.h b/clang-tools-extra/clangd/ClangdLSPServer.h index e70b7b56a4f2f..c4e9e5fb679c5 100644 --- a/clang-tools-extra/clangd/ClangdLSPServer.h +++ b/clang-tools-extra/clangd/ClangdLSPServer.h @@ -135,7 +135,8 @@ class ClangdLSPServer : private ClangdServer::Callbacks { void applyConfiguration(const ConfigurationSettings &Settings); /// Sends a "publishSemanticHighlighting" notification to the LSP client. - void publishSemanticHighlighting(const SemanticHighlightingParams &); + void + publishTheiaSemanticHighlighting(const TheiaSemanticHighlightingParams &); /// Sends a "publishDiagnostics" notification to the LSP client. 
void publishDiagnostics(const PublishDiagnosticsParams &); diff --git a/clang-tools-extra/clangd/ClangdServer.cpp b/clang-tools-extra/clangd/ClangdServer.cpp index 5d2bfa7c8c575..3d68f85b6487d 100644 --- a/clang-tools-extra/clangd/ClangdServer.cpp +++ b/clang-tools-extra/clangd/ClangdServer.cpp @@ -58,9 +58,9 @@ namespace { struct UpdateIndexCallbacks : public ParsingCallbacks { UpdateIndexCallbacks(FileIndex *FIndex, ClangdServer::Callbacks *ServerCallbacks, - bool SemanticHighlighting) + bool TheiaSemanticHighlighting) : FIndex(FIndex), ServerCallbacks(ServerCallbacks), - SemanticHighlighting(SemanticHighlighting) {} + TheiaSemanticHighlighting(TheiaSemanticHighlighting) {} void onPreambleAST(PathRef Path, llvm::StringRef Version, ASTContext &Ctx, std::shared_ptr PP, @@ -75,14 +75,14 @@ struct UpdateIndexCallbacks : public ParsingCallbacks { std::vector Diagnostics = AST.getDiagnostics(); std::vector Highlightings; - if (SemanticHighlighting) + if (TheiaSemanticHighlighting) Highlightings = getSemanticHighlightings(AST); if (ServerCallbacks) Publish([&]() { ServerCallbacks->onDiagnosticsReady(Path, AST.version(), std::move(Diagnostics)); - if (SemanticHighlighting) + if (TheiaSemanticHighlighting) ServerCallbacks->onHighlightingsReady(Path, AST.version(), std::move(Highlightings)); }); @@ -103,7 +103,7 @@ struct UpdateIndexCallbacks : public ParsingCallbacks { private: FileIndex *FIndex; ClangdServer::Callbacks *ServerCallbacks; - bool SemanticHighlighting; + bool TheiaSemanticHighlighting; }; } // namespace @@ -112,7 +112,7 @@ ClangdServer::Options ClangdServer::optsForTest() { Opts.UpdateDebounce = DebouncePolicy::fixed(/*zero*/ {}); Opts.StorePreamblesInMemory = true; Opts.AsyncThreadsCount = 4; // Consistent! - Opts.SemanticHighlighting = true; + Opts.TheiaSemanticHighlighting = true; return Opts; } @@ -142,8 +142,8 @@ ClangdServer::ClangdServer(const GlobalCompilationDatabase &CDB, // critical paths. WorkScheduler( CDB, TUScheduler::Options(Opts), - std::make_unique(DynamicIdx.get(), Callbacks, - Opts.SemanticHighlighting)) { + std::make_unique( + DynamicIdx.get(), Callbacks, Opts.TheiaSemanticHighlighting)) { // Adds an index to the stack, at higher priority than existing indexes. auto AddIndex = [&](SymbolIndex *Idx) { if (this->Index != nullptr) { diff --git a/clang-tools-extra/clangd/ClangdServer.h b/clang-tools-extra/clangd/ClangdServer.h index 4c3fe56dd7e2f..ae3dd8a065d8a 100644 --- a/clang-tools-extra/clangd/ClangdServer.h +++ b/clang-tools-extra/clangd/ClangdServer.h @@ -143,7 +143,7 @@ class ClangdServer { std::vector QueryDriverGlobs; /// Enable semantic highlighting features. - bool SemanticHighlighting = false; + bool TheiaSemanticHighlighting = false; /// Returns true if the tweak should be enabled. 
std::function TweakFilter = [](const Tweak &T) { diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index 5c1288c14b586..1d41f0a3e0461 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -520,6 +520,49 @@ llvm::Optional getHoverContents(const Expr *E, ParsedAST &AST) { } return llvm::None; } + +bool isParagraphLineBreak(llvm::StringRef Str, size_t LineBreakIndex) { + return Str.substr(LineBreakIndex + 1) + .drop_while([](auto C) { return C == ' ' || C == '\t'; }) + .startswith("\n"); +}; + +bool isPunctuationLineBreak(llvm::StringRef Str, size_t LineBreakIndex) { + constexpr llvm::StringLiteral Punctuation = R"txt(.:,;!?)txt"; + + return LineBreakIndex > 0 && Punctuation.contains(Str[LineBreakIndex - 1]); +}; + +bool isFollowedByHardLineBreakIndicator(llvm::StringRef Str, + size_t LineBreakIndex) { + // '-'/'*' md list, '@'/'\' documentation command, '>' md blockquote, + // '#' headings, '`' code blocks + constexpr llvm::StringLiteral LinbreakIdenticators = R"txt(-*@\>#`)txt"; + + auto NextNonSpaceCharIndex = Str.find_first_not_of(' ', LineBreakIndex + 1); + + if (NextNonSpaceCharIndex == llvm::StringRef::npos) { + return false; + } + + auto FollowedBySingleCharIndicator = + LinbreakIdenticators.find(Str[NextNonSpaceCharIndex]) != + llvm::StringRef::npos; + + auto FollowedByNumberedListIndicator = + llvm::isDigit(Str[NextNonSpaceCharIndex]) && + NextNonSpaceCharIndex + 1 < Str.size() && + (Str[NextNonSpaceCharIndex + 1] == '.' || + Str[NextNonSpaceCharIndex + 1] == ')'); + + return FollowedBySingleCharIndicator || FollowedByNumberedListIndicator; +}; + +bool isHardLineBreak(llvm::StringRef Str, size_t LineBreakIndex) { + return isPunctuationLineBreak(Str, LineBreakIndex) || + isFollowedByHardLineBreakIndicator(Str, LineBreakIndex); +} + } // namespace llvm::Optional getHover(ParsedAST &AST, Position Pos, @@ -652,7 +695,7 @@ markup::Document HoverInfo::present() const { } if (!Documentation.empty()) - Output.addParagraph().appendText(Documentation); + parseDocumentation(Documentation, Output); if (!Definition.empty()) { Output.addRuler(); @@ -675,6 +718,45 @@ markup::Document HoverInfo::present() const { return Output; } +void parseDocumentation(llvm::StringRef Input, markup::Document &Output) { + + constexpr auto WhiteSpaceChars = "\t\n\v\f\r "; + + auto TrimmedInput = Input.trim(); + + std::string CurrentLine; + + for (size_t CharIndex = 0; CharIndex < TrimmedInput.size();) { + if (TrimmedInput[CharIndex] == '\n') { + // Trim whitespace infront of linebreak + const auto LastNonSpaceCharIndex = + CurrentLine.find_last_not_of(WhiteSpaceChars) + 1; + CurrentLine.erase(LastNonSpaceCharIndex); + + if (isParagraphLineBreak(TrimmedInput, CharIndex) || + isHardLineBreak(TrimmedInput, CharIndex)) { + // FIXME: maybe distinguish between line breaks and paragraphs + Output.addParagraph().appendText(CurrentLine); + CurrentLine = ""; + } else { + // Ommit linebreak + CurrentLine += ' '; + } + + CharIndex++; + // After a linebreak always remove spaces to avoid 4 space markdown code + // blocks, also skip all additional linebreaks since they have no effect + CharIndex = TrimmedInput.find_first_not_of(WhiteSpaceChars, CharIndex); + } else { + CurrentLine += TrimmedInput[CharIndex]; + CharIndex++; + } + } + if (!CurrentLine.empty()) { + Output.addParagraph().appendText(CurrentLine); + } +} + llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const HoverInfo::Param &P) { std::vector Output; diff --git 
a/clang-tools-extra/clangd/Hover.h b/clang-tools-extra/clangd/Hover.h index 40a10ff6a63fe..ef3bd9f22d95f 100644 --- a/clang-tools-extra/clangd/Hover.h +++ b/clang-tools-extra/clangd/Hover.h @@ -74,6 +74,10 @@ struct HoverInfo { /// Produce a user-readable information. markup::Document present() const; }; + +// Try to infer structure of a documentation comment (e.g. line breaks). +void parseDocumentation(llvm::StringRef Input, markup::Document &Output); + llvm::raw_ostream &operator<<(llvm::raw_ostream &, const HoverInfo::Param &); inline bool operator==(const HoverInfo::Param &LHS, const HoverInfo::Param &RHS) { diff --git a/clang-tools-extra/clangd/Preamble.cpp b/clang-tools-extra/clangd/Preamble.cpp index bd92b8c1bdb43..fdee71fd22449 100644 --- a/clang-tools-extra/clangd/Preamble.cpp +++ b/clang-tools-extra/clangd/Preamble.cpp @@ -162,7 +162,7 @@ buildPreamble(PathRef FileName, CompilerInvocation &CI, SerializedDeclsCollector.takeMacros(), std::move(StatCache), SerializedDeclsCollector.takeCanonicalIncludes()); } else { - elog("Could not build a preamble for file {0} version {2}", FileName, + elog("Could not build a preamble for file {0} version {1}", FileName, Inputs.Version); return nullptr; } diff --git a/clang-tools-extra/clangd/Protocol.cpp b/clang-tools-extra/clangd/Protocol.cpp index 56ddbfb446f77..462aebc4b0465 100644 --- a/clang-tools-extra/clangd/Protocol.cpp +++ b/clang-tools-extra/clangd/Protocol.cpp @@ -295,7 +295,7 @@ bool fromJSON(const llvm::json::Value &Params, ClientCapabilities &R) { TextDocument->getObject("semanticHighlightingCapabilities")) { if (auto SemanticHighlightingSupport = SemanticHighlighting->getBoolean("semanticHighlighting")) - R.SemanticHighlighting = *SemanticHighlightingSupport; + R.TheiaSemanticHighlighting = *SemanticHighlightingSupport; } if (auto *Diagnostics = TextDocument->getObject("publishDiagnostics")) { if (auto CategorySupport = Diagnostics->getBoolean("categorySupport")) @@ -1131,18 +1131,19 @@ llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, OffsetEncoding Enc) { return OS << toString(Enc); } -bool operator==(const SemanticHighlightingInformation &Lhs, - const SemanticHighlightingInformation &Rhs) { +bool operator==(const TheiaSemanticHighlightingInformation &Lhs, + const TheiaSemanticHighlightingInformation &Rhs) { return Lhs.Line == Rhs.Line && Lhs.Tokens == Rhs.Tokens; } -llvm::json::Value toJSON(const SemanticHighlightingInformation &Highlighting) { +llvm::json::Value +toJSON(const TheiaSemanticHighlightingInformation &Highlighting) { return llvm::json::Object{{"line", Highlighting.Line}, {"tokens", Highlighting.Tokens}, {"isInactive", Highlighting.IsInactive}}; } -llvm::json::Value toJSON(const SemanticHighlightingParams &Highlighting) { +llvm::json::Value toJSON(const TheiaSemanticHighlightingParams &Highlighting) { return llvm::json::Object{ {"textDocument", Highlighting.TextDocument}, {"lines", std::move(Highlighting.Lines)}, diff --git a/clang-tools-extra/clangd/Protocol.h b/clang-tools-extra/clangd/Protocol.h index 6ab6bcc920b22..d08e546e3ffb2 100644 --- a/clang-tools-extra/clangd/Protocol.h +++ b/clang-tools-extra/clangd/Protocol.h @@ -433,9 +433,11 @@ struct ClientCapabilities { /// textDocument.codeAction.codeActionLiteralSupport. bool CodeActionStructure = false; - /// Client supports semantic highlighting. + /// Client supports Theia semantic highlighting extension. 
+ /// https://github.com/microsoft/vscode-languageserver-node/pull/367 /// textDocument.semanticHighlightingCapabilities.semanticHighlighting - bool SemanticHighlighting = false; + /// FIXME: drop this support once clients support LSP 3.16 Semantic Tokens. + bool TheiaSemanticHighlighting = false; /// Supported encodings for LSP character offsets. (clangd extension). llvm::Optional> offsetEncoding; @@ -1342,7 +1344,7 @@ llvm::json::Value toJSON(const FileStatus &FStatus); /// Represents a semantic highlighting information that has to be applied on a /// specific line of the text document. -struct SemanticHighlightingInformation { +struct TheiaSemanticHighlightingInformation { /// The line these highlightings belong to. int Line = 0; /// The base64 encoded string of highlighting tokens. @@ -1353,18 +1355,19 @@ struct SemanticHighlightingInformation { /// clients should combine line style and token style if possible. bool IsInactive = false; }; -bool operator==(const SemanticHighlightingInformation &Lhs, - const SemanticHighlightingInformation &Rhs); -llvm::json::Value toJSON(const SemanticHighlightingInformation &Highlighting); +bool operator==(const TheiaSemanticHighlightingInformation &Lhs, + const TheiaSemanticHighlightingInformation &Rhs); +llvm::json::Value +toJSON(const TheiaSemanticHighlightingInformation &Highlighting); /// Parameters for the semantic highlighting (server-side) push notification. -struct SemanticHighlightingParams { +struct TheiaSemanticHighlightingParams { /// The textdocument these highlightings belong to. VersionedTextDocumentIdentifier TextDocument; /// The lines of highlightings that should be sent. - std::vector Lines; + std::vector Lines; }; -llvm::json::Value toJSON(const SemanticHighlightingParams &Highlighting); +llvm::json::Value toJSON(const TheiaSemanticHighlightingParams &Highlighting); struct SelectionRangeParams { /// The text document. diff --git a/clang-tools-extra/clangd/SemanticHighlighting.cpp b/clang-tools-extra/clangd/SemanticHighlighting.cpp index d5c51ebff5e1e..b69f9e8f2710e 100644 --- a/clang-tools-extra/clangd/SemanticHighlighting.cpp +++ b/clang-tools-extra/clangd/SemanticHighlighting.cpp @@ -445,14 +445,15 @@ bool operator==(const LineHighlightings &L, const LineHighlightings &R) { return std::tie(L.Line, L.Tokens) == std::tie(R.Line, R.Tokens); } -std::vector -toSemanticHighlightingInformation(llvm::ArrayRef Tokens) { +std::vector +toTheiaSemanticHighlightingInformation( + llvm::ArrayRef Tokens) { if (Tokens.size() == 0) return {}; // FIXME: Tokens might be multiple lines long (block comments) in this case // this needs to add multiple lines for those tokens. - std::vector Lines; + std::vector Lines; Lines.reserve(Tokens.size()); for (const auto &Line : Tokens) { llvm::SmallVector LineByteTokens; diff --git a/clang-tools-extra/clangd/SemanticHighlighting.h b/clang-tools-extra/clangd/SemanticHighlighting.h index cf4a51a341e71..31a97b81d6c24 100644 --- a/clang-tools-extra/clangd/SemanticHighlighting.h +++ b/clang-tools-extra/clangd/SemanticHighlighting.h @@ -80,8 +80,9 @@ std::vector getSemanticHighlightings(ParsedAST &AST); llvm::StringRef toTextMateScope(HighlightingKind Kind); /// Convert to LSP's semantic highlighting information. -std::vector -toSemanticHighlightingInformation(llvm::ArrayRef Tokens); +std::vector +toTheiaSemanticHighlightingInformation( + llvm::ArrayRef Tokens); /// Return a line-by-line diff between two highlightings. 
/// - if the tokens on a line are the same in both highlightings, this line is diff --git a/clang-tools-extra/clangd/XRefs.cpp b/clang-tools-extra/clangd/XRefs.cpp index acf9f6df8281a..7d55a372905c0 100644 --- a/clang-tools-extra/clangd/XRefs.cpp +++ b/clang-tools-extra/clangd/XRefs.cpp @@ -374,6 +374,18 @@ locateSymbolNamedTextuallyAt(ParsedAST &AST, const SymbolIndex *Index, unsigned WordOffset = Word.data() - Code.data(); SourceLocation WordStart = SM.getComposedLoc(File, WordOffset); + // Attempt to determine the kind of token that contains the word, + // and bail if it's a string literal. Note that we cannot always + // determine the token kind (e.g. comments, for which we do want + // to activate, are not retained by TokenBuffer). + for (syntax::Token T : + syntax::spelledTokensTouching(WordStart, AST.getTokens())) { + if (T.range(AST.getSourceManager()).touches(WordOffset + Word.size())) { + if (isStringLiteral(T.kind())) + return {}; + } + } + // Do not consider tokens that survived preprocessing. // We are erring on the safe side here, as a user may expect to get // accurate (as opposed to textual-heuristic) results for such tokens. diff --git a/clang-tools-extra/clangd/unittests/ClangdTests.cpp b/clang-tools-extra/clangd/unittests/ClangdTests.cpp index 7ac75b8b4e29f..1e5fcf3d97e1a 100644 --- a/clang-tools-extra/clangd/unittests/ClangdTests.cpp +++ b/clang-tools-extra/clangd/unittests/ClangdTests.cpp @@ -1083,6 +1083,9 @@ TEST_F(ClangdVFSTest, FallbackWhenWaitingForCompileCommand) { Field(&CodeCompletion::Scope, "ns::")))); } +// Tests fails when built with asan due to stack overflow. So skip running the +// test as a workaround. +#if !defined(__has_feature) || !__has_feature(address_sanitizer) TEST_F(ClangdVFSTest, TestStackOverflow) { MockFSProvider FS; ErrorCheckingCallbacks DiagConsumer; @@ -1103,6 +1106,7 @@ TEST_F(ClangdVFSTest, TestStackOverflow) { // overflow EXPECT_TRUE(DiagConsumer.hadErrorInLastDiags()); } +#endif } // namespace } // namespace clangd diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp index 0d6e37f001184..f5c90a4677cb8 100644 --- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp +++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp @@ -1195,7 +1195,9 @@ TEST(SignatureHelpTest, OpeningParen) { int foo(int a, int b, int c); int main() { #define ID(X) X - ID(foo $p^( foo(10), ^ )) + // FIXME: figure out why ID(foo (foo(10), )) doesn't work when preserving + // the recovery expression. + ID(foo $p^( 10, ^ )) })cpp"}; for (auto Test : Tests) { diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index c243346a73f6b..593fb16e21948 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -1883,6 +1883,71 @@ def)", } } +TEST(Hover, DocCommentLineBreakConversion) { + struct Case { + llvm::StringRef Documentation; + llvm::StringRef ExpectedRenderMarkdown; + llvm::StringRef ExpectedRenderPlainText; + } Cases[] = {{ + " \n foo\nbar", + "foo bar", + "foo bar", + }, + { + "foo\nbar \n ", + "foo bar", + "foo bar", + }, + { + "foo \nbar", + "foo bar", + "foo bar", + }, + { + "foo \nbar", + "foo bar", + "foo bar", + }, + { + "foo\n\n\nbar", + "foo \nbar", + "foo\nbar", + }, + { + "foo\n\n\n\tbar", + "foo \nbar", + "foo\nbar", + }, + { + "foo\n\n\n bar", + "foo \nbar", + "foo\nbar", + }, + { + "foo.\nbar", + "foo. 
\nbar", + "foo.\nbar", + }, + { + "foo\n*bar", + "foo \n\\*bar", + "foo\n*bar", + }, + { + "foo\nbar", + "foo bar", + "foo bar", + }}; + + for (const auto &C : Cases) { + markup::Document Output; + parseDocumentation(C.Documentation, Output); + + EXPECT_EQ(Output.asMarkdown(), C.ExpectedRenderMarkdown); + EXPECT_EQ(Output.asPlainText(), C.ExpectedRenderPlainText); + } +} + // This is a separate test as headings don't create any differences in plaintext // mode. TEST(Hover, PresentHeadings) { diff --git a/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp b/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp index 8672a043f1862..6a7f700e1f491 100644 --- a/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp +++ b/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp @@ -720,7 +720,7 @@ TEST(SemanticHighlighting, GeneratesHighlightsWhenFileChange) { ASSERT_EQ(Counter.Count, 1); } -TEST(SemanticHighlighting, toSemanticHighlightingInformation) { +TEST(SemanticHighlighting, toTheiaSemanticHighlightingInformation) { auto CreatePosition = [](int Line, int Character) -> Position { Position Pos; Pos.line = Line; @@ -739,9 +739,9 @@ TEST(SemanticHighlighting, toSemanticHighlightingInformation) { {{HighlightingKind::Variable, Range{CreatePosition(1, 1), CreatePosition(1, 5)}}}, /* IsInactive = */ true}}; - std::vector ActualResults = - toSemanticHighlightingInformation(Tokens); - std::vector ExpectedResults = { + std::vector ActualResults = + toTheiaSemanticHighlightingInformation(Tokens); + std::vector ExpectedResults = { {3, "AAAACAAEAAAAAAAEAAMAAw=="}, {1, "AAAAAQAEAAA="}}; EXPECT_EQ(ActualResults, ExpectedResults); } diff --git a/clang-tools-extra/clangd/unittests/XRefsTests.cpp b/clang-tools-extra/clangd/unittests/XRefsTests.cpp index 32a89df424e41..fc36dfa42d7f2 100644 --- a/clang-tools-extra/clangd/unittests/XRefsTests.cpp +++ b/clang-tools-extra/clangd/unittests/XRefsTests.cpp @@ -644,7 +644,8 @@ TEST(LocateSymbol, Textual) { // Comment mentioning M^yClass )cpp", R"cpp(// String - struct [[MyClass]] {}; + struct MyClass {}; + // Not triggered for string literal tokens. const char* s = "String literal mentioning M^yClass"; )cpp", R"cpp(// Ifdef'ed out code @@ -696,7 +697,7 @@ TEST(LocateSymbol, Textual) { EXPECT_EQ(Results[0].PreferredDeclaration.range, *WantDecl) << Test; } } -} +} // namespace TEST(LocateSymbol, Ambiguous) { auto T = Annotations(R"cpp( diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 3b9212f6723c6..745a1d1035ed8 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -87,6 +87,14 @@ New checks result of a memory allocation function (``malloc()``, ``calloc()``, ``realloc()``, ``alloca()``) instead of its argument. +- New :doc:`bugprone-spuriously-wake-up-functions + ` check. + + Finds ``cnd_wait``, ``cnd_timedwait``, ``wait``, ``wait_for``, or + ``wait_until`` function calls when the function is not invoked from a loop + that checks whether a condition predicate holds or the function has a + condition parameter. + - New :doc:`bugprone-reserved-identifier ` check. @@ -124,6 +132,16 @@ New checks New check aliases ^^^^^^^^^^^^^^^^^ +- New alias :doc:`cert-con36-c + ` to + :doc:`bugprone-spuriously-wake-up-functions + ` was added. + +- New alias :doc:`cert-con54-cpp + ` to + :doc:`bugprone-spuriously-wake-up-functions + ` was added. 
+ - New alias :doc:`cert-dcl37-c ` to :doc:`bugprone-reserved-identifier diff --git a/clang-tools-extra/docs/clang-rename.rst b/clang-tools-extra/docs/clang-rename.rst index ef6ed9cc08143..b45ba01c06a2e 100644 --- a/clang-tools-extra/docs/clang-rename.rst +++ b/clang-tools-extra/docs/clang-rename.rst @@ -16,7 +16,7 @@ functions, variables, arguments, namespaces etc. The tool is in a very early development stage, so you might encounter bugs and crashes. Submitting reports with information about how to reproduce the issue -to `the LLVM bugtracker `_ will definitely help the +to `the LLVM bugtracker `_ will definitely help the project. If you have any ideas or suggestions, you might want to put a feature request there. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone-spuriously-wake-up-functions.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone-spuriously-wake-up-functions.rst new file mode 100644 index 0000000000000..17b81e13f579c --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone-spuriously-wake-up-functions.rst @@ -0,0 +1,29 @@ +.. title:: clang-tidy - bugprone-spuriously-wake-up-functions + +bugprone-spuriously-wake-up-functions +===================================== + +Finds ``cnd_wait``, ``cnd_timedwait``, ``wait``, ``wait_for``, or +``wait_until`` function calls when the function is not invoked from a loop +that checks whether a condition predicate holds or the function has a +condition parameter. + +.. code-block: c++ + + if (condition_predicate) { + condition.wait(lk); + } + +.. code-block: c + + if (condition_predicate) { + if (thrd_success != cnd_wait(&condition, &lock)) { + } + } + +This check corresponds to the CERT C++ Coding Standard rule +`CON54-CPP. Wrap functions that can spuriously wake up in a loop +`_. +and CERT C Coding Standard rule +`CON36-C. Wrap functions that can spuriously wake up in a loop +`_. diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert-con36-c.rst b/clang-tools-extra/docs/clang-tidy/checks/cert-con36-c.rst new file mode 100644 index 0000000000000..7d74e05cf64d3 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/cert-con36-c.rst @@ -0,0 +1,10 @@ +.. title:: clang-tidy - cert-con36-c +.. meta:: + :http-equiv=refresh: 5;URL=bugprone-spuriously-wake-up-functions.html + +cert-con36-c +============ + +The cert-con36-c check is an alias, please see +`bugprone-spuriously-wake-up-functions `_ +for more information. diff --git a/clang-tools-extra/docs/clang-tidy/checks/cert-con54-cpp.rst b/clang-tools-extra/docs/clang-tidy/checks/cert-con54-cpp.rst new file mode 100644 index 0000000000000..f74bc44962199 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/cert-con54-cpp.rst @@ -0,0 +1,10 @@ +.. title:: clang-tidy - cert-con54-cpp +.. meta:: + :http-equiv=refresh: 5;URL=bugprone-spuriously-wake-up-functions.html + +cert-con54-cpp +============== + +The cert-con54-cpp check is an alias, please see +`bugprone-spuriously-wake-up-functions `_ +for more information. 
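For reference, the compliant counterpart to the non-compliant snippets in the new bugprone-spuriously-wake-up-functions documentation above is the usual predicate loop around the wait call, so that a spurious wake-up simply re-enters the wait. A minimal sketch (illustrative only, not part of this patch):

    #include <condition_variable>
    #include <mutex>

    std::mutex M;
    std::condition_variable CV;
    bool Ready = false;

    void consume() {
      std::unique_lock<std::mutex> Lock(M);
      while (!Ready)   // loop, not `if`: a spurious wake-up re-checks the predicate
        CV.wait(Lock);
      // ... consume the shared state ...
    }

The predicate overload, CV.wait(Lock, [] { return Ready; }), performs the same loop internally; it is not flagged by the matchers in this patch, which only match the predicate-less overloads (parameterCountIs(1) for wait, parameterCountIs(2) for wait_for/wait_until).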
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index 5e943c5003f0e..333a7ea4d5b47 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -76,6 +76,7 @@ Clang-Tidy Checks `bugprone-signed-char-misuse `_, `bugprone-sizeof-container `_, `bugprone-sizeof-expression `_, + `bugprone-spuriously-wake-up-functions `_, `bugprone-string-constructor `_, "Yes" `bugprone-string-integer-assignment `_, "Yes" `bugprone-string-literal-with-embedded-nul `_, @@ -187,7 +188,7 @@ Clang-Tidy Checks `llvm-prefer-isa-or-dyn-cast-in-conditionals `_, "Yes" `llvm-prefer-register-over-unsigned `_, "Yes" `llvm-twine-local `_, "Yes" - `llvmlibc-restrict-system-libc-headers `_, + `llvmlibc-restrict-system-libc-headers `_, "Yes" `misc-definitions-in-headers `_, "Yes" `misc-misplaced-const `_, `misc-new-delete-overloads `_, @@ -300,6 +301,8 @@ Clang-Tidy Checks .. csv-table:: Aliases.. :header: "Name", "Redirect", "Offers fixes" + `cert-con36-c `_, `bugprone-spuriously-wake-up-functions `_, + `cert-con54-cpp `_, `bugprone-spuriously-wake-up-functions `_, `cert-dcl03-c `_, `misc-static-assert `_, "Yes" `cert-dcl16-c `_, `readability-uppercase-literal-suffix `_, "Yes" `cert-dcl37-c `_, `bugprone-reserved-identifier `_, "Yes" diff --git a/clang-tools-extra/docs/clang-tidy/checks/llvmlibc-restrict-system-libc-headers.rst b/clang-tools-extra/docs/clang-tidy/checks/llvmlibc-restrict-system-libc-headers.rst index 0ec092584895d..bf39dd62ba95b 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/llvmlibc-restrict-system-libc-headers.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/llvmlibc-restrict-system-libc-headers.rst @@ -18,3 +18,18 @@ lead to subtle and hard to detect bugs. For example consider a system libc whose ``dirent`` struct has slightly different field ordering than llvm-libc. While this will compile successfully, this can cause issues during runtime because they are ABI incompatible. + +Options +------- + +.. option:: Includes + + A string containing a comma separated glob list of allowed include + filenames. Similar to the -checks glob list for running clang-tidy itself, + the two wildcard characters are `*` and `-`, to include and exclude globs, + respectively. The default is `-*`, which disallows all includes. + + This can be used to allow known safe includes such as Linux development + headers. See :doc:`portability-restrict-system-includes + ` for more + details. diff --git a/clang-tools-extra/docs/clang-tidy/index.rst b/clang-tools-extra/docs/clang-tidy/index.rst index 2ca7f99a0bff4..1dfaa2e06b8d2 100644 --- a/clang-tools-extra/docs/clang-tidy/index.rst +++ b/clang-tools-extra/docs/clang-tidy/index.rst @@ -62,11 +62,13 @@ Name prefix Description ``boost-`` Checks related to Boost library. ``bugprone-`` Checks that target bugprone code constructs. ``cert-`` Checks related to CERT Secure Coding Guidelines. -``cppcoreguidelines-`` Checks related to C++ Core Guidelines. ``clang-analyzer-`` Clang Static Analyzer checks. +``cppcoreguidelines-`` Checks related to C++ Core Guidelines. +``darwin-`` Checks related to Darwin coding conventions. ``fuchsia-`` Checks related to Fuchsia coding conventions. ``google-`` Checks related to Google coding conventions. ``hicpp-`` Checks related to High Integrity C++ Coding Standard. +``linuxkernel-`` Checks related to the Linux Kernel coding conventions. ``llvm-`` Checks related to the LLVM coding conventions. 
``llvmlibc-`` Checks related to the LLVM-libc coding standards. ``misc-`` Checks that we didn't have a better category for. diff --git a/clang-tools-extra/test/clang-apply-replacements/Inputs/identical-in-TU/file1.yaml b/clang-tools-extra/test/clang-apply-replacements/Inputs/identical-in-TU/file1.yaml new file mode 100644 index 0000000000000..65a1b47a175c0 --- /dev/null +++ b/clang-tools-extra/test/clang-apply-replacements/Inputs/identical-in-TU/file1.yaml @@ -0,0 +1,19 @@ +--- +MainSourceFile: identical_in_TU.cpp +Diagnostics: + - DiagnosticName: test-identical-insertion + DiagnosticMessage: + Message: Fix + FilePath: $(path)/identical_in_TU.cpp + FileOffset: 12 + Replacements: + - FilePath: $(path)/identical_in_TU.cpp + Offset: 12 + Length: 0 + ReplacementText: '0' + - FilePath: $(path)/identical_in_TU.cpp + Offset: 12 + Length: 0 + ReplacementText: '0' +... + diff --git a/clang-tools-extra/test/clang-apply-replacements/Inputs/identical-in-TU/file2.yaml b/clang-tools-extra/test/clang-apply-replacements/Inputs/identical-in-TU/file2.yaml new file mode 100644 index 0000000000000..5297e974dec47 --- /dev/null +++ b/clang-tools-extra/test/clang-apply-replacements/Inputs/identical-in-TU/file2.yaml @@ -0,0 +1,19 @@ +--- +MainSourceFile: identical-in-TU.cpp +Diagnostics: + - DiagnosticName: test-identical-insertion + DiagnosticMessage: + Message: Fix + FilePath: $(path)/identical-in-TU.cpp + FileOffset: 12 + Replacements: + - FilePath: $(path)/identical-in-TU.cpp + Offset: 12 + Length: 0 + ReplacementText: '0' + - FilePath: $(path)/identical-in-TU.cpp + Offset: 12 + Length: 0 + ReplacementText: '0' +... + diff --git a/clang-tools-extra/test/clang-apply-replacements/Inputs/identical-in-TU/identical-in-TU.cpp b/clang-tools-extra/test/clang-apply-replacements/Inputs/identical-in-TU/identical-in-TU.cpp new file mode 100644 index 0000000000000..bdaab4fc823a8 --- /dev/null +++ b/clang-tools-extra/test/clang-apply-replacements/Inputs/identical-in-TU/identical-in-TU.cpp @@ -0,0 +1,2 @@ +class MyType {}; +// CHECK: class MyType00 {}; diff --git a/clang-tools-extra/test/clang-apply-replacements/identical-in-TU.cpp b/clang-tools-extra/test/clang-apply-replacements/identical-in-TU.cpp new file mode 100644 index 0000000000000..024db114b5890 --- /dev/null +++ b/clang-tools-extra/test/clang-apply-replacements/identical-in-TU.cpp @@ -0,0 +1,11 @@ +// RUN: mkdir -p %T/Inputs/identical-in-TU + +// RUN: grep -Ev "// *[A-Z-]+:" %S/Inputs/identical-in-TU/identical-in-TU.cpp > %T/Inputs/identical-in-TU/identical-in-TU.cpp +// RUN: sed "s#\$(path)#%/T/Inputs/identical-in-TU#" %S/Inputs/identical-in-TU/file1.yaml > %T/Inputs/identical-in-TU/file1.yaml +// RUN: sed "s#\$(path)#%/T/Inputs/identical-in-TU#" %S/Inputs/identical-in-TU/file2.yaml > %T/Inputs/identical-in-TU/file2.yaml +// RUN: clang-apply-replacements %T/Inputs/identical-in-TU +// RUN: FileCheck -input-file=%T/Inputs/identical-in-TU/identical-in-TU.cpp %S/Inputs/identical-in-TU/identical-in-TU.cpp + +// Similar to identical test but each yaml file contains the same fix twice. +// This check ensures that only the duplicated replacements in a single yaml +// file are applied twice. Addresses PR45150. 
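The identical-in-TU test above exercises the deduplication rule added to groupReplacements at the top of this patch: an identical fix reported by two different TUs is collapsed to a single replacement, while the same fix listed twice inside one TU's diagnostics is kept and applied twice (hence the expected MyType00). A standalone sketch of that rule, using hypothetical simplified types rather than the real tooling API:

    #include <map>
    #include <string>
    #include <tuple>
    #include <utility>
    #include <vector>

    struct Replacement {
      std::string File;
      unsigned Offset = 0;
      std::string Text;
      bool operator<(const Replacement &R) const {
        return std::tie(File, Offset, Text) < std::tie(R.File, R.Offset, R.Text);
      }
    };

    // SourceTU is null for plain replacements and identifies the reporting TU
    // for replacements that came from diagnostics.
    std::vector<Replacement>
    groupReplacements(const std::vector<std::pair<Replacement, const void *>> &All) {
      std::map<Replacement, const void *> Seen; // first TU that suggested each fix
      std::vector<Replacement> Grouped;
      for (const auto &Entry : All) {
        const Replacement &R = Entry.first;
        const void *SourceTU = Entry.second;
        if (SourceTU) {
          auto It = Seen.find(R);
          if (It == Seen.end())
            Seen.emplace(R, SourceTU);
          else if (It->second != SourceTU)
            continue; // duplicate suggested by another TU: drop it
        }
        Grouped.push_back(R); // same-TU duplicates fall through and are kept
      }
      return Grouped;
    }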
diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/llvmlibc/system/math.h b/clang-tools-extra/test/clang-tidy/checkers/Inputs/llvmlibc/system/math.h deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/clang-tools-extra/test/clang-tidy/checkers/Inputs/llvmlibc/transitive.h b/clang-tools-extra/test/clang-tidy/checkers/Inputs/llvmlibc/transitive.h deleted file mode 100644 index a84546e5bc836..0000000000000 --- a/clang-tools-extra/test/clang-tidy/checkers/Inputs/llvmlibc/transitive.h +++ /dev/null @@ -1 +0,0 @@ -#include diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone-reserved-identifier-invert.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone-reserved-identifier-invert.cpp index f8a2662bebf17..501f6dd13dc53 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone-reserved-identifier-invert.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone-reserved-identifier-invert.cpp @@ -57,7 +57,7 @@ template inline reference_wrapper cref(const Up &u) noexcept { // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: declaration uses identifier 'u', which is not a reserved identifier [bugprone-reserved-identifier] - // CHECK-FIXES: {{^}}cref(const Up &__u) noexcept {{{$}} + // CHECK-FIXES: {{^}}cref(const _Up &__u) noexcept {{{$}} return reference_wrapper(u); } diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone-spuriously-wake-up-functions.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone-spuriously-wake-up-functions.c new file mode 100644 index 0000000000000..fd3b94081c208 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone-spuriously-wake-up-functions.c @@ -0,0 +1,164 @@ +// RUN: %check_clang_tidy %s bugprone-spuriously-wake-up-functions %t -- -- +#define NULL 0 + +struct Node1 { + void *Node1; + struct Node1 *next; +}; + +typedef struct mtx_t { +} mtx_t; +typedef struct cnd_t { +} cnd_t; +struct timespec {}; + +int cnd_wait(cnd_t *cond, mtx_t *mutex){}; +int cnd_timedwait(cnd_t *cond, mtx_t *mutex, + const struct timespec *time_point){}; + +struct Node1 list_c; +static mtx_t lock; +static cnd_t condition_c; +struct timespec ts; + +void consume_list_element(void) { + + if (list_c.next == NULL) { + if (0 != cnd_wait(&condition_c, &lock)) { + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: 'cnd_wait' should be placed inside a while statement [bugprone-spuriously-wake-up-functions] + } + } + if (list_c.next == NULL) + if (0 != cnd_wait(&condition_c, &lock)) + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: 'cnd_wait' should be placed inside a while statement [bugprone-spuriously-wake-up-functions] + ; + if (list_c.next == NULL && 0 != cnd_wait(&condition_c, &lock)) + // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: 'cnd_wait' should be placed inside a while statement [bugprone-spuriously-wake-up-functions] + ; + while (list_c.next == NULL) { + if (0 != cnd_wait(&condition_c, &lock)) { + } + } + while (list_c.next == NULL) + if (0 != cnd_wait(&condition_c, &lock)) { + } + while (list_c.next == NULL) + if (0 != cnd_wait(&condition_c, &lock)) + ; + if (list_c.next == NULL) { + cnd_wait(&condition_c, &lock); + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'cnd_wait' should be placed inside a while statement [bugprone-spuriously-wake-up-functions] + } + if (list_c.next == NULL) + cnd_wait(&condition_c, &lock); + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'cnd_wait' should be placed inside a while statement [bugprone-spuriously-wake-up-functions] + while (list_c.next == NULL) { + cnd_wait(&condition_c, &lock); + } + 
while (list_c.next == NULL) + cnd_wait(&condition_c, &lock); + + do { + if (0 != cnd_wait(&condition_c, &lock)) { + } + } while (list_c.next == NULL); + do + if (0 != cnd_wait(&condition_c, &lock)) { + } + while (list_c.next == NULL); + do + if (0 != cnd_wait(&condition_c, &lock)) + ; + while (list_c.next == NULL); + do { + cnd_wait(&condition_c, &lock); + } while (list_c.next == NULL); + do + cnd_wait(&condition_c, &lock); + while (list_c.next == NULL); + for (;; list_c.next == NULL) { + if (0 != cnd_wait(&condition_c, &lock)) { + } + } + for (;; list_c.next == NULL) + if (0 != cnd_wait(&condition_c, &lock)) { + } + for (;; list_c.next == NULL) + if (0 != cnd_wait(&condition_c, &lock)) + ; + for (;; list_c.next == NULL) { + cnd_wait(&condition_c, &lock); + } + for (;; list_c.next == NULL) + cnd_wait(&condition_c, &lock); + + if (list_c.next == NULL) { + if (0 != cnd_timedwait(&condition_c, &lock, &ts)) { + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: 'cnd_timedwait' should be placed inside a while statement [bugprone-spuriously-wake-up-functions] + } + } + if (list_c.next == NULL) + if (0 != cnd_timedwait(&condition_c, &lock, &ts)) + // CHECK-MESSAGES: :[[@LINE-1]]:14: warning: 'cnd_timedwait' should be placed inside a while statement [bugprone-spuriously-wake-up-functions] + ; + if (list_c.next == NULL && 0 != cnd_timedwait(&condition_c, &lock, &ts)) + // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: 'cnd_timedwait' should be placed inside a while statement [bugprone-spuriously-wake-up-functions] + ; + while (list_c.next == NULL) { + if (0 != cnd_timedwait(&condition_c, &lock, &ts)) { + } + } + while (list_c.next == NULL) + if (0 != cnd_timedwait(&condition_c, &lock, &ts)) { + } + while (list_c.next == NULL) + if (0 != cnd_timedwait(&condition_c, &lock, &ts)) + ; + if (list_c.next == NULL) { + cnd_timedwait(&condition_c, &lock, &ts); + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'cnd_timedwait' should be placed inside a while statement [bugprone-spuriously-wake-up-functions] + } + if (list_c.next == NULL) + cnd_timedwait(&condition_c, &lock, &ts); + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'cnd_timedwait' should be placed inside a while statement [bugprone-spuriously-wake-up-functions] + while (list_c.next == NULL) { + cnd_timedwait(&condition_c, &lock, &ts); + } + while (list_c.next == NULL) + cnd_timedwait(&condition_c, &lock, &ts); + + do { + if (0 != cnd_timedwait(&condition_c, &lock, &ts)) { + } + } while (list_c.next == NULL); + do + if (0 != cnd_timedwait(&condition_c, &lock, &ts)) { + } + while (list_c.next == NULL); + do + if (0 != cnd_timedwait(&condition_c, &lock, &ts)) + ; + while (list_c.next == NULL); + do { + cnd_timedwait(&condition_c, &lock, &ts); + } while (list_c.next == NULL); + do + cnd_timedwait(&condition_c, &lock, &ts); + while (list_c.next == NULL); + for (;; list_c.next == NULL) { + if (0 != cnd_timedwait(&condition_c, &lock, &ts)) { + } + } + for (;; list_c.next == NULL) + if (0 != cnd_timedwait(&condition_c, &lock, &ts)) { + } + for (;; list_c.next == NULL) + if (0 != cnd_timedwait(&condition_c, &lock, &ts)) + ; + for (;; list_c.next == NULL) { + cnd_timedwait(&condition_c, &lock, &ts); + } + for (;; list_c.next == NULL) + cnd_timedwait(&condition_c, &lock, &ts); +} +int main() { return 0; } diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone-spuriously-wake-up-functions.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone-spuriously-wake-up-functions.cpp new file mode 100644 index 0000000000000..6db92ef939fa3 --- /dev/null +++ 
b/clang-tools-extra/test/clang-tidy/checkers/bugprone-spuriously-wake-up-functions.cpp @@ -0,0 +1,191 @@ +// RUN: %check_clang_tidy %s bugprone-spuriously-wake-up-functions %t -- -- +#define NULL 0 + +namespace std { +using intmax_t = int; + +template +class ratio { +public: + static constexpr intmax_t num = 0; + static constexpr intmax_t den = 0; + typedef ratio type; +}; +typedef ratio<1, 1000> milli; +namespace chrono { + +template > +class duration { +public: + using rep = Rep; + using period = Period; + +public: + constexpr duration() = default; + template + constexpr explicit duration(const Rep2 &r); + template + constexpr duration(const duration &d); + ~duration() = default; + duration(const duration &) = default; +}; + +template +class time_point { +public: + using clock = Clock; + using duration = Duration; + +public: + constexpr time_point(); + constexpr explicit time_point(const duration &d); + template + constexpr time_point(const time_point &t); +}; + +using milliseconds = duration; + +class system_clock { +public: + typedef milliseconds duration; + typedef duration::rep rep; + typedef duration::period period; + typedef chrono::time_point time_point; + + static time_point now() noexcept; +}; +} // namespace chrono + +class mutex; +template +class unique_lock { +public: + typedef Mutex mutex_type; + + unique_lock() noexcept; + explicit unique_lock(mutex_type &m); +}; + +class mutex { +public: + constexpr mutex() noexcept; + ~mutex(); + mutex(const mutex &) = delete; + mutex &operator=(const mutex &) = delete; +}; + +enum class cv_status { + no_timeout, + timeout +}; + +class condition_variable { +public: + condition_variable(); + ~condition_variable(); + condition_variable(const condition_variable &) = delete; + + void wait(unique_lock &lock); + template + void wait(unique_lock &lock, Predicate pred); + template + cv_status wait_until(unique_lock &lock, + const chrono::time_point &abs_time){}; + template + bool wait_until(unique_lock &lock, + const chrono::time_point &abs_time, + Predicate pred){}; + template + cv_status wait_for(unique_lock &lock, + const chrono::duration &rel_time){}; + template + bool wait_for(unique_lock &lock, + const chrono::duration &rel_time, + Predicate pred){}; +}; + +} // namespace std + +struct Node1 { + void *Node1; + struct Node1 *next; +}; + +static Node1 list; +static std::mutex m; +static std::condition_variable condition; + +void consume_list_element(std::condition_variable &condition) { + std::unique_lock lk(m); + + if (list.next == nullptr) { + condition.wait(lk); + // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: 'wait' should be placed inside a while statement or used with a conditional parameter [bugprone-spuriously-wake-up-functions] + } + + while (list.next == nullptr) { + condition.wait(lk); + } + + do { + condition.wait(lk); + } while (list.next == nullptr); + + for (;; list.next == nullptr) { + condition.wait(lk); + } + + if (list.next == nullptr) { + while (list.next == nullptr) { + condition.wait(lk); + } + } + + if (list.next == nullptr) { + do { + condition.wait(lk); + } while (list.next == nullptr); + } + + if (list.next == nullptr) { + for (;; list.next == nullptr) { + condition.wait(lk); + } + } + using durtype = std::chrono::duration; + durtype dur = std::chrono::duration(); + if (list.next == nullptr) { + condition.wait_for(lk, dur); + // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: 'wait_for' should be placed inside a while statement or used with a conditional parameter [bugprone-spuriously-wake-up-functions] + } + if (list.next 
== nullptr) { + condition.wait_for(lk, dur, [] { return 1; }); + } + while (list.next == nullptr) { + condition.wait_for(lk, dur); + } + do { + condition.wait_for(lk, dur); + } while (list.next == nullptr); + for (;; list.next == nullptr) { + condition.wait_for(lk, dur); + } + + auto now = std::chrono::system_clock::now(); + if (list.next == nullptr) { + condition.wait_until(lk, now); + // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: 'wait_until' should be placed inside a while statement or used with a conditional parameter [bugprone-spuriously-wake-up-functions] + } + if (list.next == nullptr) { + condition.wait_until(lk, now, [] { return 1; }); + } + while (list.next == nullptr) { + condition.wait_until(lk, now); + } + do { + condition.wait_until(lk, now); + } while (list.next == nullptr); + for (;; list.next == nullptr) { + condition.wait_until(lk, now); + } +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/llvmlibc-restrict-system-libc-headers-transitive.cpp b/clang-tools-extra/test/clang-tidy/checkers/llvmlibc-restrict-system-libc-headers-transitive.cpp deleted file mode 100644 index 745aa0bb34016..0000000000000 --- a/clang-tools-extra/test/clang-tidy/checkers/llvmlibc-restrict-system-libc-headers-transitive.cpp +++ /dev/null @@ -1,8 +0,0 @@ -// RUN: %check_clang_tidy %s llvmlibc-restrict-system-libc-headers %t \ -// RUN: -- -header-filter=.* \ -// RUN: -- -I %S/Inputs/llvmlibc \ -// RUN: -isystem %S/Inputs/llvmlibc/system \ -// RUN: -resource-dir %S/Inputs/llvmlibc/resource - -#include "transitive.h" -// CHECK-MESSAGES: :1:1: warning: system libc header math.h not allowed, transitively included from {{.*}} diff --git a/clang-tools-extra/test/clang-tidy/checkers/llvmlibc-restrict-system-libc-headers.cpp b/clang-tools-extra/test/clang-tidy/checkers/llvmlibc-restrict-system-libc-headers.cpp index 43f5b1e94279a..52e25faf190fd 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/llvmlibc-restrict-system-libc-headers.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/llvmlibc-restrict-system-libc-headers.cpp @@ -3,11 +3,11 @@ // RUN: -resource-dir %S/Inputs/llvmlibc/resource #include -// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: system libc header stdio.h not allowed +// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: system include stdio.h not allowed #include -// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: system libc header stdlib.h not allowed +// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: system include stdlib.h not allowed #include "string.h" -// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: system libc header string.h not allowed +// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: system include string.h not allowed #include "stdatomic.h" #include // Compiler provided headers should not throw warnings. 
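As an editorial illustration of the new ``Includes`` option documented earlier for ``llvmlibc-restrict-system-libc-headers`` (a sketch assuming the globs are matched against the spelled include name; the option value below is hypothetical and not part of the patch), a translation unit checked with ``Includes`` set to ``-*, linux/*.h`` would behave roughly as follows:

.. code-block:: c++

   // Hypothetical option value: llvmlibc-restrict-system-libc-headers.Includes: '-*, linux/*.h'
   #include <linux/version.h> // allowed: matches the 'linux/*.h' glob
   #include <stdlib.h>        // warning: system include stdlib.h not allowed
                              // [llvmlibc-restrict-system-libc-headers]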
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc-unconventional-assign-operator-precxx11.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc-unconventional-assign-operator-precxx11.cpp new file mode 100644 index 0000000000000..7dc939955f37a --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/misc-unconventional-assign-operator-precxx11.cpp @@ -0,0 +1,6 @@ +// RUN: %check_clang_tidy -std=c++98,c++03 %s misc-unconventional-assign-operator %t + +struct BadArgument { + BadArgument &operator=(BadArgument &); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: operator=() should take 'BadArgument const&' or 'BadArgument' +}; diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp index c9509434813cd..7983bb30ca649 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability-identifier-naming.cpp @@ -527,3 +527,21 @@ void MyPoiterFunction(int * p_normal_pointer, int * const constant_ptr){ // CHECK-FIXES: {{^}} int * const lc_PointerB = nullptr;{{$}} } +using namespace FOO_NS; +// CHECK-FIXES: {{^}}using namespace foo_ns; + +using namespace FOO_NS::InlineNamespace; +// CHECK-FIXES: {{^}}using namespace foo_ns::inline_namespace; + +void QualifiedTypeLocTest(THIS___Structure); +// CHECK-FIXES: {{^}}void QualifiedTypeLocTest(this_structure);{{$}} +void QualifiedTypeLocTest(THIS___Structure &); +// CHECK-FIXES: {{^}}void QualifiedTypeLocTest(this_structure &);{{$}} +void QualifiedTypeLocTest(THIS___Structure &&); +// CHECK-FIXES: {{^}}void QualifiedTypeLocTest(this_structure &&);{{$}} +void QualifiedTypeLocTest(const THIS___Structure); +// CHECK-FIXES: {{^}}void QualifiedTypeLocTest(const this_structure);{{$}} +void QualifiedTypeLocTest(const THIS___Structure &); +// CHECK-FIXES: {{^}}void QualifiedTypeLocTest(const this_structure &);{{$}} +void QualifiedTypeLocTest(volatile THIS___Structure &); +// CHECK-FIXES: {{^}}void QualifiedTypeLocTest(volatile this_structure &);{{$}} diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 75dafbf75300f..9ddb633a2899d 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -591,6 +591,7 @@ if(CLANG_LIBS) add_dependencies(clang-libraries ${lib}) if(NOT LLVM_ENABLE_IDE) add_dependencies(install-clang-libraries install-${lib}) + add_dependencies(install-clang-libraries-stripped install-${lib}-stripped) endif() endforeach() endif() diff --git a/clang/cmake/caches/Fuchsia-stage2.cmake b/clang/cmake/caches/Fuchsia-stage2.cmake index 7733e55ab229c..e2be68be22848 100644 --- a/clang/cmake/caches/Fuchsia-stage2.cmake +++ b/clang/cmake/caches/Fuchsia-stage2.cmake @@ -16,7 +16,7 @@ set(LLVM_ENABLE_ZLIB ON CACHE BOOL "") set(LLVM_INCLUDE_DOCS OFF CACHE BOOL "") set(LLVM_INCLUDE_EXAMPLES OFF CACHE BOOL "") set(LLVM_INCLUDE_GO_TESTS OFF CACHE BOOL "") -set(LLVM_USE_RELATIVE_PATHS_IN_DEBUG_INFO ON CACHE BOOL "") +set(LLVM_USE_RELATIVE_PATHS_IN_FILES ON CACHE BOOL "") set(CLANG_DEFAULT_CXX_STDLIB libc++ CACHE STRING "") if(NOT APPLE) diff --git a/clang/docs/ClangCommandLineReference.rst b/clang/docs/ClangCommandLineReference.rst index 74e831c36eb90..3f0e471e852a9 100644 --- a/clang/docs/ClangCommandLineReference.rst +++ b/clang/docs/ClangCommandLineReference.rst @@ -2403,7 +2403,7 @@ Target-dependent compilation options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. 
option:: -G, -G=, -msmall-data-threshold= -Put objects of at most bytes into small data section (MIPS / Hexagon) +Put objects of at most bytes into small data section (MIPS / Hexagon / RISCV) .. option:: -ffixed-x1 @@ -3201,6 +3201,10 @@ Enable linker relaxation Enable using library calls for save and restore +.. option:: -msmall-data-limit= + +Put global and static data smaller than the limit into a special section + Long double flags ----------------- Selects the long double implementation diff --git a/clang/docs/ClangPlugins.rst b/clang/docs/ClangPlugins.rst index 23e037e197c9e..7e33ea33c0df5 100644 --- a/clang/docs/ClangPlugins.rst +++ b/clang/docs/ClangPlugins.rst @@ -63,6 +63,53 @@ registering it using ``PragmaHandlerRegistry::Add<>``: static PragmaHandlerRegistry::Add Y("example_pragma","example pragma description"); +Defining attributes +=================== + +Plugins can define attributes by declaring a ``ParsedAttrInfo`` and registering +it using ``ParsedAttrInfoRegister::Add<>``: + +.. code-block:: c++ + + class ExampleAttrInfo : public ParsedAttrInfo { + public: + ExampleAttrInfo() { + Spellings.push_back({ParsedAttr::AS_GNU,"example"}); + } + AttrHandling handleDeclAttribute(Sema &S, Decl *D, + const ParsedAttr &Attr) const override { + // Handle the attribute + return AttributeApplied; + } + }; + + static ParsedAttrInfoRegistry::Add Z("example_attr","example attribute description"); + +The members of ``ParsedAttrInfo`` that a plugin attribute must define are: + + * ``Spellings``, which must be populated with every `Spelling + `_ of the + attribute, each of which consists of an attribute syntax and how the + attribute name is spelled for that syntax. If the syntax allows a scope then + the spelling must be "scope::attr" if a scope is present or "::attr" if not. + * ``handleDeclAttribute``, which is the function that applies the attribute to + a declaration. It is responsible for checking that the attribute's arguments + are valid, and typically applies the attribute by adding an ``Attr`` to the + ``Decl``. It returns either ``AttributeApplied``, to indicate that the + attribute was successfully applied, or ``AttributeNotApplied`` if it wasn't. + +The members of ``ParsedAttrInfo`` that may need to be defined, depending on the +attribute, are: + + * ``NumArgs`` and ``OptArgs``, which set the number of required and optional + arguments to the attribute. + * ``diagAppertainsToDecl``, which checks if the attribute has been used on the + right kind of declaration and issues a diagnostic if not. + * ``diagLangOpts``, which checks if the attribute is permitted for the current + language mode and issues a diagnostic if not. + * ``existsInTarget``, which checks if the attribute is permitted for the given + target. + Putting it all together ======================= diff --git a/clang/docs/InternalsManual.rst b/clang/docs/InternalsManual.rst index 4f1c8a584c670..09aec6df69f2f 100644 --- a/clang/docs/InternalsManual.rst +++ b/clang/docs/InternalsManual.rst @@ -2296,7 +2296,7 @@ are created implicitly. The following spellings are accepted: placement. ``CXX11`` Spelled with a C++-style ``[[attr]]`` syntax with an optional vendor-specific namespace. - ``C2x`` Spelled with a C-style ``[[attr]]` syntax with an optional + ``C2x`` Spelled with a C-style ``[[attr]]`` syntax with an optional vendor-specific namespace. ``Declspec`` Spelled with a Microsoft-style ``__declspec(attr)`` syntax. 
``Keyword`` The attribute is spelled as a keyword, and required custom @@ -2455,6 +2455,9 @@ Attributes that do not require custom semantic handling should set the attributes are assumed to use a semantic handler by default. Attributes without a semantic handler are not given a parsed attribute ``Kind`` enumerator. +"Simple" attributes, that require no custom semantic processing aside from what +is automatically provided, should set the ``SimpleHandler`` field to ``1``. + Target-specific attributes may share a spelling with other attributes in different targets. For instance, the ARM and MSP430 targets both have an attribute spelled ``GNU<"interrupt">``, but with different parsing and semantic @@ -2481,12 +2484,11 @@ Boilerplate All semantic processing of declaration attributes happens in `lib/Sema/SemaDeclAttr.cpp `_, and generally starts in the ``ProcessDeclAttribute()`` function. If the -attribute is a "simple" attribute -- meaning that it requires no custom semantic -processing aside from what is automatically provided, add a call to -``handleSimpleAttribute(S, D, Attr);`` to the switch statement. -Otherwise, write a new ``handleYourAttr()`` function, and add that to the switch -statement. Please do not implement handling logic directly in the ``case`` for -the attribute. +attribute has the ``SimpleHandler`` field set to ``1`` then the function to +process the attribute will be automatically generated, and nothing needs to be +done here. Otherwise, write a new ``handleYourAttr()`` function, and add that to +the switch statement. Please do not implement handling logic directly in the +``case`` for the attribute. Unless otherwise specified by the attribute definition, common semantic checking of the parsed attribute is handled automatically. This includes diagnosing diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index e69dd810f21d0..558ce7dee6536 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -1259,7 +1259,7 @@ ASM Goto with Output Constraints ================================ In addition to the functionality provided by `GCC's extended -assembly``_, clang +assembly `_, clang supports output constraints with the `goto` form. The goto form of GCC's extended assembly allows the programmer to branch to a C @@ -2129,44 +2129,31 @@ object that overloads ``operator&``. ``__builtin_operator_new`` and ``__builtin_operator_delete`` ------------------------------------------------------------ -``__builtin_operator_new`` allocates memory just like a non-placement non-class -*new-expression*. This is exactly like directly calling the normal -non-placement ``::operator new``, except that it allows certain optimizations +A call to ``__builtin_operator_new(args)`` is exactly the same as a call to +``::operator new(args)``, except that it allows certain optimizations that the C++ standard does not permit for a direct function call to ``::operator new`` (in particular, removing ``new`` / ``delete`` pairs and -merging allocations). +merging allocations), and that the call is required to resolve to a +`replaceable global allocation function +`_. -Likewise, ``__builtin_operator_delete`` deallocates memory just like a -non-class *delete-expression*, and is exactly like directly calling the normal -``::operator delete``, except that it permits optimizations. Only the unsized -form of ``__builtin_operator_delete`` is currently available. 
+Likewise, ``__builtin_operator_delete`` is exactly the same as a call to +``::operator delete(args)``, except that it permits optimizations +and that the call is required to resolve to a +`replaceable global deallocation function +`_. These builtins are intended for use in the implementation of ``std::allocator`` and other similar allocation libraries, and are only available in C++. -``__unique_stable_name`` ------------------------- - -``__unique_stable_name()`` is a builtin that takes a type or expression and -produces a string literal containing a unique name for the type (or type of the -expression) that is stable across split compilations. - -In cases where the split compilation needs to share a unique token for a type -across the boundary (such as in an offloading situation), this name can be used -for lookup purposes. +Query for this feature with ``__has_builtin(__builtin_operator_new)`` or +``__has_builtin(__builtin_operator_delete)``: -This builtin is superior to RTTI for this purpose for two reasons. First, this -value is computed entirely at compile time, so it can be used in constant -expressions. Second, this value encodes lambda functions based on line-number -rather than the order in which it appears in a function. This is valuable -because it is stable in cases where an unrelated lambda is introduced -conditionally in the same function. + * If the value is at least ``201802L``, the builtins behave as described above. -The current implementation of this builtin uses a slightly modified Itanium -Mangler to produce the unique name. The lambda ordinal is replaced with one or -more line/column pairs in the format ``LINE->COL``, separated with a ``~`` -character. Typically, only one pair will be included, however in the case of -macro expansions, the entire macro expansion stack is expressed. + * If the value is non-zero, the builtins may not support calling arbitrary + replaceable global (de)allocation functions, but do support calling at least + ``::operator new(size_t)`` and ``::operator delete(void*)``. ``__builtin_preserve_access_index`` ----------------------------------- @@ -2200,6 +2187,30 @@ argument. int *pb =__builtin_preserve_access_index(&v->c[3].b); __builtin_preserve_access_index(v->j); +``__builtin_unique_stable_name`` +------------------------ + +``__builtin_unique_stable_name()`` is a builtin that takes a type or expression and +produces a string literal containing a unique name for the type (or type of the +expression) that is stable across split compilations. + +In cases where the split compilation needs to share a unique token for a type +across the boundary (such as in an offloading situation), this name can be used +for lookup purposes. + +This builtin is superior to RTTI for this purpose for two reasons. First, this +value is computed entirely at compile time, so it can be used in constant +expressions. Second, this value encodes lambda functions based on line-number +rather than the order in which it appears in a function. This is valuable +because it is stable in cases where an unrelated lambda is introduced +conditionally in the same function. + +The current implementation of this builtin uses a slightly modified Itanium +Mangler to produce the unique name. The lambda ordinal is replaced with one or +more line/column pairs in the format ``LINE->COL``, separated with a ``~`` +character. Typically, only one pair will be included, however in the case of +macro expansions the entire macro expansion stack is expressed. 
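To make the feature-test guidance above concrete, here is a minimal sketch (editorial, not part of the patch) of how an allocation library might call the ``__builtin_operator_new`` / ``__builtin_operator_delete`` builtins described earlier in this hunk, falling back to the plain replaceable functions when the builtins are unavailable:

.. code-block:: c++

   #include <cstddef>
   #include <new>

   void *allocate_bytes(std::size_t N) {
   #if __has_builtin(__builtin_operator_new)
     // Same effect as ::operator new(N), but new/delete pairs may be removed
     // and allocations merged by the optimizer.
     return __builtin_operator_new(N);
   #else
     return ::operator new(N);
   #endif
   }

   void deallocate_bytes(void *P) {
   #if __has_builtin(__builtin_operator_delete)
     __builtin_operator_delete(P);
   #else
     ::operator delete(P);
   #endif
   }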
+ Multiprecision Arithmetic Builtins ---------------------------------- diff --git a/clang/docs/LibASTImporter.rst b/clang/docs/LibASTImporter.rst index 9c02b6ae76e99..bedaf527f5e9e 100644 --- a/clang/docs/LibASTImporter.rst +++ b/clang/docs/LibASTImporter.rst @@ -119,7 +119,7 @@ Now we create the Importer and do the import: llvm::Expected ImportedOrErr = Importer.Import(From); The ``Import`` call returns with ``llvm::Expected``, so, we must check for any error. -Please refer to the `error handling `_ documentation for details. +Please refer to the `error handling `_ documentation for details. .. code-block:: cpp diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index 66ce6eda9c8c8..209a774405372 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -131,7 +131,7 @@ implementation. +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | loop extension | clause: if for SIMD directives | :good:`done` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| loop extension | inclusive scan extension (matching C++17 PSTL) | :none:`unclaimed` | | +| loop extension | inclusive scan extension (matching C++17 PSTL) | :none:`claimed` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | memory mangagement | memory allocators | :good:`done` | r341687,r357929 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ @@ -179,6 +179,10 @@ implementation. 
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device extension | clause: device_type | :good:`done` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device extension | clause: extended device | :good:`done` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ +| device extension | clause: uses_allocators clause | :none:`claimed` | | ++------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device extension | clause: in_reduction | :part:`worked on` | r308768 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device extension | omp_get_device_num() | :part:`worked on` | D54342 | diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 710f005985da1..ad13fb1b3e95f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -57,6 +57,10 @@ Improvements to Clang's diagnostics Non-comprehensive list of changes in this release ------------------------------------------------- +- For the ARM target, C-language intrinsics are now provided for the full Arm + v8.1-M MVE instruction set. ```` supports the complete API defined + in the Arm C Language Extensions. + New Compiler Flags ------------------ @@ -304,7 +308,7 @@ Additional Information A wide variety of additional information is available on the `Clang web page `_. The web page contains versions of the -API documentation which are up-to-date with the Subversion version of +API documentation which are up-to-date with the Git version of the source code. You can access versions of these documents specific to this release by going into the "``clang/docs/``" directory in the Clang tree. diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index b130cc34f6851..0bfb6456dc820 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -1934,14 +1934,14 @@ alpha.security alpha.security.cert ^^^^^^^^^^^^^^^^^^^ -SEI CERT checkers which tries to find errors based on their `C coding rules`_. +SEI CERT checkers which tries to find errors based on their `C coding rules `_. .. _alpha-security-cert-pos-checkers: alpha.security.cert.pos ^^^^^^^^^^^^^^^^^^^^^^^ -SEI CERT checkers of POSIX `C coding rules`_. +SEI CERT checkers of `POSIX C coding rules `_. .. _alpha-security-cert-pos-34c: @@ -2199,9 +2199,9 @@ lck_rw_try_lock_exclusive, lck_rw_try_lock_shared, pthread_mutex_unlock, pthread alpha.unix.SimpleStream (C) """"""""""""""""""""""""""" Check for misuses of stream APIs. Check for misuses of stream APIs: ``fopen, fclose`` -(demo checker, the subject of the demo (`Slides `_ , +(demo checker, the subject of the demo (`Slides `_ , `Video `_) by Anna Zaks and Jordan Rose presented at the -`2012 LLVM Developers' Meeting `_). +`2012 LLVM Developers' Meeting `_). .. 
code-block:: c diff --git a/clang/docs/analyzer/developer-docs/DebugChecks.rst b/clang/docs/analyzer/developer-docs/DebugChecks.rst index 05b3e2480d3b7..48b584a463072 100644 --- a/clang/docs/analyzer/developer-docs/DebugChecks.rst +++ b/clang/docs/analyzer/developer-docs/DebugChecks.rst @@ -281,7 +281,7 @@ ExprInspection checks This is useful in tests, where we don't want to issue warning for all tainted expressions but only check for certain expressions. This would help to reduce the *noise* that the `TaintTest` debug checker would - introduce and let you focus on the `expected-warning`s that you really care + introduce and let you focus on the `expected-warning`'s that you really care about. Example usage:: diff --git a/clang/include/clang-c/Index.h b/clang/include/clang-c/Index.h index 9d4930a3887a7..ad2bb6a76c438 100644 --- a/clang/include/clang-c/Index.h +++ b/clang/include/clang-c/Index.h @@ -2578,7 +2578,11 @@ enum CXCursorKind { */ CXCursor_OMPDepobjDirective = 286, - CXCursor_LastStmt = CXCursor_OMPDepobjDirective, + /** OpenMP scan directive. + */ + CXCursor_OMPScanDirective = 287, + + CXCursor_LastStmt = CXCursor_OMPScanDirective, /** * Cursor that represents the translation unit itself. diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index d74edb8a8adb8..ca0f991c24e31 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -116,6 +116,7 @@ class ObjCPropertyDecl; class ObjCPropertyImplDecl; class ObjCProtocolDecl; class ObjCTypeParamDecl; +class OMPTraitInfo; struct ParsedTargetAttr; class Preprocessor; class Stmt; @@ -2962,6 +2963,14 @@ OPT_LIST(V) }; llvm::StringMap SectionInfos; + + /// Return a new OMPTraitInfo object owned by this context. + OMPTraitInfo &getNewOMPTraitInfo(); + +private: + /// All OMPTraitInfo objects live in this collection, one per + /// `pragma omp [begin] declare variant` directive. + SmallVector OMPTraitInfoVector; }; /// Utility function for constructing a nullary selector. diff --git a/clang/include/clang/AST/ASTDumperUtils.h b/clang/include/clang/AST/ASTDumperUtils.h index 55a085449a9b2..1dce913049ad6 100644 --- a/clang/include/clang/AST/ASTDumperUtils.h +++ b/clang/include/clang/AST/ASTDumperUtils.h @@ -62,6 +62,8 @@ static const TerminalColor LocationColor = {llvm::raw_ostream::YELLOW, false}; static const TerminalColor ValueKindColor = {llvm::raw_ostream::CYAN, false}; // bitfield/objcproperty/objcsubscript/vectorcomponent static const TerminalColor ObjectKindColor = {llvm::raw_ostream::CYAN, false}; +// contains-errors +static const TerminalColor ErrorsColor = {llvm::raw_ostream::RED, true}; // Null statements static const TerminalColor NullColor = {llvm::raw_ostream::BLUE, false}; diff --git a/clang/include/clang/AST/CXXInheritance.h b/clang/include/clang/AST/CXXInheritance.h index f223c1f2f4f0a..8b1bcb367b3b4 100644 --- a/clang/include/clang/AST/CXXInheritance.h +++ b/clang/include/clang/AST/CXXInheritance.h @@ -119,7 +119,7 @@ class CXXBasePaths { friend class CXXRecordDecl; /// The type from which this search originated. - CXXRecordDecl *Origin = nullptr; + const CXXRecordDecl *Origin = nullptr; /// Paths - The actual set of paths that can be taken from the /// derived class to the same base class. 
@@ -225,8 +225,8 @@ class CXXBasePaths { /// Retrieve the type from which this base-paths search /// began - CXXRecordDecl *getOrigin() const { return Origin; } - void setOrigin(CXXRecordDecl *Rec) { Origin = Rec; } + const CXXRecordDecl *getOrigin() const { return Origin; } + void setOrigin(const CXXRecordDecl *Rec) { Origin = Rec; } /// Clear the base-paths results. void clear(); diff --git a/clang/include/clang/AST/ComputeDependence.h b/clang/include/clang/AST/ComputeDependence.h index 593ff3a6eb163..02f826438d4df 100644 --- a/clang/include/clang/AST/ComputeDependence.h +++ b/clang/include/clang/AST/ComputeDependence.h @@ -45,6 +45,7 @@ class ExtVectorElementExpr; class BlockExpr; class AsTypeExpr; class DeclRefExpr; +class RecoveryExpr; class CXXRewrittenBinaryOperator; class CXXStdInitializerListExpr; class CXXTypeidExpr; @@ -59,6 +60,7 @@ class CXXDeleteExpr; class ArrayTypeTraitExpr; class ExpressionTraitExpr; class CXXNoexceptExpr; +class PackExpansionExpr; class SubstNonTypeTemplateParmExpr; class CoroutineSuspendExpr; class DependentCoawaitExpr; @@ -71,6 +73,7 @@ class LambdaExpr; class CXXUnresolvedConstructExpr; class CXXDependentScopeMemberExpr; class MaterializeTemporaryExpr; +class CXXFoldExpr; class TypeTraitExpr; class ConceptSpecializationExpr; class PredefinedExpr; @@ -120,6 +123,7 @@ ExprDependence computeDependence(ExtVectorElementExpr *E); ExprDependence computeDependence(BlockExpr *E); ExprDependence computeDependence(AsTypeExpr *E); ExprDependence computeDependence(DeclRefExpr *E, const ASTContext &Ctx); +ExprDependence computeDependence(RecoveryExpr *E); ExprDependence computeDependence(CXXRewrittenBinaryOperator *E); ExprDependence computeDependence(CXXStdInitializerListExpr *E); ExprDependence computeDependence(CXXTypeidExpr *E); @@ -134,6 +138,7 @@ ExprDependence computeDependence(CXXDeleteExpr *E); ExprDependence computeDependence(ArrayTypeTraitExpr *E); ExprDependence computeDependence(ExpressionTraitExpr *E); ExprDependence computeDependence(CXXNoexceptExpr *E, CanThrowResult CT); +ExprDependence computeDependence(PackExpansionExpr *E); ExprDependence computeDependence(SubstNonTypeTemplateParmExpr *E); ExprDependence computeDependence(CoroutineSuspendExpr *E); ExprDependence computeDependence(DependentCoawaitExpr *E); @@ -149,6 +154,7 @@ ExprDependence computeDependence(LambdaExpr *E, ExprDependence computeDependence(CXXUnresolvedConstructExpr *E); ExprDependence computeDependence(CXXDependentScopeMemberExpr *E); ExprDependence computeDependence(MaterializeTemporaryExpr *E); +ExprDependence computeDependence(CXXFoldExpr *E); ExprDependence computeDependence(TypeTraitExpr *E); ExprDependence computeDependence(ConceptSpecializationExpr *E, bool ValueDependent); diff --git a/clang/include/clang/AST/DependenceFlags.h b/clang/include/clang/AST/DependenceFlags.h index 21daf0a203ac8..788227156c4d9 100644 --- a/clang/include/clang/AST/DependenceFlags.h +++ b/clang/include/clang/AST/DependenceFlags.h @@ -20,19 +20,23 @@ struct ExprDependenceScope { Type = 4, Value = 8, + // clang extension: this expr contains or references an error, and is + // considered dependent on how that error is resolved. 
+ Error = 16, + None = 0, - All = 15, + All = 31, TypeValue = Type | Value, TypeInstantiation = Type | Instantiation, ValueInstantiation = Value | Instantiation, TypeValueInstantiation = Type | Value | Instantiation, - LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/Value) + LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/Error) }; }; using ExprDependence = ExprDependenceScope::ExprDependence; -static constexpr unsigned ExprDependenceBits = 4; +static constexpr unsigned ExprDependenceBits = 5; struct TypeDependenceScope { enum TypeDependence : uint8_t { @@ -47,6 +51,8 @@ struct TypeDependenceScope { /// Whether this type is a variably-modified type (C99 6.7.5). VariablyModified = 8, + // FIXME: add Error bit. + None = 0, All = 15, @@ -83,11 +89,14 @@ LLVM_COMMON_DEPENDENCE(TemplateArgumentDependence) /// Computes dependencies of a reference with the name having template arguments /// with \p TA dependencies. inline ExprDependence toExprDependence(TemplateArgumentDependence TA) { - auto E = - static_cast(TA & ~TemplateArgumentDependence::Dependent); + auto D = ExprDependence::None; + if (TA & TemplateArgumentDependence::UnexpandedPack) + D |= ExprDependence::UnexpandedPack; + if (TA & TemplateArgumentDependence::Instantiation) + D |= ExprDependence::Instantiation; if (TA & TemplateArgumentDependence::Dependent) - return E | ExprDependence::Type | ExprDependence::Value; - return E; + D |= ExprDependence::Type | ExprDependence::Value; + return D; } inline ExprDependence toExprDependence(TypeDependence TD) { // This hack works because TypeDependence and TemplateArgumentDependence @@ -106,6 +115,32 @@ inline ExprDependence turnTypeToValueDependence(ExprDependence D) { // type dependency. return D & ~ExprDependence::Type; } +inline ExprDependence turnValueToTypeDependence(ExprDependence D) { + // Type-dependent expressions are always be value-dependent. + if (D & ExprDependence::Value) + D |= ExprDependence::Type; + return D; +} + +// Returned type-dependence will never have VariablyModified set. +inline TypeDependence toTypeDependence(ExprDependence D) { + // Supported bits all have the same representation. + return static_cast(D & (ExprDependence::UnexpandedPack | + ExprDependence::Instantiation | + ExprDependence::Type)); +} +inline TypeDependence toTypeDependence(NestedNameSpecifierDependence D) { + // Supported bits all have the same representation. + return static_cast(D); +} +inline TypeDependence toTypeDependence(TemplateNameDependence D) { + // Supported bits all have the same representation. + return static_cast(D); +} +inline TypeDependence toTypeDependence(TemplateArgumentDependence D) { + // Supported bits all have the same representation. 
+ return static_cast(D); +} inline NestedNameSpecifierDependence toNestedNameSpecifierDependendence(TypeDependence D) { @@ -127,10 +162,13 @@ toTemplateArgumentDependence(TemplateNameDependence D) { } inline TemplateArgumentDependence toTemplateArgumentDependence(ExprDependence ED) { - TemplateArgumentDependence TAD = static_cast( - ED & ~(ExprDependence::Type | ExprDependence::Value)); + TemplateArgumentDependence TAD = TemplateArgumentDependence::None; if (ED & (ExprDependence::Type | ExprDependence::Value)) TAD |= TemplateArgumentDependence::Dependent; + if (ED & ExprDependence::Instantiation) + TAD |= TemplateArgumentDependence::Instantiation; + if (ED & ExprDependence::UnexpandedPack) + TAD |= TemplateArgumentDependence::UnexpandedPack; return TAD; } diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 4a1d2f91b8673..fa49182728194 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -130,6 +130,14 @@ class Expr : public ValueStmt { /// Construct an empty expression. explicit Expr(StmtClass SC, EmptyShell) : ValueStmt(SC) { } + /// Each concrete expr subclass is expected to compute its dependence and call + /// this in the constructor. + void setDependence(ExprDependence Deps) { + ExprBits.Dependent = static_cast(Deps); + } + friend class ASTImporter; // Sets dependence dircetly. + friend class ASTStmtReader; // Sets dependence dircetly. + public: QualType getType() const { return TR; } void setType(QualType t) { @@ -149,18 +157,6 @@ class Expr : public ValueStmt { return static_cast(ExprBits.Dependent); } - /// Each concrete expr subclass is expected to compute its dependence and call - /// this in the constructor. - void setDependence(ExprDependence Deps) { - ExprBits.Dependent = static_cast(Deps); - } - void addDependence(ExprDependence Deps) { - ExprBits.Dependent |= static_cast(Deps); - } - void removeDependence(ExprDependence Deps) { - ExprBits.Dependent &= ~static_cast(Deps); - } - /// isValueDependent - Determines whether this expression is /// value-dependent (C++ [temp.dep.constexpr]). For example, the /// array bound of "Chars" in the following example is @@ -226,6 +222,12 @@ class Expr : public ValueStmt { return static_cast(getDependence() & ExprDependence::UnexpandedPack); } + /// Whether this expression contains subexpressions which had errors, e.g. a + /// TypoExpr. + bool containsErrors() const { + return static_cast(getDependence() & ExprDependence::Error); + } + /// getExprLoc - Return the preferred location for the arrow when diagnosing /// a problem with a generic expression. SourceLocation getExprLoc() const LLVM_READONLY; @@ -1054,6 +1056,9 @@ class ConstantExpr final bool isImmediateInvocation() const { return ConstantExprBits.IsImmediateInvocation; } + bool hasAPValueResult() const { + return ConstantExprBits.APValueKind != APValue::None; + } APValue getAPValueResult() const; APValue &getResultAsAPValue() const { return APValueResult(); } llvm::APSInt getResultAsAPSInt() const; @@ -1888,22 +1893,20 @@ class StringLiteral final } }; -union PredefExprStorage { - Stmt *S; - Expr *E; - TypeSourceInfo *T; -}; - /// [C99 6.4.2.2] - A predefined identifier such as __func__. class PredefinedExpr final : public Expr, - private llvm::TrailingObjects { + private llvm::TrailingObjects { friend class ASTStmtReader; friend TrailingObjects; // PredefinedExpr is optionally followed by a single trailing // "Stmt *" for the predefined identifier. 
It is present if and only if // hasFunctionName() is true and is always a "StringLiteral *". + // It can also be followed by a Expr* in the case of a + // __builtin_unique_stable_name with an expression, or TypeSourceInfo * if + // __builtin_unique_stable_name with a type. public: enum IdentKind { @@ -1937,32 +1940,41 @@ class PredefinedExpr final void setFunctionName(StringLiteral *SL) { assert(hasFunctionName() && "This PredefinedExpr has no storage for a function name!"); - getTrailingObjects()->S = SL; + *getTrailingObjects() = SL; } void setTypeSourceInfo(TypeSourceInfo *Info) { assert(!hasFunctionName() && getIdentKind() == UniqueStableNameType && "TypeSourceInfo only valid for UniqueStableName of a Type"); - getTrailingObjects()->T = Info; + *getTrailingObjects() = Info; } void setExpr(Expr *E) { assert(!hasFunctionName() && getIdentKind() == UniqueStableNameExpr && - "Expr only valid for UniqueStableName of an Expression."); - getTrailingObjects()->E = E; + "TypeSourceInfo only valid for UniqueStableName of n Expression."); + *getTrailingObjects() = E; + } + + size_t numTrailingObjects(OverloadToken) const { + return hasFunctionName(); + } + + size_t numTrailingObjects(OverloadToken) const { + return getIdentKind() == UniqueStableNameType && !hasFunctionName(); + } + size_t numTrailingObjects(OverloadToken) const { + return getIdentKind() == UniqueStableNameExpr && !hasFunctionName(); } public: /// Create a PredefinedExpr. static PredefinedExpr *Create(const ASTContext &Ctx, SourceLocation L, QualType FNTy, IdentKind IK, StringLiteral *SL); - static PredefinedExpr *Create(const ASTContext &Ctx, SourceLocation L, - QualType FnTy, IdentKind IK, StringLiteral *SL, + QualType FNTy, IdentKind IK, StringLiteral *SL, TypeSourceInfo *Info); - static PredefinedExpr *Create(const ASTContext &Ctx, SourceLocation L, - QualType FnTy, IdentKind IK, StringLiteral *SL, + QualType FNTy, IdentKind IK, StringLiteral *SL, Expr *E); /// Create an empty PredefinedExpr. @@ -1976,47 +1988,45 @@ class PredefinedExpr final SourceLocation getLocation() const { return PredefinedExprBits.Loc; } void setLocation(SourceLocation L) { PredefinedExprBits.Loc = L; } + StringLiteral *getFunctionName() { + return hasFunctionName() + ? static_cast(*getTrailingObjects()) + : nullptr; + } + + const StringLiteral *getFunctionName() const { + return hasFunctionName() + ? static_cast(*getTrailingObjects()) + : nullptr; + } + TypeSourceInfo *getTypeSourceInfo() { assert(!hasFunctionName() && getIdentKind() == UniqueStableNameType && "TypeSourceInfo only valid for UniqueStableName of a Type"); - return getTrailingObjects()->T; + return *getTrailingObjects(); } const TypeSourceInfo *getTypeSourceInfo() const { assert(!hasFunctionName() && getIdentKind() == UniqueStableNameType && "TypeSourceInfo only valid for UniqueStableName of a Type"); - return getTrailingObjects()->T; + return *getTrailingObjects(); } Expr *getExpr() { assert(!hasFunctionName() && getIdentKind() == UniqueStableNameExpr && - "Expr only valid for UniqueStableName of an Expression."); - return getTrailingObjects()->E; + "TypeSourceInfo only valid for UniqueStableName of n Expression."); + return *getTrailingObjects(); } const Expr *getExpr() const { assert(!hasFunctionName() && getIdentKind() == UniqueStableNameExpr && - "Expr only valid for UniqueStableName of an Expression."); - return getTrailingObjects()->E; - } - - - StringLiteral *getFunctionName() { - return hasFunctionName() ? 
static_cast( - getTrailingObjects()->S) - : nullptr; - } - - const StringLiteral *getFunctionName() const { - return hasFunctionName() ? static_cast( - getTrailingObjects()->S) - : nullptr; + "TypeSourceInfo only valid for UniqueStableName of n Expression."); + return *getTrailingObjects(); } static StringRef getIdentKindName(IdentKind IK); static std::string ComputeName(IdentKind IK, const Decl *CurrentDecl); - static std::string ComputeName(ASTContext &Ctx, IdentKind IK, - const QualType Ty); + static std::string ComputeName(ASTContext &Ctx, IdentKind IK, const QualType Ty); SourceLocation getBeginLoc() const { return getLocation(); } SourceLocation getEndLoc() const { return getLocation(); } @@ -2027,15 +2037,13 @@ class PredefinedExpr final // Iterators child_range children() { - return child_range(&getTrailingObjects()->S, - &getTrailingObjects()->S + - hasFunctionName()); + return child_range(getTrailingObjects(), + getTrailingObjects() + hasFunctionName()); } const_child_range children() const { - return const_child_range(&getTrailingObjects()->S, - &getTrailingObjects()->S + - hasFunctionName()); + return const_child_range(getTrailingObjects(), + getTrailingObjects() + hasFunctionName()); } }; @@ -2828,6 +2836,12 @@ class CallExpr : public Expr { /// a non-value-dependent constant parameter evaluating as false. bool isBuiltinAssumeFalse(const ASTContext &Ctx) const; + /// Used by Sema to implement MSVC-compatible delayed name lookup. + /// (Usually Exprs themselves should set dependence). + void markDependentForPostponedNameLookup() { + setDependence(getDependence() | ExprDependence::TypeValueInstantiation); + } + bool isCallToStdMove() const { const FunctionDecl *FD = getDirectCallee(); return getNumArgs() == 1 && FD && FD->isInStdNamespace() && @@ -4439,7 +4453,7 @@ class InitListExpr : public Expr { InitExprs[Init] = expr; if (expr) - addDependence(expr->getDependence()); + setDependence(getDependence() | expr->getDependence()); } /// Reserve space for some number of initializers. @@ -5942,7 +5956,8 @@ class TypoExpr : public Expr { public: TypoExpr(QualType T) : Expr(TypoExprClass, T, VK_LValue, OK_Ordinary) { assert(T->isDependentType() && "TypoExpr given a non-dependent type"); - setDependence(ExprDependence::TypeValueInstantiation); + setDependence(ExprDependence::TypeValueInstantiation | + ExprDependence::Error); } child_range children() { @@ -5960,6 +5975,69 @@ class TypoExpr : public Expr { } }; + +/// Frontend produces RecoveryExprs on semantic errors that prevent creating +/// other well-formed expressions. E.g. when type-checking of a binary operator +/// fails, we cannot produce a BinaryOperator expression. Instead, we can choose +/// to produce a recovery expression storing left and right operands. +/// +/// RecoveryExpr does not have any semantic meaning in C++, it is only useful to +/// preserve expressions in AST that would otherwise be dropped. It captures +/// subexpressions of some expression that we could not construct and source +/// range covered by the expression. +/// +/// For now, RecoveryExpr is type-, value- and instantiation-dependent to take +/// advantage of existing machinery to deal with dependent code in C++, e.g. +/// RecoveryExpr is preserved in `decltype()` as part of the +/// `DependentDecltypeType`. In addition to that, clang does not report most +/// errors on dependent expressions, so we get rid of bogus errors for free. +/// However, note that unlike other dependent expressions, RecoveryExpr can be +/// produced in non-template contexts. 
+/// +/// One can also reliably suppress all bogus errors on expressions containing +/// recovery expressions by examining results of Expr::containsErrors(). +class RecoveryExpr final : public Expr, + private llvm::TrailingObjects { +public: + static RecoveryExpr *Create(ASTContext &Ctx, SourceLocation BeginLoc, + SourceLocation EndLoc, ArrayRef SubExprs); + static RecoveryExpr *CreateEmpty(ASTContext &Ctx, unsigned NumSubExprs); + + ArrayRef subExpressions() { + auto *B = getTrailingObjects(); + return llvm::makeArrayRef(B, B + NumExprs); + } + + ArrayRef subExpressions() const { + return const_cast(this)->subExpressions(); + } + + child_range children() { + Stmt **B = reinterpret_cast(getTrailingObjects()); + return child_range(B, B + NumExprs); + } + + SourceLocation getBeginLoc() const { return BeginLoc; } + SourceLocation getEndLoc() const { return EndLoc; } + + static bool classof(const Stmt *T) { + return T->getStmtClass() == RecoveryExprClass; + } + +private: + RecoveryExpr(ASTContext &Ctx, SourceLocation BeginLoc, SourceLocation EndLoc, + ArrayRef SubExprs); + RecoveryExpr(EmptyShell Empty) : Expr(RecoveryExprClass, Empty) {} + + size_t numTrailingObjects(OverloadToken) const { return NumExprs; } + + SourceLocation BeginLoc, EndLoc; + unsigned NumExprs; + friend TrailingObjects; + friend class ASTStmtReader; + friend class ASTStmtWriter; +}; + } // end namespace clang #endif // LLVM_CLANG_AST_EXPR_H diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 33ea3f6346b22..e3404fec02dd5 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -4020,7 +4020,7 @@ class PackExpansionExpr : public Expr { EllipsisLoc(EllipsisLoc), NumExpansions(NumExpansions ? *NumExpansions + 1 : 0), Pattern(Pattern) { - setDependence(ExprDependence::TypeValueInstantiation); + setDependence(computeDependence(this)); } PackExpansionExpr(EmptyShell Empty) : Expr(PackExpansionExprClass, Empty) {} @@ -4531,7 +4531,7 @@ class CXXFoldExpr : public Expr { NumExpansions(NumExpansions ? 
*NumExpansions + 1 : 0), Opcode(Opcode) { SubExprs[0] = LHS; SubExprs[1] = RHS; - setDependence(ExprDependence::TypeValueInstantiation); + setDependence(computeDependence(this)); } CXXFoldExpr(EmptyShell Empty) : Expr(CXXFoldExprClass, Empty) {} diff --git a/clang/include/clang/AST/LocInfoType.h b/clang/include/clang/AST/LocInfoType.h index 1073174bcf913..7e845ad03587c 100644 --- a/clang/include/clang/AST/LocInfoType.h +++ b/clang/include/clang/AST/LocInfoType.h @@ -35,10 +35,7 @@ class LocInfoType : public Type { TypeSourceInfo *DeclInfo; LocInfoType(QualType ty, TypeSourceInfo *TInfo) - : Type((TypeClass)LocInfo, ty, ty->isDependentType(), - ty->isInstantiationDependentType(), ty->isVariablyModifiedType(), - ty->containsUnexpandedParameterPack()), - DeclInfo(TInfo) { + : Type((TypeClass)LocInfo, ty, ty->getDependence()), DeclInfo(TInfo) { assert(getTypeClass() == (TypeClass)LocInfo && "LocInfo didn't fit in TC?"); } friend class Sema; diff --git a/clang/include/clang/AST/Mangle.h b/clang/include/clang/AST/Mangle.h index d6e2516a709b7..2cbe6e3895bd7 100644 --- a/clang/include/clang/AST/Mangle.h +++ b/clang/include/clang/AST/Mangle.h @@ -172,19 +172,17 @@ class ItaniumMangleContext : public MangleContext { virtual void mangleCXXDtorComdat(const CXXDestructorDecl *D, raw_ostream &) = 0; - bool isUniqueNameMangler() { return IsUniqueNameMangler; } - virtual void mangleLambdaSig(const CXXRecordDecl *Lambda, raw_ostream &) = 0; + bool isUniqueNameMangler() { return IsUniqueNameMangler; } + static bool classof(const MangleContext *C) { return C->getKind() == MK_Itanium; } - static ItaniumMangleContext *create(ASTContext &Context, - DiagnosticsEngine &Diags); static ItaniumMangleContext *create(ASTContext &Context, DiagnosticsEngine &Diags, - bool IsUniqueNameMangler); + bool IsUniqueNameMangler = false); }; class MicrosoftMangleContext : public MangleContext { diff --git a/clang/include/clang/AST/OpenMPClause.h b/clang/include/clang/AST/OpenMPClause.h index e82a5f09a32d1..29c251ef7ee69 100644 --- a/clang/include/clang/AST/OpenMPClause.h +++ b/clang/include/clang/AST/OpenMPClause.h @@ -2703,6 +2703,12 @@ class OMPReductionClause final friend OMPVarListClause; friend TrailingObjects; + /// Reduction modifier. + OpenMPReductionClauseModifier Modifier = OMPC_REDUCTION_unknown; + + /// Reduction modifier location. + SourceLocation ModifierLoc; + /// Location of ':'. SourceLocation ColonLoc; @@ -2716,18 +2722,22 @@ class OMPReductionClause final /// /// \param StartLoc Starting location of the clause. /// \param LParenLoc Location of '('. - /// \param EndLoc Ending location of the clause. + /// \param ModifierLoc Modifier location. /// \param ColonLoc Location of ':'. + /// \param EndLoc Ending location of the clause. /// \param N Number of the variables in the clause. /// \param QualifierLoc The nested-name qualifier with location information /// \param NameInfo The full name info for reduction identifier. 
OMPReductionClause(SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation ColonLoc, SourceLocation EndLoc, unsigned N, + SourceLocation ModifierLoc, SourceLocation ColonLoc, + SourceLocation EndLoc, + OpenMPReductionClauseModifier Modifier, unsigned N, NestedNameSpecifierLoc QualifierLoc, const DeclarationNameInfo &NameInfo) : OMPVarListClause(OMPC_reduction, StartLoc, LParenLoc, EndLoc, N), - OMPClauseWithPostUpdate(this), ColonLoc(ColonLoc), + OMPClauseWithPostUpdate(this), Modifier(Modifier), + ModifierLoc(ModifierLoc), ColonLoc(ColonLoc), QualifierLoc(QualifierLoc), NameInfo(NameInfo) {} /// Build an empty clause. @@ -2739,6 +2749,12 @@ class OMPReductionClause final N), OMPClauseWithPostUpdate(this) {} + /// Sets reduction modifier. + void setModifier(OpenMPReductionClauseModifier M) { Modifier = M; } + + /// Sets location of the modifier. + void setModifierLoc(SourceLocation Loc) { ModifierLoc = Loc; } + /// Sets location of ':' symbol in clause. void setColonLoc(SourceLocation CL) { ColonLoc = CL; } @@ -2808,6 +2824,7 @@ class OMPReductionClause final /// /// \param StartLoc Starting location of the clause. /// \param LParenLoc Location of '('. + /// \param ModifierLoc Modifier location. /// \param ColonLoc Location of ':'. /// \param EndLoc Ending location of the clause. /// \param VL The variables in the clause. @@ -2838,8 +2855,9 @@ class OMPReductionClause final /// OpenMP region with this clause. static OMPReductionClause * Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation ColonLoc, SourceLocation EndLoc, ArrayRef VL, - NestedNameSpecifierLoc QualifierLoc, + SourceLocation ModifierLoc, SourceLocation ColonLoc, + SourceLocation EndLoc, OpenMPReductionClauseModifier Modifier, + ArrayRef VL, NestedNameSpecifierLoc QualifierLoc, const DeclarationNameInfo &NameInfo, ArrayRef Privates, ArrayRef LHSExprs, ArrayRef RHSExprs, ArrayRef ReductionOps, Stmt *PreInit, Expr *PostUpdate); @@ -2850,6 +2868,12 @@ class OMPReductionClause final /// \param N The number of variables. static OMPReductionClause *CreateEmpty(const ASTContext &C, unsigned N); + /// Returns modifier. + OpenMPReductionClauseModifier getModifier() const { return Modifier; } + + /// Returns modifier location. + SourceLocation getModifierLoc() const { return ModifierLoc; } + /// Gets location of ':' symbol in clause. SourceLocation getColonLoc() const { return ColonLoc; } @@ -6911,6 +6935,154 @@ class OMPDetachClause final : public OMPClause { } }; +/// This represents clause 'inclusive' in the '#pragma omp scan' directive. +/// +/// \code +/// #pragma omp scan inclusive(a,b) +/// \endcode +/// In this example directive '#pragma omp scan' has clause 'inclusive' +/// with the variables 'a' and 'b'. +class OMPInclusiveClause final + : public OMPVarListClause, + private llvm::TrailingObjects { + friend class OMPClauseReader; + friend OMPVarListClause; + friend TrailingObjects; + + /// Build clause with number of variables \a N. + /// + /// \param StartLoc Starting location of the clause. + /// \param LParenLoc Location of '('. + /// \param EndLoc Ending location of the clause. + /// \param N Number of the variables in the clause. + OMPInclusiveClause(SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation EndLoc, unsigned N) + : OMPVarListClause(OMPC_inclusive, StartLoc, + LParenLoc, EndLoc, N) {} + + /// Build an empty clause. + /// + /// \param N Number of variables. 
+ explicit OMPInclusiveClause(unsigned N) + : OMPVarListClause(OMPC_inclusive, SourceLocation(), + SourceLocation(), SourceLocation(), + N) {} + +public: + /// Creates clause with a list of variables \a VL. + /// + /// \param C AST context. + /// \param StartLoc Starting location of the clause. + /// \param LParenLoc Location of '('. + /// \param EndLoc Ending location of the clause. + /// \param VL List of references to the original variables. + static OMPInclusiveClause *Create(const ASTContext &C, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc, ArrayRef VL); + + /// Creates an empty clause with the place for \a N variables. + /// + /// \param C AST context. + /// \param N The number of variables. + static OMPInclusiveClause *CreateEmpty(const ASTContext &C, unsigned N); + + child_range children() { + return child_range(reinterpret_cast(varlist_begin()), + reinterpret_cast(varlist_end())); + } + + const_child_range children() const { + auto Children = const_cast(this)->children(); + return const_child_range(Children.begin(), Children.end()); + } + + child_range used_children() { + return child_range(child_iterator(), child_iterator()); + } + const_child_range used_children() const { + return const_child_range(const_child_iterator(), const_child_iterator()); + } + + static bool classof(const OMPClause *T) { + return T->getClauseKind() == OMPC_inclusive; + } +}; + +/// This represents clause 'exclusive' in the '#pragma omp scan' directive. +/// +/// \code +/// #pragma omp scan exclusive(a,b) +/// \endcode +/// In this example directive '#pragma omp scan' has clause 'exclusive' +/// with the variables 'a' and 'b'. +class OMPExclusiveClause final + : public OMPVarListClause, + private llvm::TrailingObjects { + friend class OMPClauseReader; + friend OMPVarListClause; + friend TrailingObjects; + + /// Build clause with number of variables \a N. + /// + /// \param StartLoc Starting location of the clause. + /// \param LParenLoc Location of '('. + /// \param EndLoc Ending location of the clause. + /// \param N Number of the variables in the clause. + OMPExclusiveClause(SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation EndLoc, unsigned N) + : OMPVarListClause(OMPC_exclusive, StartLoc, + LParenLoc, EndLoc, N) {} + + /// Build an empty clause. + /// + /// \param N Number of variables. + explicit OMPExclusiveClause(unsigned N) + : OMPVarListClause(OMPC_exclusive, SourceLocation(), + SourceLocation(), SourceLocation(), + N) {} + +public: + /// Creates clause with a list of variables \a VL. + /// + /// \param C AST context. + /// \param StartLoc Starting location of the clause. + /// \param LParenLoc Location of '('. + /// \param EndLoc Ending location of the clause. + /// \param VL List of references to the original variables. + static OMPExclusiveClause *Create(const ASTContext &C, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc, ArrayRef VL); + + /// Creates an empty clause with the place for \a N variables. + /// + /// \param C AST context. + /// \param N The number of variables. 
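The new reduction modifier together with the 'inclusive' and 'exclusive' clauses models OpenMP 5.0 inscan reductions. A usage sketch of the source construct these AST nodes represent (assumes an OpenMP 5.0 toolchain and -fopenmp; not taken from this patch):

    // Inclusive prefix sum: statements before '#pragma omp scan' form the
    // input phase, statements after it form the scan phase, in which 'sum'
    // holds the inclusive running total for iteration i.
    void prefix_sums(const int *x, int *out, int n) {
      int sum = 0;
    #pragma omp simd reduction(inscan, + : sum)
      for (int i = 0; i < n; ++i) {
        sum += x[i];
    #pragma omp scan inclusive(sum)
        out[i] = sum;
      }
    }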
+ static OMPExclusiveClause *CreateEmpty(const ASTContext &C, unsigned N); + + child_range children() { + return child_range(reinterpret_cast(varlist_begin()), + reinterpret_cast(varlist_end())); + } + + const_child_range children() const { + auto Children = const_cast(this)->children(); + return const_child_range(Children.begin(), Children.end()); + } + + child_range used_children() { + return child_range(child_iterator(), child_iterator()); + } + const_child_range used_children() const { + return const_child_range(const_child_iterator(), const_child_iterator()); + } + + static bool classof(const OMPClause *T) { + return T->getClauseKind() == OMPC_exclusive; + } +}; + /// This class implements a simple visitor for OMPClause /// subclasses. template class Ptr, typename RetTy> @@ -6968,28 +7140,33 @@ class OMPClausePrinter final : public OMPClauseVisitor { /// collection of selector sets, each with an associated kind and an ordered /// collection of selectors. A selector has a kind, an optional score/condition, /// and an ordered collection of properties. -struct OMPTraitInfo { +class OMPTraitInfo { + /// Private constructor accesible only by ASTContext. + OMPTraitInfo() {} + friend class ASTContext; + +public: struct OMPTraitProperty { llvm::omp::TraitProperty Kind = llvm::omp::TraitProperty::invalid; }; struct OMPTraitSelector { Expr *ScoreOrCondition = nullptr; llvm::omp::TraitSelector Kind = llvm::omp::TraitSelector::invalid; - llvm::SmallVector Properties; + llvm::SmallVector Properties; }; struct OMPTraitSet { llvm::omp::TraitSet Kind = llvm::omp::TraitSet::invalid; - llvm::SmallVector Selectors; + llvm::SmallVector Selectors; }; /// The outermost level of selector sets. - llvm::SmallVector Sets; + llvm::SmallVector Sets; bool anyScoreOrCondition( - const llvm::function_ref &Cond) { - return llvm::any_of(Sets, [&Cond](OMPTraitInfo::OMPTraitSet &Set) { + llvm::function_ref Cond) { + return llvm::any_of(Sets, [&](OMPTraitInfo::OMPTraitSet &Set) { return llvm::any_of( - Set.Selectors, [&Cond](OMPTraitInfo::OMPTraitSelector &Selector) { + Set.Selectors, [&](OMPTraitInfo::OMPTraitSelector &Selector) { return Cond(Selector.ScoreOrCondition, /* IsScore */ Selector.Kind != llvm::omp::TraitSelector::user_condition); @@ -7009,6 +7186,7 @@ struct OMPTraitInfo { void print(llvm::raw_ostream &OS, const PrintingPolicy &Policy) const; }; llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const OMPTraitInfo &TI); +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const OMPTraitInfo *TI); } // namespace clang diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 27a0bc774184a..2f598564f3077 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -2668,6 +2668,7 @@ DEF_TRAVERSE_STMT(CXXRewrittenBinaryOperator, { }) DEF_TRAVERSE_STMT(OpaqueValueExpr, {}) DEF_TRAVERSE_STMT(TypoExpr, {}) +DEF_TRAVERSE_STMT(RecoveryExpr, {}) DEF_TRAVERSE_STMT(CUDAKernelCallExpr, {}) // These operators (all of them) do not need any action except @@ -2852,6 +2853,9 @@ DEF_TRAVERSE_STMT(OMPFlushDirective, DEF_TRAVERSE_STMT(OMPDepobjDirective, { TRY_TO(TraverseOMPExecutableDirective(S)); }) +DEF_TRAVERSE_STMT(OMPScanDirective, + { TRY_TO(TraverseOMPExecutableDirective(S)); }) + DEF_TRAVERSE_STMT(OMPOrderedDirective, { TRY_TO(TraverseOMPExecutableDirective(S)); }) @@ -3180,6 +3184,20 @@ bool RecursiveASTVisitor::VisitOMPClauseList(T *Node) { return true; } +template +bool 
RecursiveASTVisitor::VisitOMPInclusiveClause( + OMPInclusiveClause *C) { + TRY_TO(VisitOMPClauseList(C)); + return true; +} + +template +bool RecursiveASTVisitor::VisitOMPExclusiveClause( + OMPExclusiveClause *C) { + TRY_TO(VisitOMPClauseList(C)); + return true; +} + template bool RecursiveASTVisitor::VisitOMPPrivateClause(OMPPrivateClause *C) { TRY_TO(VisitOMPClauseList(C)); diff --git a/clang/include/clang/AST/StmtOpenMP.h b/clang/include/clang/AST/StmtOpenMP.h index 5f7589acdb9e3..b390bf0042f97 100644 --- a/clang/include/clang/AST/StmtOpenMP.h +++ b/clang/include/clang/AST/StmtOpenMP.h @@ -4688,6 +4688,63 @@ class OMPTargetTeamsDistributeSimdDirective final : public OMPLoopDirective { } }; +/// This represents '#pragma omp scan' directive. +/// +/// \code +/// #pragma omp scan inclusive(a) +/// \endcode +/// In this example directive '#pragma omp scan' has clause 'inclusive' with +/// list item 'a'. +class OMPScanDirective final : public OMPExecutableDirective { + friend class ASTStmtReader; + /// Build directive with the given start and end location. + /// + /// \param StartLoc Starting location of the directive kind. + /// \param EndLoc Ending location of the directive. + /// \param NumClauses Number of clauses. + /// + OMPScanDirective(SourceLocation StartLoc, SourceLocation EndLoc, + unsigned NumClauses) + : OMPExecutableDirective(this, OMPScanDirectiveClass, + llvm::omp::OMPD_scan, StartLoc, EndLoc, + NumClauses, 0) {} + + /// Build an empty directive. + /// + /// \param NumClauses Number of clauses. + /// + explicit OMPScanDirective(unsigned NumClauses) + : OMPExecutableDirective(this, OMPScanDirectiveClass, + llvm::omp::OMPD_scan, SourceLocation(), + SourceLocation(), NumClauses, 0) {} + +public: + /// Creates directive with a list of \a Clauses. + /// + /// \param C AST context. + /// \param StartLoc Starting location of the directive kind. + /// \param EndLoc Ending Location of the directive. + /// \param Clauses List of clauses (only single OMPFlushClause clause is + /// allowed). + /// + static OMPScanDirective *Create(const ASTContext &C, SourceLocation StartLoc, + SourceLocation EndLoc, + ArrayRef Clauses); + + /// Creates an empty directive with the place for \a NumClauses + /// clauses. + /// + /// \param C AST context. + /// \param NumClauses Number of clauses. + /// + static OMPScanDirective *CreateEmpty(const ASTContext &C, unsigned NumClauses, + EmptyShell); + + static bool classof(const Stmt *T) { + return T->getStmtClass() == OMPScanDirectiveClass; + } +}; + } // end namespace clang #endif diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 66722873477b7..85a7d682f5c8c 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -1504,7 +1504,7 @@ class alignas(8) Type : public ExtQualsTypeCommonBase { return CachedLocalOrUnnamed; } }; - enum { NumTypeBits = 18 }; + enum { NumTypeBits = 8 + TypeDependenceBits + 6 }; protected: // These classes allow subclasses to somewhat cleanly pack bitfields @@ -1554,7 +1554,7 @@ class alignas(8) Type : public ExtQualsTypeCommonBase { /// Extra information which affects how the function is called, like /// regparm and the calling convention. - unsigned ExtInfo : 12; + unsigned ExtInfo : 13; /// The ref-qualifier associated with a \c FunctionProtoType. 
/// @@ -1826,23 +1826,11 @@ class alignas(8) Type : public ExtQualsTypeCommonBase { protected: friend class ASTContext; - Type(TypeClass tc, QualType canon, bool Dependent, - bool InstantiationDependent, bool VariablyModified, - bool ContainsUnexpandedParameterPack) + Type(TypeClass tc, QualType canon, TypeDependence Dependence) : ExtQualsTypeCommonBase(this, canon.isNull() ? QualType(this_(), 0) : canon) { - auto Deps = TypeDependence::None; - if (Dependent) - Deps |= TypeDependence::Dependent | TypeDependence::Instantiation; - if (InstantiationDependent) - Deps |= TypeDependence::Instantiation; - if (ContainsUnexpandedParameterPack) - Deps |= TypeDependence::UnexpandedPack; - if (VariablyModified) - Deps |= TypeDependence::VariablyModified; - TypeBits.TC = tc; - TypeBits.Dependence = static_cast(Deps); + TypeBits.Dependence = static_cast(Dependence); TypeBits.CacheValid = false; TypeBits.CachedLocalOrUnnamed = false; TypeBits.CachedLinkage = NoLinkage; @@ -1852,41 +1840,11 @@ class alignas(8) Type : public ExtQualsTypeCommonBase { // silence VC++ warning C4355: 'this' : used in base member initializer list Type *this_() { return this; } - void setDependent(bool D = true) { - if (!D) { - TypeBits.Dependence &= ~static_cast(TypeDependence::Dependent); - return; - } - TypeBits.Dependence |= static_cast(TypeDependence::Dependent | - TypeDependence::Instantiation); - } - - void setInstantiationDependent(bool D = true) { - if (D) - TypeBits.Dependence |= - static_cast(TypeDependence::Instantiation); - else - TypeBits.Dependence &= - ~static_cast(TypeDependence::Instantiation); - } - - void setVariablyModified(bool VM = true) { - if (VM) - TypeBits.Dependence |= - static_cast(TypeDependence::VariablyModified); - else - TypeBits.Dependence &= - ~static_cast(TypeDependence::VariablyModified); + void setDependence(TypeDependence D) { + TypeBits.Dependence = static_cast(D); } - void setContainsUnexpandedParameterPack(bool PP = true) { - if (PP) - TypeBits.Dependence |= - static_cast(TypeDependence::UnexpandedPack); - else - TypeBits.Dependence &= - ~static_cast(TypeDependence::UnexpandedPack); - } + void addDependence(TypeDependence D) { setDependence(getDependence() | D); } public: friend class ASTReader; @@ -2527,10 +2485,9 @@ class BuiltinType : public Type { friend class ASTContext; // ASTContext creates these. BuiltinType(Kind K) - : Type(Builtin, QualType(), /*Dependent=*/(K == Dependent), - /*InstantiationDependent=*/(K == Dependent), - /*VariablyModified=*/false, - /*Unexpanded parameter pack=*/false) { + : Type(Builtin, QualType(), + K == Dependent ? 
TypeDependence::DependentInstantiation + : TypeDependence::None) { BuiltinTypeBits.Kind = K; } @@ -2600,10 +2557,7 @@ class ComplexType : public Type, public llvm::FoldingSetNode { QualType ElementType; ComplexType(QualType Element, QualType CanonicalPtr) - : Type(Complex, CanonicalPtr, Element->isDependentType(), - Element->isInstantiationDependentType(), - Element->isVariablyModifiedType(), - Element->containsUnexpandedParameterPack()), + : Type(Complex, CanonicalPtr, Element->getDependence()), ElementType(Element) {} public: @@ -2630,11 +2584,7 @@ class ParenType : public Type, public llvm::FoldingSetNode { QualType Inner; ParenType(QualType InnerType, QualType CanonType) - : Type(Paren, CanonType, InnerType->isDependentType(), - InnerType->isInstantiationDependentType(), - InnerType->isVariablyModifiedType(), - InnerType->containsUnexpandedParameterPack()), - Inner(InnerType) {} + : Type(Paren, CanonType, InnerType->getDependence()), Inner(InnerType) {} public: QualType getInnerType() const { return Inner; } @@ -2660,10 +2610,7 @@ class PointerType : public Type, public llvm::FoldingSetNode { QualType PointeeType; PointerType(QualType Pointee, QualType CanonicalPtr) - : Type(Pointer, CanonicalPtr, Pointee->isDependentType(), - Pointee->isInstantiationDependentType(), - Pointee->isVariablyModifiedType(), - Pointee->containsUnexpandedParameterPack()), + : Type(Pointer, CanonicalPtr, Pointee->getDependence()), PointeeType(Pointee) {} public: @@ -2711,10 +2658,7 @@ class AdjustedType : public Type, public llvm::FoldingSetNode { AdjustedType(TypeClass TC, QualType OriginalTy, QualType AdjustedTy, QualType CanonicalPtr) - : Type(TC, CanonicalPtr, OriginalTy->isDependentType(), - OriginalTy->isInstantiationDependentType(), - OriginalTy->isVariablyModifiedType(), - OriginalTy->containsUnexpandedParameterPack()), + : Type(TC, CanonicalPtr, OriginalTy->getDependence()), OriginalTy(OriginalTy), AdjustedTy(AdjustedTy) {} public: @@ -2763,10 +2707,7 @@ class BlockPointerType : public Type, public llvm::FoldingSetNode { QualType PointeeType; BlockPointerType(QualType Pointee, QualType CanonicalCls) - : Type(BlockPointer, CanonicalCls, Pointee->isDependentType(), - Pointee->isInstantiationDependentType(), - Pointee->isVariablyModifiedType(), - Pointee->containsUnexpandedParameterPack()), + : Type(BlockPointer, CanonicalCls, Pointee->getDependence()), PointeeType(Pointee) {} public: @@ -2796,10 +2737,7 @@ class ReferenceType : public Type, public llvm::FoldingSetNode { protected: ReferenceType(TypeClass tc, QualType Referencee, QualType CanonicalRef, bool SpelledAsLValue) - : Type(tc, CanonicalRef, Referencee->isDependentType(), - Referencee->isInstantiationDependentType(), - Referencee->isVariablyModifiedType(), - Referencee->containsUnexpandedParameterPack()), + : Type(tc, CanonicalRef, Referencee->getDependence()), PointeeType(Referencee) { ReferenceTypeBits.SpelledAsLValue = SpelledAsLValue; ReferenceTypeBits.InnerRef = Referencee->isReferenceType(); @@ -2884,13 +2822,9 @@ class MemberPointerType : public Type, public llvm::FoldingSetNode { MemberPointerType(QualType Pointee, const Type *Cls, QualType CanonicalPtr) : Type(MemberPointer, CanonicalPtr, - Cls->isDependentType() || Pointee->isDependentType(), - (Cls->isInstantiationDependentType() || - Pointee->isInstantiationDependentType()), - Pointee->isVariablyModifiedType(), - (Cls->containsUnexpandedParameterPack() || - Pointee->containsUnexpandedParameterPack())), - PointeeType(Pointee), Class(Cls) {} + (Cls->getDependence() & 
~TypeDependence::VariablyModified) | + Pointee->getDependence()), + PointeeType(Pointee), Class(Cls) {} public: QualType getPointeeType() const { return PointeeType; } @@ -3576,12 +3510,12 @@ class FunctionType : public Type { class ExtInfo { friend class FunctionType; - // Feel free to rearrange or add bits, but if you go over 12, - // you'll need to adjust both the Bits field below and - // Type::FunctionTypeBitfields. + // Feel free to rearrange or add bits, but if you go over 16, you'll need to + // adjust the Bits field below, and if you add bits, you'll need to adjust + // Type::FunctionTypeBitfields::ExtInfo as well. - // | CC |noreturn|produces|nocallersavedregs|regparm|nocfcheck| - // |0 .. 4| 5 | 6 | 7 |8 .. 10| 11 | + // | CC |noreturn|produces|nocallersavedregs|regparm|nocfcheck|cmsenscall| + // |0 .. 4| 5 | 6 | 7 |8 .. 10| 11 | 12 | // // regparm is either 0 (no regparm attribute) or the regparm value+1. enum { CallConvMask = 0x1F }; @@ -3589,26 +3523,29 @@ class FunctionType : public Type { enum { ProducesResultMask = 0x40 }; enum { NoCallerSavedRegsMask = 0x80 }; enum { NoCfCheckMask = 0x800 }; + enum { CmseNSCallMask = 0x1000 }; enum { RegParmMask = ~(CallConvMask | NoReturnMask | ProducesResultMask | - NoCallerSavedRegsMask | NoCfCheckMask), + NoCallerSavedRegsMask | NoCfCheckMask | CmseNSCallMask), RegParmOffset = 8 }; // Assumed to be the last field uint16_t Bits = CC_C; ExtInfo(unsigned Bits) : Bits(static_cast(Bits)) {} - public: - // Constructor with no defaults. Use this when you know that you - // have all the elements (when reading an AST file for example). - ExtInfo(bool noReturn, bool hasRegParm, unsigned regParm, CallingConv cc, - bool producesResult, bool noCallerSavedRegs, bool NoCfCheck) { - assert((!hasRegParm || regParm < 7) && "Invalid regparm value"); - Bits = ((unsigned)cc) | (noReturn ? NoReturnMask : 0) | - (producesResult ? ProducesResultMask : 0) | - (noCallerSavedRegs ? NoCallerSavedRegsMask : 0) | - (hasRegParm ? ((regParm + 1) << RegParmOffset) : 0) | - (NoCfCheck ? NoCfCheckMask : 0); + public: + // Constructor with no defaults. Use this when you know that you + // have all the elements (when reading an AST file for example). + ExtInfo(bool noReturn, bool hasRegParm, unsigned regParm, CallingConv cc, + bool producesResult, bool noCallerSavedRegs, bool NoCfCheck, + bool cmseNSCall) { + assert((!hasRegParm || regParm < 7) && "Invalid regparm value"); + Bits = ((unsigned)cc) | (noReturn ? NoReturnMask : 0) | + (producesResult ? ProducesResultMask : 0) | + (noCallerSavedRegs ? NoCallerSavedRegsMask : 0) | + (hasRegParm ? ((regParm + 1) << RegParmOffset) : 0) | + (NoCfCheck ? NoCfCheckMask : 0) | + (cmseNSCall ? CmseNSCallMask : 0); } // Constructor with all defaults. 
Use when for example creating a @@ -3621,6 +3558,7 @@ class FunctionType : public Type { bool getNoReturn() const { return Bits & NoReturnMask; } bool getProducesResult() const { return Bits & ProducesResultMask; } + bool getCmseNSCall() const { return Bits & CmseNSCallMask; } bool getNoCallerSavedRegs() const { return Bits & NoCallerSavedRegsMask; } bool getNoCfCheck() const { return Bits & NoCfCheckMask; } bool getHasRegParm() const { return (Bits >> RegParmOffset) != 0; } @@ -3658,6 +3596,13 @@ class FunctionType : public Type { return ExtInfo(Bits & ~ProducesResultMask); } + ExtInfo withCmseNSCall(bool cmseNSCall) const { + if (cmseNSCall) + return ExtInfo(Bits | CmseNSCallMask); + else + return ExtInfo(Bits & ~CmseNSCallMask); + } + ExtInfo withNoCallerSavedRegs(bool noCallerSavedRegs) const { if (noCallerSavedRegs) return ExtInfo(Bits | NoCallerSavedRegsMask); @@ -3704,14 +3649,9 @@ class FunctionType : public Type { }; protected: - FunctionType(TypeClass tc, QualType res, - QualType Canonical, bool Dependent, - bool InstantiationDependent, - bool VariablyModified, bool ContainsUnexpandedParameterPack, - ExtInfo Info) - : Type(tc, Canonical, Dependent, InstantiationDependent, VariablyModified, - ContainsUnexpandedParameterPack), - ResultType(res) { + FunctionType(TypeClass tc, QualType res, QualType Canonical, + TypeDependence Dependence, ExtInfo Info) + : Type(tc, Canonical, Dependence), ResultType(res) { FunctionTypeBits.ExtInfo = Info.Bits; } @@ -3730,6 +3670,7 @@ class FunctionType : public Type { /// type. bool getNoReturnAttr() const { return getExtInfo().getNoReturn(); } + bool getCmseNSCallAttr() const { return getExtInfo().getCmseNSCall(); } CallingConv getCallConv() const { return getExtInfo().getCC(); } ExtInfo getExtInfo() const { return ExtInfo(FunctionTypeBits.ExtInfo); } @@ -3762,9 +3703,10 @@ class FunctionNoProtoType : public FunctionType, public llvm::FoldingSetNode { FunctionNoProtoType(QualType Result, QualType Canonical, ExtInfo Info) : FunctionType(FunctionNoProto, Result, Canonical, - /*Dependent=*/false, /*InstantiationDependent=*/false, - Result->isVariablyModifiedType(), - /*ContainsUnexpandedParameterPack=*/false, Info) {} + Result->getDependence() & + ~(TypeDependence::DependentInstantiation | + TypeDependence::UnexpandedPack), + Info) {} public: // No additional state past what FunctionType provides. @@ -4256,9 +4198,9 @@ class UnresolvedUsingType : public Type { UnresolvedUsingTypenameDecl *Decl; UnresolvedUsingType(const UnresolvedUsingTypenameDecl *D) - : Type(UnresolvedUsing, QualType(), true, true, false, - /*ContainsUnexpandedParameterPack=*/false), - Decl(const_cast(D)) {} + : Type(UnresolvedUsing, QualType(), + TypeDependence::DependentInstantiation), + Decl(const_cast(D)) {} public: UnresolvedUsingTypenameDecl *getDecl() const { return Decl; } @@ -4287,11 +4229,8 @@ class TypedefType : public Type { friend class ASTContext; // ASTContext creates these. 
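The Type constructors above now take a single TypeDependence bitmask and derive it from their operands with '|' and '& ~' instead of four separate booleans. A self-contained sketch of the bitmask-enum pattern this relies on; the enumerator names mirror TypeDependence, but the values and the helper are made up for illustration:

    #include <cstdint>

    enum class Dep : uint8_t {
      None = 0,
      UnexpandedPack = 1,
      Instantiation = 2,
      Dependent = 4,
      VariablyModified = 8,
    };
    constexpr Dep operator|(Dep A, Dep B) { return Dep(uint8_t(A) | uint8_t(B)); }
    constexpr Dep operator&(Dep A, Dep B) { return Dep(uint8_t(A) & uint8_t(B)); }
    constexpr Dep operator~(Dep A) { return Dep(~uint8_t(A) & 0xFu); }

    // A member pointer merges class and pointee dependence but drops the
    // "variably modified" bit contributed by the class side, mirroring the
    // MemberPointerType change above.
    constexpr Dep memberPointerDep(Dep Cls, Dep Pointee) {
      return (Cls & ~Dep::VariablyModified) | Pointee;
    }
    static_assert((memberPointerDep(Dep::VariablyModified, Dep::Dependent) &
                   Dep::VariablyModified) == Dep::None,
                  "the class's VariablyModified bit does not survive the merge");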
TypedefType(TypeClass tc, const TypedefNameDecl *D, QualType can) - : Type(tc, can, can->isDependentType(), - can->isInstantiationDependentType(), - can->isVariablyModifiedType(), - /*ContainsUnexpandedParameterPack=*/false), - Decl(const_cast(D)) { + : Type(tc, can, can->getDependence() & ~TypeDependence::UnexpandedPack), + Decl(const_cast(D)) { assert(!isa(can) && "Invalid canonical type"); } @@ -4314,10 +4253,7 @@ class MacroQualifiedType : public Type { MacroQualifiedType(QualType UnderlyingTy, QualType CanonTy, const IdentifierInfo *MacroII) - : Type(MacroQualified, CanonTy, UnderlyingTy->isDependentType(), - UnderlyingTy->isInstantiationDependentType(), - UnderlyingTy->isVariablyModifiedType(), - UnderlyingTy->containsUnexpandedParameterPack()), + : Type(MacroQualified, CanonTy, UnderlyingTy->getDependence()), UnderlyingTy(UnderlyingTy), MacroII(MacroII) { assert(isa(UnderlyingTy) && "Expected a macro qualified type to only wrap attributed types."); @@ -4389,11 +4325,7 @@ class TypeOfType : public Type { QualType TOType; TypeOfType(QualType T, QualType can) - : Type(TypeOf, can, T->isDependentType(), - T->isInstantiationDependentType(), - T->isVariablyModifiedType(), - T->containsUnexpandedParameterPack()), - TOType(T) { + : Type(TypeOf, can, T->getDependence()), TOType(T) { assert(!isa(can) && "Invalid canonical type"); } @@ -4602,10 +4534,7 @@ class AttributedType : public Type, public llvm::FoldingSetNode { AttributedType(QualType canon, attr::Kind attrKind, QualType modified, QualType equivalent) - : Type(Attributed, canon, equivalent->isDependentType(), - equivalent->isInstantiationDependentType(), - equivalent->isVariablyModifiedType(), - equivalent->containsUnexpandedParameterPack()), + : Type(Attributed, canon, equivalent->getDependence()), ModifiedType(modified), EquivalentType(equivalent) { AttributedTypeBits.AttrKind = attrKind; } @@ -4707,18 +4636,16 @@ class TemplateTypeParmType : public Type, public llvm::FoldingSetNode { /// Build a non-canonical type. TemplateTypeParmType(TemplateTypeParmDecl *TTPDecl, QualType Canon) - : Type(TemplateTypeParm, Canon, /*Dependent=*/true, - /*InstantiationDependent=*/true, - /*VariablyModified=*/false, - Canon->containsUnexpandedParameterPack()), + : Type(TemplateTypeParm, Canon, + TypeDependence::DependentInstantiation | + (Canon->getDependence() & TypeDependence::UnexpandedPack)), TTPDecl(TTPDecl) {} /// Build the canonical type. TemplateTypeParmType(unsigned D, unsigned I, bool PP) : Type(TemplateTypeParm, QualType(this, 0), - /*Dependent=*/true, - /*InstantiationDependent=*/true, - /*VariablyModified=*/false, PP) { + TypeDependence::DependentInstantiation | + (PP ? TypeDependence::UnexpandedPack : TypeDependence::None)) { CanTTPTInfo.Depth = D; CanTTPTInfo.Index = I; CanTTPTInfo.ParameterPack = PP; @@ -4775,10 +4702,7 @@ class SubstTemplateTypeParmType : public Type, public llvm::FoldingSetNode { const TemplateTypeParmType *Replaced; SubstTemplateTypeParmType(const TemplateTypeParmType *Param, QualType Canon) - : Type(SubstTemplateTypeParm, Canon, Canon->isDependentType(), - Canon->isInstantiationDependentType(), - Canon->isVariablyModifiedType(), - Canon->containsUnexpandedParameterPack()), + : Type(SubstTemplateTypeParm, Canon, Canon->getDependence()), Replaced(Param) {} public: @@ -4875,23 +4799,16 @@ class SubstTemplateTypeParmPackType : public Type, public llvm::FoldingSetNode { /// the latter case, it is also a dependent type. 
class DeducedType : public Type { protected: - DeducedType(TypeClass TC, QualType DeducedAsType, bool IsDependent, - bool IsInstantiationDependent, bool ContainsParameterPack) + DeducedType(TypeClass TC, QualType DeducedAsType, + TypeDependence ExtraDependence) : Type(TC, // FIXME: Retain the sugared deduced type? DeducedAsType.isNull() ? QualType(this, 0) : DeducedAsType.getCanonicalType(), - IsDependent, IsInstantiationDependent, - /*VariablyModified=*/false, ContainsParameterPack) { - if (!DeducedAsType.isNull()) { - if (DeducedAsType->isDependentType()) - setDependent(); - if (DeducedAsType->isInstantiationDependentType()) - setInstantiationDependent(); - if (DeducedAsType->containsUnexpandedParameterPack()) - setContainsUnexpandedParameterPack(); - } - } + ExtraDependence | (DeducedAsType.isNull() + ? TypeDependence::None + : DeducedAsType->getDependence() & + ~TypeDependence::VariablyModified)) {} public: bool isSugared() const { return !isCanonicalUnqualified(); } @@ -4920,7 +4837,7 @@ class alignas(8) AutoType : public DeducedType, public llvm::FoldingSetNode { ConceptDecl *TypeConstraintConcept; AutoType(QualType DeducedAsType, AutoTypeKeyword Keyword, - bool IsDeducedAsDependent, bool IsDeducedAsPack, ConceptDecl *CD, + TypeDependence ExtraDependence, ConceptDecl *CD, ArrayRef TypeConstraintArgs); const TemplateArgument *getArgBuffer() const { @@ -4991,9 +4908,10 @@ class DeducedTemplateSpecializationType : public DeducedType, QualType DeducedAsType, bool IsDeducedAsDependent) : DeducedType(DeducedTemplateSpecialization, DeducedAsType, - IsDeducedAsDependent || Template.isDependent(), - IsDeducedAsDependent || Template.isInstantiationDependent(), - Template.containsUnexpandedParameterPack()), + toTypeDependence(Template.getDependence()) | + (IsDeducedAsDependent + ? 
TypeDependence::DependentInstantiation + : TypeDependence::None)), Template(Template) {} public: @@ -5195,10 +5113,8 @@ class InjectedClassNameType : public Type { QualType InjectedType; InjectedClassNameType(CXXRecordDecl *D, QualType TST) - : Type(InjectedClassName, QualType(), /*Dependent=*/true, - /*InstantiationDependent=*/true, - /*VariablyModified=*/false, - /*ContainsUnexpandedParameterPack=*/false), + : Type(InjectedClassName, QualType(), + TypeDependence::DependentInstantiation), Decl(D), InjectedType(TST) { assert(isa(TST)); assert(!TST.hasQualifiers()); @@ -5277,11 +5193,8 @@ enum ElaboratedTypeKeyword { class TypeWithKeyword : public Type { protected: TypeWithKeyword(ElaboratedTypeKeyword Keyword, TypeClass tc, - QualType Canonical, bool Dependent, - bool InstantiationDependent, bool VariablyModified, - bool ContainsUnexpandedParameterPack) - : Type(tc, Canonical, Dependent, InstantiationDependent, VariablyModified, - ContainsUnexpandedParameterPack) { + QualType Canonical, TypeDependence Dependence) + : Type(tc, Canonical, Dependence) { TypeWithKeywordBits.Keyword = Keyword; } @@ -5345,10 +5258,7 @@ class ElaboratedType final ElaboratedType(ElaboratedTypeKeyword Keyword, NestedNameSpecifier *NNS, QualType NamedType, QualType CanonType, TagDecl *OwnedTagDecl) : TypeWithKeyword(Keyword, Elaborated, CanonType, - NamedType->isDependentType(), - NamedType->isInstantiationDependentType(), - NamedType->isVariablyModifiedType(), - NamedType->containsUnexpandedParameterPack()), + NamedType->getDependence()), NNS(NNS), NamedType(NamedType) { ElaboratedTypeBits.HasOwnedTagDecl = false; if (OwnedTagDecl) { @@ -5419,10 +5329,9 @@ class DependentNameType : public TypeWithKeyword, public llvm::FoldingSetNode { DependentNameType(ElaboratedTypeKeyword Keyword, NestedNameSpecifier *NNS, const IdentifierInfo *Name, QualType CanonType) - : TypeWithKeyword(Keyword, DependentName, CanonType, /*Dependent=*/true, - /*InstantiationDependent=*/true, - /*VariablyModified=*/false, - NNS->containsUnexpandedParameterPack()), + : TypeWithKeyword(Keyword, DependentName, CanonType, + TypeDependence::DependentInstantiation | + toTypeDependence(NNS->getDependence())), NNS(NNS), Name(Name) {} public: @@ -5559,10 +5468,9 @@ class PackExpansionType : public Type, public llvm::FoldingSetNode { PackExpansionType(QualType Pattern, QualType Canon, Optional NumExpansions) - : Type(PackExpansion, Canon, /*Dependent=*/Pattern->isDependentType(), - /*InstantiationDependent=*/true, - /*VariablyModified=*/Pattern->isVariablyModifiedType(), - /*ContainsUnexpandedParameterPack=*/false), + : Type(PackExpansion, Canon, + (Pattern->getDependence() | TypeDependence::Instantiation) & + ~TypeDependence::UnexpandedPack), Pattern(Pattern) { PackExpansionTypeBits.NumExpansions = NumExpansions ? 
*NumExpansions + 1 : 0; @@ -5781,8 +5689,8 @@ class ObjCObjectType : public Type, bool isKindOf); ObjCObjectType(enum Nonce_ObjCInterface) - : Type(ObjCInterface, QualType(), false, false, false, false), - BaseType(QualType(this_(), 0)) { + : Type(ObjCInterface, QualType(), TypeDependence::None), + BaseType(QualType(this_(), 0)) { ObjCObjectTypeBits.NumProtocols = 0; ObjCObjectTypeBits.NumTypeArgs = 0; ObjCObjectTypeBits.IsKindOf = 0; @@ -5997,11 +5905,7 @@ class ObjCObjectPointerType : public Type, public llvm::FoldingSetNode { QualType PointeeType; ObjCObjectPointerType(QualType Canonical, QualType Pointee) - : Type(ObjCObjectPointer, Canonical, - Pointee->isDependentType(), - Pointee->isInstantiationDependentType(), - Pointee->isVariablyModifiedType(), - Pointee->containsUnexpandedParameterPack()), + : Type(ObjCObjectPointer, Canonical, Pointee->getDependence()), PointeeType(Pointee) {} public: @@ -6171,11 +6075,7 @@ class AtomicType : public Type, public llvm::FoldingSetNode { QualType ValueType; AtomicType(QualType ValTy, QualType Canonical) - : Type(Atomic, Canonical, ValTy->isDependentType(), - ValTy->isInstantiationDependentType(), - ValTy->isVariablyModifiedType(), - ValTy->containsUnexpandedParameterPack()), - ValueType(ValTy) {} + : Type(Atomic, Canonical, ValTy->getDependence()), ValueType(ValTy) {} public: /// Gets the type contained by this atomic type, i.e. @@ -6206,10 +6106,7 @@ class PipeType : public Type, public llvm::FoldingSetNode { bool isRead; PipeType(QualType elemType, QualType CanonicalPtr, bool isRead) - : Type(Pipe, CanonicalPtr, elemType->isDependentType(), - elemType->isInstantiationDependentType(), - elemType->isVariablyModifiedType(), - elemType->containsUnexpandedParameterPack()), + : Type(Pipe, CanonicalPtr, elemType->getDependence()), ElementType(elemType), isRead(isRead) {} public: diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td index 3cf56e5a5629a..994f932170ae1 100644 --- a/clang/include/clang/AST/TypeProperties.td +++ b/clang/include/clang/AST/TypeProperties.td @@ -249,13 +249,17 @@ let Class = FunctionType in { def : Property<"noCfCheck", Bool> { let Read = [{ node->getExtInfo().getNoCfCheck() }]; } + def : Property<"cmseNSCall", Bool> { + let Read = [{ node->getExtInfo().getCmseNSCall() }]; + } } let Class = FunctionNoProtoType in { def : Creator<[{ auto extInfo = FunctionType::ExtInfo(noReturn, hasRegParm, regParm, callingConvention, producesResult, - noCallerSavedRegs, noCfCheck); + noCallerSavedRegs, noCfCheck, + cmseNSCall); return ctx.getFunctionNoProtoType(returnType, extInfo); }]>; } @@ -288,7 +292,8 @@ let Class = FunctionProtoType in { def : Creator<[{ auto extInfo = FunctionType::ExtInfo(noReturn, hasRegParm, regParm, callingConvention, producesResult, - noCallerSavedRegs, noCfCheck); + noCallerSavedRegs, noCfCheck, + cmseNSCall); FunctionProtoType::ExtProtoInfo epi; epi.ExtInfo = extInfo; epi.Variadic = variadic; @@ -453,7 +458,9 @@ let Class = TagType in { let Class = EnumType in { def : Creator<[{ QualType result = ctx.getEnumType(cast(declaration)); - const_cast(result.getTypePtr())->setDependent(dependent); + if (dependent) + const_cast(result.getTypePtr()) + ->addDependence(TypeDependence::DependentInstantiation); return result; }]>; } @@ -462,7 +469,9 @@ let Class = RecordType in { def : Creator<[{ auto record = cast(declaration); QualType result = ctx.getRecordType(record); - const_cast(result.getTypePtr())->setDependent(dependent); + if (dependent) + 
const_cast(result.getTypePtr()) + ->addDependence(TypeDependence::DependentInstantiation); return result; }]>; } @@ -605,7 +614,9 @@ let Class = TemplateSpecializationType in { templateArguments, *underlyingType); } - const_cast(result.getTypePtr())->setDependent(dependent); + if (dependent) + const_cast(result.getTypePtr()) + ->addDependence(TypeDependence::DependentInstantiation); return result; }]>; } diff --git a/clang/include/clang/Analysis/CallGraph.h b/clang/include/clang/Analysis/CallGraph.h index 0410503192392..6f7159330f5da 100644 --- a/clang/include/clang/Analysis/CallGraph.h +++ b/clang/include/clang/Analysis/CallGraph.h @@ -66,6 +66,11 @@ class CallGraph : public RecursiveASTVisitor { /// Determine if a declaration should be included in the graph. static bool includeInGraph(const Decl *D); + /// Determine if a declaration should be included in the graph for the + /// purposes of being a callee. This is similar to includeInGraph except + /// it permits declarations, not just definitions. + static bool includeCalleeInGraph(const Decl *D); + /// Lookup the node for the given declaration. CallGraphNode *getNode(const Decl *) const; diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 43f1432b1d26c..dd873ab2595c9 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -365,6 +365,8 @@ class TargetArch arches> : TargetSpec { let Arches = arches; } def TargetARM : TargetArch<["arm", "thumb", "armeb", "thumbeb"]>; +def TargetAArch64 : TargetArch<["aarch64"]>; +def TargetAnyArm : TargetArch; def TargetAVR : TargetArch<["avr"]>; def TargetBPF : TargetArch<["bpfel", "bpfeb"]>; def TargetMips32 : TargetArch<["mips", "mipsel"]>; @@ -517,6 +519,8 @@ class Attr { bit ASTNode = 1; // Set to true for attributes which have handler in Sema. bit SemaHandler = 1; + // Set to true if this attribute doesn't need custom handling in Sema. + bit SimpleHandler = 0; // Set to true for attributes that are completely ignored. 
bit Ignored = 0; // Set to true if the attribute's parsing does not match its semantic @@ -632,7 +636,7 @@ def Alias : Attr { let Documentation = [Undocumented]; } -def ArmBuiltinAlias : InheritableAttr, TargetSpecificAttr { +def ArmBuiltinAlias : InheritableAttr, TargetSpecificAttr { let Spellings = [Clang<"__clang_arm_builtin_alias">]; let Args = [IdentifierArgument<"BuiltinName">]; let Subjects = SubjectList<[Function], ErrorDiag>; @@ -688,6 +692,7 @@ def Artificial : InheritableAttr { let Spellings = [GCC<"artificial">]; let Subjects = SubjectList<[InlineFunction]>; let Documentation = [ArtificialDocs]; + let SimpleHandler = 1; } def XRayInstrument : InheritableAttr { @@ -699,6 +704,7 @@ def XRayInstrument : InheritableAttr { Accessor<"neverXRayInstrument", [Clang<"xray_never_instrument">]>]; let Documentation = [XRayDocs]; + let SimpleHandler = 1; } def XRayLogArgs : InheritableAttr { @@ -955,6 +961,7 @@ def OSConsumesThis : InheritableAttr { let Spellings = [Clang<"os_consumes_this">]; let Subjects = SubjectList<[NonStaticCXXMethod]>; let Documentation = [RetainBehaviorDocs]; + let SimpleHandler = 1; } def Cleanup : InheritableAttr { @@ -964,6 +971,19 @@ def Cleanup : InheritableAttr { let Documentation = [Undocumented]; } +def CmseNSEntry : InheritableAttr, TargetSpecificAttr { + let Spellings = [GNU<"cmse_nonsecure_entry">]; + let Subjects = SubjectList<[Function]>; + let LangOpts = [Cmse]; + let Documentation = [ArmCmseNSEntryDocs]; +} + +def CmseNSCall : TypeAttr, TargetSpecificAttr { + let Spellings = [GNU<"cmse_nonsecure_call">]; + let LangOpts = [Cmse]; + let Documentation = [ArmCmseNSCallDocs]; +} + def Cold : InheritableAttr { let Spellings = [GCC<"cold">]; let Subjects = SubjectList<[Function]>; @@ -979,6 +999,7 @@ def Common : InheritableAttr { def Const : InheritableAttr { let Spellings = [GCC<"const">, GCC<"__const">]; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def ConstInit : InheritableAttr { @@ -990,6 +1011,7 @@ def ConstInit : InheritableAttr { let Accessors = [Accessor<"isConstinit", [Keyword<"constinit">]>]; let Documentation = [ConstInitDocs]; let LangOpts = [CPlusPlus]; + let SimpleHandler = 1; } def Constructor : InheritableAttr { @@ -1149,6 +1171,7 @@ def SYCLIntelKernelArgsRestrict : InheritableAttr { let Subjects = SubjectList<[Function], ErrorDiag>; let LangOpts = [ SYCLIsDevice, SYCLIsHost ]; let Documentation = [ SYCLIntelKernelArgsRestrictDocs ]; + let SimpleHandler = 1; } def SYCLIntelNumSimdWorkItems : InheritableAttr { @@ -1200,6 +1223,7 @@ def CXX11NoReturn : InheritableAttr { let Spellings = [CXX11<"", "noreturn", 200809>]; let Subjects = SubjectList<[Function], ErrorDiag>; let Documentation = [CXX11NoReturnDocs]; + let SimpleHandler = 1; } // Similar to CUDA, OpenCL attributes do not receive a [[]] spelling because @@ -1208,6 +1232,7 @@ def OpenCLKernel : InheritableAttr { let Spellings = [Keyword<"__kernel">, Keyword<"kernel">]; let Subjects = SubjectList<[Function], ErrorDiag>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def OpenCLUnrollHint : InheritableAttr { @@ -1301,6 +1326,7 @@ def RenderScriptKernel : Attr { let Subjects = SubjectList<[Function]>; let Documentation = [RenderScriptKernelAttributeDocs]; let LangOpts = [RenderScript]; + let SimpleHandler = 1; } def Deprecated : InheritableAttr { @@ -1325,6 +1351,7 @@ def EmptyBases : InheritableAttr, TargetSpecificAttr { let Spellings = [Declspec<"empty_bases">]; let Subjects = SubjectList<[CXXRecord]>; let Documentation = [EmptyBasesDocs]; + let 
SimpleHandler = 1; } def AllocSize : InheritableAttr { @@ -1396,6 +1423,7 @@ def FlagEnum : InheritableAttr { let Spellings = [Clang<"flag_enum">]; let Subjects = SubjectList<[Enum]>; let Documentation = [FlagEnumDocs]; + let SimpleHandler = 1; } def EnumExtensibility : InheritableAttr { @@ -1410,6 +1438,7 @@ def Flatten : InheritableAttr { let Spellings = [GCC<"flatten">]; let Subjects = SubjectList<[Function], ErrorDiag>; let Documentation = [FlattenDocs]; + let SimpleHandler = 1; } def Format : InheritableAttr { @@ -1455,6 +1484,7 @@ def IBAction : InheritableAttr { // of the compiler. However, this node needs to exist in the AST because // external tools rely on it. let Documentation = [Undocumented]; + let SimpleHandler = 1; } def IBOutlet : InheritableAttr { @@ -1495,6 +1525,7 @@ def LifetimeBound : DeclOrTypeAttr { let Subjects = SubjectList<[ParmVar, ImplicitObjectParameter], ErrorDiag>; let Documentation = [LifetimeBoundDocs]; let LangOpts = [CPlusPlus]; + let SimpleHandler = 1; } def TrivialABI : InheritableAttr { @@ -1504,6 +1535,7 @@ def TrivialABI : InheritableAttr { let Subjects = SubjectList<[CXXRecord]>; let Documentation = [TrivialABIDocs]; let LangOpts = [CPlusPlus]; + let SimpleHandler = 1; } def MaxFieldAlignment : InheritableAttr { @@ -1518,6 +1550,7 @@ def MayAlias : InheritableAttr { // FIXME: this is a type attribute in GCC, but a declaration attribute here. let Spellings = [GCC<"may_alias">]; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def MIGServerRoutine : InheritableAttr { @@ -1955,12 +1988,14 @@ def NoUniqueAddress : InheritableAttr, TargetSpecificAttr { let Spellings = [CXX11<"", "no_unique_address", 201803>]; let Subjects = SubjectList<[NonBitField], ErrorDiag>; let Documentation = [NoUniqueAddressDocs]; + let SimpleHandler = 1; } def ReturnsTwice : InheritableAttr { let Spellings = [GCC<"returns_twice">]; let Subjects = SubjectList<[Function]>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def DisableTailCalls : InheritableAttr { @@ -1973,12 +2008,14 @@ def NoAlias : InheritableAttr { let Spellings = [Declspec<"noalias">]; let Subjects = SubjectList<[Function]>; let Documentation = [NoAliasDocs]; + let SimpleHandler = 1; } def NoCommon : InheritableAttr { let Spellings = [GCC<"nocommon">]; let Subjects = SubjectList<[Var]>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def NoDebug : InheritableAttr { @@ -1991,30 +2028,35 @@ def NoDuplicate : InheritableAttr { let Spellings = [Clang<"noduplicate">]; let Subjects = SubjectList<[Function]>; let Documentation = [NoDuplicateDocs]; + let SimpleHandler = 1; } def Convergent : InheritableAttr { let Spellings = [Clang<"convergent">]; let Subjects = SubjectList<[Function]>; let Documentation = [ConvergentDocs]; + let SimpleHandler = 1; } def NoInline : InheritableAttr { let Spellings = [GCC<"noinline">, Declspec<"noinline">]; let Subjects = SubjectList<[Function]>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def NoMips16 : InheritableAttr, TargetSpecificAttr { let Spellings = [GCC<"nomips16">]; let Subjects = SubjectList<[Function], ErrorDiag>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def NoMicroMips : InheritableAttr, TargetSpecificAttr { let Spellings = [GCC<"nomicromips">]; let Subjects = SubjectList<[Function], ErrorDiag>; let Documentation = [MicroMipsDocs]; + let SimpleHandler = 1; } def RISCVInterrupt : InheritableAttr, TargetSpecificAttr { @@ -2109,6 +2151,7 @@ def NoSplitStack : InheritableAttr { let Spellings = 
[GCC<"no_split_stack">]; let Subjects = SubjectList<[Function], ErrorDiag>; let Documentation = [NoSplitStackDocs]; + let SimpleHandler = 1; } def NonNull : InheritableParamAttr { @@ -2206,6 +2249,7 @@ def NoInstrumentFunction : InheritableAttr { let Spellings = [GCC<"no_instrument_function">]; let Subjects = SubjectList<[Function]>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def NotTailCalled : InheritableAttr { @@ -2218,6 +2262,7 @@ def NoStackProtector : InheritableAttr { let Spellings = [Clang<"no_stack_protector">]; let Subjects = SubjectList<[Function]>; let Documentation = [NoStackProtectorDocs]; + let SimpleHandler = 1; } def NoThrow : InheritableAttr { @@ -2280,6 +2325,7 @@ def NSConsumesSelf : InheritableAttr { let Spellings = [Clang<"ns_consumes_self">]; let Subjects = SubjectList<[ObjCMethod]>; let Documentation = [RetainBehaviorDocs]; + let SimpleHandler = 1; } def NSConsumed : InheritableParamAttr { @@ -2292,6 +2338,7 @@ def ObjCException : InheritableAttr { let Spellings = [Clang<"objc_exception">]; let Subjects = SubjectList<[ObjCInterface], ErrorDiag>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def ObjCMethodFamily : InheritableAttr { @@ -2336,6 +2383,7 @@ def ObjCRootClass : InheritableAttr { let Spellings = [Clang<"objc_root_class">]; let Subjects = SubjectList<[ObjCInterface], ErrorDiag>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def ObjCNonLazyClass : Attr { @@ -2343,12 +2391,14 @@ def ObjCNonLazyClass : Attr { let Subjects = SubjectList<[ObjCInterface, ObjCImpl], ErrorDiag>; let LangOpts = [ObjC]; let Documentation = [ObjCNonLazyClassDocs]; + let SimpleHandler = 1; } def ObjCSubclassingRestricted : InheritableAttr { let Spellings = [Clang<"objc_subclassing_restricted">]; let Subjects = SubjectList<[ObjCInterface], ErrorDiag>; let Documentation = [ObjCSubclassingRestrictedDocs]; + let SimpleHandler = 1; } def ObjCExplicitProtocolImpl : InheritableAttr { @@ -2388,6 +2438,7 @@ def ObjCRuntimeVisible : Attr { let Spellings = [Clang<"objc_runtime_visible">]; let Subjects = SubjectList<[ObjCInterface], ErrorDiag>; let Documentation = [ObjCRuntimeVisibleDocs]; + let SimpleHandler = 1; } def ObjCClassStub : Attr { @@ -2395,6 +2446,7 @@ def ObjCClassStub : Attr { let Subjects = SubjectList<[ObjCInterface], ErrorDiag>; let Documentation = [ObjCClassStubDocs]; let LangOpts = [ObjCNonFragileRuntime]; + let SimpleHandler = 1; } def ObjCBoxable : Attr { @@ -2413,6 +2465,7 @@ def Overloadable : Attr { let Spellings = [Clang<"overloadable">]; let Subjects = SubjectList<[Function], ErrorDiag>; let Documentation = [OverloadableDocs]; + let SimpleHandler = 1; } def Override : InheritableAttr { @@ -2470,6 +2523,7 @@ def AArch64VectorPcs: DeclOrTypeAttr { def Pure : InheritableAttr { let Spellings = [GCC<"pure">]; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def Regparm : TypeAttr { @@ -2796,6 +2850,7 @@ def ArcWeakrefUnavailable : InheritableAttr { let Spellings = [Clang<"objc_arc_weak_reference_unavailable">]; let Subjects = SubjectList<[ObjCInterface], ErrorDiag>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def ObjCGC : TypeAttr { @@ -2814,6 +2869,7 @@ def ObjCRequiresPropertyDefs : InheritableAttr { let Spellings = [Clang<"objc_requires_property_definitions">]; let Subjects = SubjectList<[ObjCInterface], ErrorDiag>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def Unused : InheritableAttr { @@ -2828,6 +2884,7 @@ def Used : InheritableAttr { let Spellings = [GCC<"used">]; 
let Subjects = SubjectList<[NonLocalVar, Function, ObjCMethod]>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def Uuid : InheritableAttr { @@ -2889,6 +2946,7 @@ def WarnUnused : InheritableAttr { let Spellings = [GCC<"warn_unused">]; let Subjects = SubjectList<[Record]>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def WarnUnusedResult : InheritableAttr { @@ -2911,6 +2969,7 @@ def Weak : InheritableAttr { let Spellings = [GCC<"weak">]; let Subjects = SubjectList<[Var, Function, CXXRecord]>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def WeakImport : InheritableAttr { @@ -2930,6 +2989,7 @@ def LTOVisibilityPublic : InheritableAttr { let Spellings = [Clang<"lto_visibility_public">]; let Subjects = SubjectList<[Record]>; let Documentation = [LTOVisibilityDocs]; + let SimpleHandler = 1; } def AnyX86Interrupt : InheritableAttr, TargetSpecificAttr { @@ -2946,6 +3006,7 @@ def AnyX86NoCallerSavedRegisters : InheritableAttr, TargetSpecificAttr { let Spellings = [GCC<"no_caller_saved_registers">]; let Documentation = [AnyX86NoCallerSavedRegistersDocs]; + let SimpleHandler = 1; } def AnyX86NoCfCheck : DeclOrTypeAttr, TargetSpecificAttr{ @@ -2997,6 +3058,7 @@ def CFICanonicalJumpTable : InheritableAttr { let Spellings = [Clang<"cfi_canonical_jump_table">]; let Subjects = SubjectList<[Function], ErrorDiag>; let Documentation = [CFICanonicalJumpTableDocs]; + let SimpleHandler = 1; } // C/C++ Thread safety attributes (e.g. for deadlock, data race checking) @@ -3009,6 +3071,7 @@ def GuardedVar : InheritableAttr { let Spellings = [Clang<"guarded_var", 0>]; let Subjects = SubjectList<[Field, SharedVar]>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def PtGuardedVar : InheritableAttr { @@ -3028,6 +3091,7 @@ def ScopedLockable : InheritableAttr { let Spellings = [Clang<"scoped_lockable", 0>]; let Subjects = SubjectList<[Record]>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def Capability : InheritableAttr { @@ -3124,6 +3188,7 @@ def NoThreadSafetyAnalysis : InheritableAttr { let Spellings = [Clang<"no_thread_safety_analysis">]; let Subjects = SubjectList<[Function]>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def GuardedBy : InheritableAttr { @@ -3260,6 +3325,7 @@ def ConsumableAutoCast : InheritableAttr { let Spellings = [Clang<"consumable_auto_cast_state", 0>]; let Subjects = SubjectList<[CXXRecord]>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def ConsumableSetOnRead : InheritableAttr { @@ -3269,6 +3335,7 @@ def ConsumableSetOnRead : InheritableAttr { let Spellings = [Clang<"consumable_set_state_on_read", 0>]; let Subjects = SubjectList<[CXXRecord]>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def CallableWhen : InheritableAttr { @@ -3375,6 +3442,7 @@ def MSNoVTable : InheritableAttr, TargetSpecificAttr { let Spellings = [Declspec<"novtable">]; let Subjects = SubjectList<[CXXRecord]>; let Documentation = [MSNoVTableDocs]; + let SimpleHandler = 1; } def : IgnoredAttr { @@ -3400,6 +3468,7 @@ def MSStruct : InheritableAttr { let Spellings = [GCC<"ms_struct">]; let Subjects = SubjectList<[Record]>; let Documentation = [Undocumented]; + let SimpleHandler = 1; } def DLLExport : InheritableAttr, TargetSpecificAttr { @@ -3448,6 +3517,7 @@ def DLLImportStaticLocal : InheritableAttr, TargetSpecificAttr { def SelectAny : InheritableAttr { let Spellings = [Declspec<"selectany">, GCC<"selectany">]; let Documentation = [SelectAnyDocs]; + let SimpleHandler = 1; } def 
Thread : Attr { @@ -3706,7 +3776,7 @@ def OMPDeclareVariant : InheritableAttr { OMPTraitInfoArgument<"TraitInfos">, ]; let AdditionalMembers = [{ - OMPTraitInfo &getTraitInfo() { return traitInfos; } + OMPTraitInfo &getTraitInfo() { return *traitInfos; } void printPrettyPragma(raw_ostream & OS, const PrintingPolicy &Policy) const; }]; @@ -3723,12 +3793,14 @@ def ExcludeFromExplicitInstantiation : InheritableAttr { let Subjects = SubjectList<[Var, Function, CXXRecord]>; let Documentation = [ExcludeFromExplicitInstantiationDocs]; let MeaningfulToClassTemplateDefinition = 1; + let SimpleHandler = 1; } def Reinitializes : InheritableAttr { let Spellings = [Clang<"reinitializes", 0>]; let Subjects = SubjectList<[NonStaticNonConstCXXMethod], ErrorDiag>; let Documentation = [ReinitializesDocs]; + let SimpleHandler = 1; } def NoDestroy : InheritableAttr { diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index b70d7d0fb80c3..e8cc79967f059 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -3812,8 +3812,8 @@ def OMPDeclareVariantDocs : Documentation { let Heading = "#pragma omp declare variant"; let Content = [{ The `declare variant` directive declares a specialized variant of a base - function and specifies the context in which that specialized variant is used. - The declare variant directive is a declarative directive. +function and specifies the context in which that specialized variant is used. +The declare variant directive is a declarative directive. The syntax of the `declare variant` construct is as follows: .. code-block:: none @@ -3830,7 +3830,7 @@ where clause is one of the following: match(context-selector-specification) and where `variant-func-id` is the name of a function variant that is either a - base language identifier or, for C++, a template-id. +base language identifier or, for C++, a template-id. }]; } @@ -4515,8 +4515,8 @@ ways: including calling the ``+initialize`` method if present. - The implicit ``_cmd`` parameter containing the method's selector is still defined. - In order to minimize code-size costs, the implementation will not emit a reference - to the selector if the parameter is unused within the method. + In order to minimize code-size costs, the implementation will not emit a reference + to the selector if the parameter is unused within the method. Symbols for direct method implementations are implicitly given hidden visibility, meaning that they can only be called within the same linkage unit. @@ -5072,7 +5072,7 @@ def LifetimeOwnerDocs : Documentation { The attribute ``[[gsl::Owner(T)]]`` applies to structs and classes that own an object of type ``T``: -.. code-block:: c++ +.. code:: class [[gsl::Owner(int)]] IntOwner { private: @@ -5098,7 +5098,7 @@ def LifetimePointerDocs : Documentation { The attribute ``[[gsl::Pointer(T)]]`` applies to structs and classes that behave like pointers to an object of type ``T``: -.. code-block:: c++ +.. code:: class [[gsl::Pointer(int)]] IntPointer { private: @@ -5157,7 +5157,7 @@ def NoBuiltinDocs : Documentation { let Category = DocCatFunction; let Content = [{ .. Note:: This attribute is not yet fully implemented, it is validated but has -no effect on the generated code. + no effect on the generated code. The ``__attribute__((no_builtin))`` is similar to the ``-fno-builtin`` flag except it is specific to the body of a function. The attribute may also be @@ -5292,3 +5292,28 @@ other than overloading. 
}]; } + +def ArmCmseNSCallDocs : Documentation { + let Category = DocCatType; + let Content = [{ +This attribute declares a non-secure function type. When compiling for secure +state, a call to such a function would switch from secure to non-secure state. +All non-secure function calls must happen only through a function pointer, and +a non-secure function type should only be used as a base type of a pointer. +See `ARMv8-M Security Extensions: Requirements on Development +Tools - Engineering Specification Documentation +`_ for more information. + }]; +} + +def ArmCmseNSEntryDocs : Documentation { + let Category = DocCatFunction; + let Content = [{ +This attribute declares a function that can be called from non-secure state, or +from secure state. Entering from and returning to non-secure state would switch +to and from secure state, respectively, and prevent flow of information +to non-secure state, except via return values. See `ARMv8-M Security Extensions: +Requirements on Development Tools - Engineering Specification Documentation +`_ for more information. + }]; +} diff --git a/clang/include/clang/Basic/BuiltinsWebAssembly.def b/clang/include/clang/Basic/BuiltinsWebAssembly.def index b544e3b42137b..5fb7a603fe17b 100644 --- a/clang/include/clang/Basic/BuiltinsWebAssembly.def +++ b/clang/include/clang/Basic/BuiltinsWebAssembly.def @@ -98,6 +98,10 @@ TARGET_BUILTIN(__builtin_wasm_sub_saturate_u_i8x16, "V16cV16cV16c", "nc", "simd1 TARGET_BUILTIN(__builtin_wasm_sub_saturate_s_i16x8, "V8sV8sV8s", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_sub_saturate_u_i16x8, "V8sV8sV8s", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_abs_i8x16, "V16cV16c", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_abs_i16x8, "V8sV8s", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_abs_i32x4, "V4iV4i", "nc", "simd128") + TARGET_BUILTIN(__builtin_wasm_min_s_i8x16, "V16cV16cV16c", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_min_u_i8x16, "V16cV16cV16c", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_max_s_i8x16, "V16cV16cV16c", "nc", "simd128") @@ -125,6 +129,10 @@ TARGET_BUILTIN(__builtin_wasm_all_true_i16x8, "iV8s", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_all_true_i32x4, "iV4i", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_all_true_i64x2, "iV2LLi", "nc", "unimplemented-simd128") +TARGET_BUILTIN(__builtin_wasm_bitmask_i8x16, "iV16c", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_bitmask_i16x8, "iV8s", "nc", "simd128") +TARGET_BUILTIN(__builtin_wasm_bitmask_i32x4, "iV4i", "nc", "simd128") + TARGET_BUILTIN(__builtin_wasm_abs_f32x4, "V4fV4f", "nc", "simd128") TARGET_BUILTIN(__builtin_wasm_abs_f64x2, "V2dV2d", "nc", "simd128") diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index 3c8b0eeb47a5c..5b59954fae7bb 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -63,7 +63,6 @@ CODEGENOPT(ExperimentalNewPassManager, 1, 0) ///< Enables the new, experimental CODEGENOPT(DebugPassManager, 1, 0) ///< Prints debug information for the new ///< pass manager. CODEGENOPT(DisableRedZone , 1, 0) ///< Set when -mno-red-zone is enabled. -CODEGENOPT(EnableDebugEntryValues, 1, 0) ///< Emit call site parameter dbg info CODEGENOPT(EmitCallSiteInfo, 1, 0) ///< Emit call site info only in the case of ///< '-g' + 'O>0' level. CODEGENOPT(IndirectTlsSegRefs, 1, 0) ///< Set when -mno-tls-direct-seg-refs @@ -307,6 +306,9 @@ CODEGENOPT(LTOVisibilityPublicStd, 1, 0) /// or 0 if unspecified. 
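To make the ArmCmseNSCallDocs and ArmCmseNSEntryDocs text above concrete, here is a minimal sketch of how the two attributes are typically spelled in a secure-state translation unit. It is illustrative only: it assumes an Armv8-M target built with -mcmse, follows the pointer-only rule stated for cmse_nonsecure_call, and wraps everything in extern "C", which appears to be required in C++ given the err_attribute_not_clinkage diagnostic added further down in this patch:

    extern "C" {

    // Entry point that non-secure code may call; the compiler emits the
    // secure/non-secure state switching sequence for it.
    int __attribute__((cmse_nonsecure_entry)) secure_add(int a, int b) {
      return a + b; // only the return value flows back to the non-secure caller
    }

    // Non-secure function type: per the documentation it is only used as the
    // base type of a pointer, and calls into non-secure code go through it.
    typedef void __attribute__((cmse_nonsecure_call)) ns_callback_t(int);

    void notify_nonsecure(ns_callback_t *cb) { cb(42); }

    } // extern "C"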
VALUE_CODEGENOPT(NumRegisterParameters, 32, 0) +/// The threshold to put data into small data section. +VALUE_CODEGENOPT(SmallDataLimit, 32, 0) + /// The lower bound for a buffer to be considered for stack protection. VALUE_CODEGENOPT(SSPBufferSize, 32, 0) diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 58f7c07fb18ad..47c05e4a4242d 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -397,6 +397,9 @@ def err_drv_ropi_incompatible_with_cxx : Error< def err_stack_tagging_requires_hardware_feature : Error< "'-fsanitize=memtag' requires hardware support (+memtag)">; +def err_cmse_pi_are_incompatible : Error< + "cmse is not compatible with %select{RWPI|ROPI}0">; + def warn_target_unsupported_nan2008 : Warning< "ignoring '-mnan=2008' option because the '%0' architecture does not support it">, InGroup; @@ -419,6 +422,9 @@ def warn_drv_unsupported_gpopt : Warning< "ignoring '-mgpopt' option as it cannot be used with %select{|the implicit" " usage of }0-mabicalls">, InGroup; +def warn_drv_unsupported_sdata : Warning< + "ignoring '-msmall-data-limit=' with -mcmodel=large for -fpic or RV64">, + InGroup; def warn_drv_unsupported_longcalls : Warning< "ignoring '-mlong-calls' option as it is not currently supported with " "%select{|the implicit usage of }0-mabicalls">, diff --git a/clang/include/clang/Basic/DiagnosticGroups.td b/clang/include/clang/Basic/DiagnosticGroups.td index 2ca557aac66d0..7c7bd7eae3740 100644 --- a/clang/include/clang/Basic/DiagnosticGroups.td +++ b/clang/include/clang/Basic/DiagnosticGroups.td @@ -1184,7 +1184,7 @@ The warning is issued if the number of pre-processor tokens exceeds the token limit, which can be set in three ways: 1. As a limit at a specific point in a file, using the ``clang max_tokens_here`` - pragma: + pragma: .. code-block: c++ #pragma clang max_tokens_here 1234 diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h index 3c5ea03010987..46f0fa423a391 100644 --- a/clang/include/clang/Basic/DiagnosticIDs.h +++ b/clang/include/clang/Basic/DiagnosticIDs.h @@ -28,7 +28,7 @@ namespace clang { // Size of each of the diagnostic categories. 
enum { DIAG_SIZE_COMMON = 300, - DIAG_SIZE_DRIVER = 210, + DIAG_SIZE_DRIVER = 250, DIAG_SIZE_FRONTEND = 150, DIAG_SIZE_SERIALIZATION = 120, DIAG_SIZE_LEX = 400, diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 0a1d6668ec293..469af4167f8aa 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -706,6 +706,8 @@ def err_id_after_template_in_nested_name_spec : Error< "expected template name after 'template' keyword in nested name specifier">; def err_unexpected_template_in_unqualified_id : Error< "'template' keyword not permitted here">; +def err_unexpected_template_in_destructor_name : Error< + "'template' keyword not permitted in destructor name">; def err_unexpected_template_after_using : Error< "'template' keyword not permitted after 'using' keyword">; def err_two_right_angle_brackets_need_space : Error< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 23bfd9eed4e30..570ea8ada9577 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -2497,7 +2497,7 @@ def err_invalid_consteval_take_address : Error< "cannot take address of consteval function %0 outside" " of an immediate invocation">; def err_invalid_consteval_call : Error< - "call to consteval function '%q0' is not a constant expression">; + "call to consteval function %q0 is not a constant expression">; def err_invalid_consteval_decl_kind : Error< "%0 cannot be declared consteval">; def err_invalid_constexpr : Error< @@ -2935,6 +2935,10 @@ def err_attribute_address_multiple_qualifiers : Error< def warn_attribute_address_multiple_identical_qualifiers : Warning< "multiple identical address spaces specified for type">, InGroup; +def err_attribute_not_clinkage : Error< + "function type with %0 attribute must have C linkage">; +def err_function_decl_cmse_ns_call : Error< + "functions may not be declared with 'cmse_nonsecure_call' attribute">; def err_attribute_address_function_type : Error< "function type may not be qualified with an address space">; def err_as_qualified_auto_decl : Error< @@ -3155,6 +3159,9 @@ def warn_attribute_weak_on_local : Warning< InGroup; def warn_weak_identifier_undeclared : Warning< "weak identifier %0 never declared">; +def warn_attribute_cmse_entry_static : Warning< + "'cmse_nonsecure_entry' cannot be applied to functions with internal linkage">, + InGroup; def err_attribute_weak_static : Error< "weak declaration cannot have internal linkage">; def err_attribute_selectany_non_extern_data : Error< @@ -4255,11 +4262,16 @@ def err_ovl_ambiguous_oper_binary : Error< "use of overloaded operator '%0' is ambiguous (with operand types %1 and %2)">; def ext_ovl_ambiguous_oper_binary_reversed : ExtWarn< "ISO C++20 considers use of overloaded operator '%0' (with operand types %1 " - "and %2) to be ambiguous despite there being a unique best viable function">, + "and %2) to be ambiguous despite there being a unique best viable function" + "%select{ with non-reversed arguments|}3">, InGroup>, SFINAEFailure; -def note_ovl_ambiguous_oper_binary_reversed_candidate : Note< +def note_ovl_ambiguous_oper_binary_reversed_self : Note< "ambiguity is between a regular call to this operator and a call with the " "argument order reversed">; +def note_ovl_ambiguous_oper_binary_selected_candidate : Note< + "candidate function with non-reversed arguments">; +def 
note_ovl_ambiguous_oper_binary_reversed_candidate : Note< + "ambiguous candidate function with reversed arguments">; def err_ovl_no_viable_oper : Error<"no viable overloaded '%0'">; def note_assign_lhs_incomplete : Note<"type %0 is incomplete">; def err_ovl_deleted_oper : Error< @@ -4273,6 +4285,10 @@ def err_ovl_deleted_comparison : Error< def err_ovl_rewrite_equalequal_not_bool : Error< "return type %0 of selected 'operator==' function for rewritten " "'%1' comparison is not 'bool'">; +def ext_ovl_rewrite_equalequal_not_bool : ExtWarn< + "ISO C++20 requires return type of selected 'operator==' function for " + "rewritten '%1' comparison to be 'bool', not %0">, + InGroup>, SFINAEFailure; def err_ovl_no_viable_subscript : Error<"no viable overloaded operator[] for type %0">; def err_ovl_no_oper : @@ -9864,15 +9880,20 @@ def err_omp_prohibited_region : Error< "%select{|; perhaps you forget to enclose 'omp %3' directive into a parallel region?|" "; perhaps you forget to enclose 'omp %3' directive into a for or a parallel for region with 'ordered' clause?|" "; perhaps you forget to enclose 'omp %3' directive into a target region?|" - "; perhaps you forget to enclose 'omp %3' directive into a teams region?}2">; + "; perhaps you forget to enclose 'omp %3' directive into a teams region?|" + "; perhaps you forget to enclose 'omp %3' directive into a for, simd, or for simd region?}2">; def err_omp_prohibited_region_simd : Error< - "OpenMP constructs may not be nested inside a simd region%select{| except for ordered simd, simd or atomic directive}0">; + "OpenMP constructs may not be nested inside a simd region%select{| except for ordered simd, simd, scan, or atomic directive}0">; def err_omp_prohibited_region_atomic : Error< "OpenMP constructs may not be nested inside an atomic region">; def err_omp_prohibited_region_critical_same_name : Error< "cannot nest 'critical' regions having the same name %0">; def note_omp_previous_critical_region : Note< "previous 'critical' region starts here">; +def err_omp_several_scan_directives_in_region : Error< + "exactly one 'scan' directive must appear in the loop body of an enclosing directive">; +def note_omp_previous_scan_directive : Note< + "previous 'scan' directive used here">; def err_omp_sections_not_compound_stmt : Error< "the statement for '#pragma omp sections' must be a compound statement">; def err_omp_parallel_sections_not_compound_stmt : Error< @@ -10057,7 +10078,7 @@ def warn_omp_nesting_simd : Warning< InGroup; def err_omp_orphaned_device_directive : Error< "orphaned 'omp %0' directives are prohibited" - "; perhaps you forget to enclose the directive into a %select{|||target |teams }1region?">; + "; perhaps you forget to enclose the directive into a %select{|||target |teams|for, simd, or for simd }1region?">; def err_omp_reduction_non_addressable_expression : Error< "expected addressable reduction item for the task-based directives">; def err_omp_reduction_with_nogroup : Error< @@ -10095,6 +10116,8 @@ def err_omp_depobj_expected : Error< "expected depobj expression">; def err_omp_depobj_single_clause_expected : Error< "exactly one of 'depend', 'destroy', or 'update' clauses is expected">; +def err_omp_scan_single_clause_expected : Error< + "exactly one of 'inclusive' or 'exclusive' clauses is expected">; def err_omp_expected_predefined_allocator : Error< "expected one of the predefined allocators for the variables with the static " "storage: 'omp_default_mem_alloc', 'omp_large_cap_mem_alloc', " diff --git 
a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index c3d6d0822580f..42a9230998f6c 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -148,6 +148,8 @@ LANGOPT(RelaxedTemplateTemplateArgs, 1, 0, "C++17 relaxed matching of template t LANGOPT(DoubleSquareBracketAttributes, 1, 0, "'[[]]' attributes extension for all language standard modes") +COMPATIBLE_LANGOPT(RecoveryAST, 1, CPlusPlus, "Preserve expressions in AST when encountering errors") + BENIGN_LANGOPT(ThreadsafeStatics , 1, 1, "thread-safe static initializers") LANGOPT(POSIXThreads , 1, 0, "POSIX thread support") LANGOPT(Blocks , 1, 0, "blocks extension to C") diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index 0488dad6706b5..bfb41ab105ea5 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -215,6 +215,12 @@ #ifndef OPENMP_DEVICE_MODIFIER #define OPENMP_DEVICE_MODIFIER(Name) #endif +#ifndef OPENMP_SCAN_CLAUSE +#define OPENMP_SCAN_CLAUSE(Name) +#endif +#ifndef OPENMP_REDUCTION_MODIFIER +#define OPENMP_REDUCTION_MODIFIER(Name) +#endif // OpenMP clauses. OPENMP_CLAUSE(allocator, OMPAllocatorClause) @@ -281,6 +287,12 @@ OPENMP_CLAUSE(order, OMPOrderClause) OPENMP_CLAUSE(depobj, OMPDepobjClause) OPENMP_CLAUSE(destroy, OMPDestroyClause) OPENMP_CLAUSE(detach, OMPDetachClause) +OPENMP_CLAUSE(inclusive, OMPInclusiveClause) +OPENMP_CLAUSE(exclusive, OMPExclusiveClause) + +// Clauses allowed for OpenMP directive 'scan'. +OPENMP_SCAN_CLAUSE(inclusive) +OPENMP_SCAN_CLAUSE(exclusive) // Clauses allowed for OpenMP directive 'parallel'. OPENMP_PARALLEL_CLAUSE(if) @@ -1098,6 +1110,11 @@ OPENMP_DEPOBJ_CLAUSE(depend) OPENMP_DEPOBJ_CLAUSE(destroy) OPENMP_DEPOBJ_CLAUSE(update) +// Modifiers for 'reduction' clause. +OPENMP_REDUCTION_MODIFIER(default) + +#undef OPENMP_REDUCTION_MODIFIER +#undef OPENMP_SCAN_CLAUSE #undef OPENMP_DEVICE_MODIFIER #undef OPENMP_DEPOBJ_CLAUSE #undef OPENMP_FLUSH_CLAUSE diff --git a/clang/include/clang/Basic/OpenMPKinds.h b/clang/include/clang/Basic/OpenMPKinds.h index 46eeffe999d91..b567f89b986e0 100644 --- a/clang/include/clang/Basic/OpenMPKinds.h +++ b/clang/include/clang/Basic/OpenMPKinds.h @@ -168,6 +168,13 @@ struct OpenMPScheduleTy final { OpenMPScheduleClauseModifier M2 = OMPC_SCHEDULE_MODIFIER_unknown; }; +/// OpenMP modifiers for 'reduction' clause. +enum OpenMPReductionClauseModifier { +#define OPENMP_REDUCTION_MODIFIER(Name) OMPC_REDUCTION_##Name, +#include "clang/Basic/OpenMPKinds.def" + OMPC_REDUCTION_unknown, +}; + OpenMPClauseKind getOpenMPClauseKind(llvm::StringRef Str); const char *getOpenMPClauseName(OpenMPClauseKind Kind); diff --git a/clang/include/clang/Basic/StmtNodes.td b/clang/include/clang/Basic/StmtNodes.td index 41c6dbdb42e95..478179d4131f4 100644 --- a/clang/include/clang/Basic/StmtNodes.td +++ b/clang/include/clang/Basic/StmtNodes.td @@ -195,6 +195,7 @@ def ConvertVectorExpr : StmtNode; def BlockExpr : StmtNode; def OpaqueValueExpr : StmtNode; def TypoExpr : StmtNode; +def RecoveryExpr : StmtNode; def BuiltinBitCastExpr : StmtNode; // Microsoft Extensions. 
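The clause definitions, diagnostics, and statement-node plumbing above introduce support for the OpenMP 5.0 'scan' directive with its 'inclusive' and 'exclusive' clauses. A minimal, hedged sketch of the user-facing syntax this targets is shown below; the 'inscan' reduction modifier comes from the OpenMP 5.0 specification and is not among the modifier enumerators added in this patch.

.. code-block:: c++

  // Hedged sketch of an OpenMP 5.0 inclusive scan (prefix sum). Exactly one
  // 'scan' directive may appear in the body of the enclosing loop, matching
  // the new err_omp_several_scan_directives_in_region diagnostic.
  void prefix_sum(int *a, int n) {
    int sum = 0;
  #pragma omp parallel for simd reduction(inscan, + : sum)
    for (int i = 0; i < n; ++i) {
      sum += a[i];                    // input (update) phase
  #pragma omp scan inclusive(sum)     // switch to the scan phase
      a[i] = sum;                     // uses the inclusive running total
    }
  }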
@@ -233,6 +234,7 @@ def OMPTaskwaitDirective : StmtNode; def OMPTaskgroupDirective : StmtNode; def OMPFlushDirective : StmtNode; def OMPDepobjDirective : StmtNode; +def OMPScanDirective : StmtNode; def OMPOrderedDirective : StmtNode; def OMPAtomicDirective : StmtNode; def OMPTargetDirective : StmtNode; diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index cdc1545c6cac5..ee6dd9ddd12f2 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -675,12 +675,12 @@ ALIAS("_declspec" , __declspec , KEYMS) ALIAS("_pascal" , __pascal , KEYBORLAND) // Clang Extensions. -KEYWORD(__builtin_convertvector , KEYALL) -ALIAS("__char16_t" , char16_t , KEYCXX) -ALIAS("__char32_t" , char32_t , KEYCXX) -KEYWORD(__builtin_bit_cast , KEYALL) -KEYWORD(__builtin_available , KEYALL) -KEYWORD(__unique_stable_name , KEYALL) +KEYWORD(__builtin_convertvector , KEYALL) +ALIAS("__char16_t" , char16_t , KEYCXX) +ALIAS("__char32_t" , char32_t , KEYCXX) +KEYWORD(__builtin_bit_cast , KEYALL) +KEYWORD(__builtin_available , KEYALL) +KEYWORD(__builtin_unique_stable_name, KEYALL) // Clang-specific keywords enabled only in testing. TESTING_KEYWORD(__unknown_anytype , KEYALL) diff --git a/clang/include/clang/Basic/arm_cde.td b/clang/include/clang/Basic/arm_cde.td index 222b63966a383..e258bf5ee83e5 100644 --- a/clang/include/clang/Basic/arm_cde.td +++ b/clang/include/clang/Basic/arm_cde.td @@ -13,6 +13,16 @@ include "arm_mve_defs.td" +// f64 is not defined in arm_mve_defs.td because MVE instructions only work with +// f16 and f32 +def f64: PrimitiveType<"f", 64>; + +// Float expects t to be a scalar type, and expands to the floating-point +// type of the same width. +class Float: ComplexType<(CTO_CopyKind t, f32)>; +def FScalar: Float; + +// ACLE CDE intrinsic class CDEIntrinsic : Intrinsic { let builtinExtension = "cde"; @@ -27,6 +37,13 @@ class CDEImmediateBits : Immediate>; class CDEIRInt params = [], bit appendKind = 0> : IRIntBase<"arm_cde_" # name, params, appendKind>; +// Class for generating function macros in arm_cde.h: +// "#define () " +class FunctionMacro params_, string definition_> { + list params = params_; + string definition = definition_; +} + // Coprocessor immediate def imm_coproc : Immediate>; @@ -40,6 +57,142 @@ def imm_11b : CDEImmediateBits<11>; def imm_12b : CDEImmediateBits<12>; def imm_13b : CDEImmediateBits<13>; -let pnt = PNT_None, params = T.None in -def cx1 : CDEIntrinsic $cp, $imm)>; +// CX* instructions operating on GPRs +multiclass CDE_CX_m { + defvar cp = (args imm_coproc:$cp); + let pnt = PNT_None, params = T.None in { + def "" : CDEIntrinsic $cp), cgArgs, (? $imm))>; + def a : CDEIntrinsic $cp, $acc), + cgArgs, (? $imm))>; + + def d : + CDEIntrinsic $cp), cgArgs, (? $imm)):$pair, + (or (shl (u64 (xval $pair, 1)), (u64 32)), + (u64 (xval $pair, 0))))>; + def da : + CDEIntrinsic $cp, $acc_lo, $acc_hi), cgArgs, + (? $imm)):$pair, + (or (shl (u64 (xval $pair, 1)), (u64 32)), + (u64 (xval $pair, 0))))>; + } +} + +defm cx1 : CDE_CX_m<(args imm_13b:$imm), (args), (?)>; +defm cx2 : CDE_CX_m<(args imm_9b:$imm), (args u32:$n), (? $n)>; +defm cx3 : CDE_CX_m<(args imm_6b:$imm), (args u32:$n, u32:$m), (? $n, $m)>; + +// VCX* instructions operating on VFP registers +multiclass CDE_VCXFP_m { + defvar cp = (args imm_coproc:$cp); + let pnt = PNT_None, params = [u32] in { + def "" : CDEIntrinsic $cp), cgArgs, (? $imm)), + Scalar)>; + def a : CDEIntrinsic $cp, + (bitcast $acc, FScalar)), cgArgs, (? 
$imm)), Scalar)>; + } + let pnt = PNT_None, params = [u64] in { + def d : CDEIntrinsic $cp), cgArgs, (? $imm)), + Scalar)>; + def da : CDEIntrinsic $cp, + (bitcast $acc, FScalar)), cgArgs, (? $imm)), Scalar)>; + } +} + +defm vcx1: CDE_VCXFP_m<(args imm_11b:$imm), (args), (args), (?)>; +defm vcx2: CDE_VCXFP_m<(args imm_6b:$imm), (args u32:$n), (args u64:$n), + (? (bitcast $n, FScalar))>; +defm vcx3: CDE_VCXFP_m<(args imm_3b:$imm), + (args u32:$n, u32:$m), (args u64:$n, u64:$m), + (? (bitcast $n, FScalar), (bitcast $m, FScalar))>; + +// VCX* instructions operating on Q vector registers + +def v16u8 : VecOf; + +let pnt = PNT_None, params = [u8] in +def vcx1q : CDEIntrinsic $cp, $imm)>; + +let pnt = PNT_Type, params = T.All, polymorphicOnly = 1 in { + def vcx1qa : + CDEIntrinsic $cp, (bitcast $acc, v16u8), $imm), + Vector)>; + + def vcx2q : + CDEIntrinsic $cp, (bitcast $n, VecOf), $imm), + Vector)>; + def vcx2q_u8 : + CDEIntrinsic $cp, (bitcast $n, VecOf), $imm)>; + + def vcx2qa_impl : + CDEIntrinsic $cp, (bitcast $acc, v16u8), $n, $imm), + Vector)>; + + def vcx3q_impl : + CDEIntrinsic $cp, (bitcast $n, v16u8), $m, $imm), + Vector)>; + def vcx3q_u8_impl : + CDEIntrinsic $cp, (bitcast $n, v16u8), $m, $imm)>; + def vcx3qa_impl : + CDEIntrinsic $cp, (bitcast $acc, v16u8), $n, $m, + $imm), + Vector)>; +} + +// Reinterpret intrinsics required to implement __arm_vcx*q with 2 or 3 +// polymorphic paramters. +let params = [/* no u8 */ s8, u16, s16, u32, s32, u64, s64, f16, f32], + headerOnly = 1, polymorphicOnly = 1 in +def vreinterpretq_u8 : + Intrinsic; + +// We need vreinterpretq_u8_u8 to avoid doing smart tricks in the macros +let params = [u8], polymorphicOnly = 1 in +def vreinterpretq_u8_cde : + CDEIntrinsic, + NameOverride<"vreinterpretq_u8">; + + +def vcx2qa : FunctionMacro< + ["cp", "acc", "n", "imm"], + "__arm_vcx2qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), (imm))">; + +def vcx3q : FunctionMacro< + ["cp", "n", "m", "imm"], + "__arm_vcx3q_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))">; +def vcx3q_u8 : FunctionMacro< + ["cp", "n", "m", "imm"], + "__arm_vcx3q_u8_impl((cp), (n), __arm_vreinterpretq_u8(m), (imm))">; +def vcx3qa : FunctionMacro< + ["cp", "acc", "n", "m", "imm"], + "__arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), " + "__arm_vreinterpretq_u8(m), (imm))">; + +// vreinterpretq intrinsics required by the ACLE CDE specification + +foreach desttype = [/* no u8 */ s8, u16, s16, u32, s32, u64, s64, f16, f32] in { + let params = [u8], headerOnly = 1, pnt = PNT_None in + def "vreinterpretq_" # desttype : Intrinsic< + VecOf, (args Vector:$x), (vreinterpret $x, VecOf)>; +} diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td index 45e45899de5f0..25daae2a0a25d 100644 --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -536,11 +536,42 @@ let params = T.Float in { (IRInt<"vmaxnma_predicated", [Vector,Predicate]> $a, $b, $pred)>; } +multiclass Reduction basetypes, + bit needSign = 0, + dag postCG = (seq (id $ret)), + dag accArg = (args Accumulator:$prev), + dag preCG = (seq)> { + defvar intArgsBase = (? $prev, $vec); + defvar intArgsUnpred = !con(intArgsBase, + !if(needSign, (? (unsignedflag Scalar)), (?))); + defvar intArgsPred = !con(intArgsUnpred, (? 
$pred)); + defvar intUnpred = !setop(intArgsUnpred, IRInt); + defvar intPred = !setop(intArgsPred, IRInt< + basename#"_predicated", !listconcat(basetypes, [Predicate])>); + + def "": Intrinsic< + Accumulator, !con(accArg, (args Vector:$vec)), + !con(preCG, (seq intUnpred:$ret), postCG)>; + def _p: Intrinsic< + Accumulator, !con(accArg, (args Vector:$vec, Predicate:$pred)), + !con(preCG, (seq intPred:$ret), postCG)>; +} + let params = T.Int in { -def vminvq: Intrinsic $prev, $vec))>; -def vmaxvq: Intrinsic $prev, $vec))>; +defm vminvq: Reduction; +defm vmaxvq: Reduction; +} + +let params = T.Signed in { +defm vminavq: Reduction; +defm vmaxavq: Reduction; +} + +let params = T.Float in { +defm vminnmvq: Reduction; +defm vmaxnmvq: Reduction; +defm vminnmavq: Reduction; +defm vmaxnmavq: Reduction; } foreach half = [ "b", "t" ] in { @@ -1414,6 +1445,33 @@ multiclass MVEBinaryVectorHoriz64R { "vrmlldavha">; } +multiclass VADDV { + defvar accArg = !if(acc, (args Scalar:$acc), (args)); + defvar predArg = !if(pred, (args Predicate:$pred), (args)); + defvar intrinsic = !if(pred, + IRInt, + IRInt); + defvar intCG = !con((intrinsic $v, (unsignedflag Scalar)), + !if(pred, (? $pred), (?))); + defvar accCG = !if(acc, (add intCG, $acc), intCG); + + def "": Intrinsic; +} + +let params = T.Int in { +defm vaddvq : VADDV<0, 0, "addv", Scalar32>; +defm vaddvaq : VADDV<1, 0, "addv", Scalar32>; +defm vaddvq_p : VADDV<0, 1, "addv", Scalar32>; +defm vaddvaq_p : VADDV<1, 1, "addv", Scalar32>; +} + +let params = [s32, u32] in { +defm vaddlvq : VADDV<0, 0, "addlv", Scalar64>; +defm vaddlvaq : VADDV<1, 0, "addlv", Scalar64>; +defm vaddlvq_p : VADDV<0, 1, "addlv", Scalar64>; +defm vaddlvaq_p : VADDV<1, 1, "addlv", Scalar64>; +} + let params = T.Int in { def vabavq : Intrinsic (unsignedflag Scalar), $a, $b, $c)>; diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 380a2a0a5fa68..f949edc378fcb 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -51,39 +51,39 @@ def OP_FMLA_N : Op<(call "vfma", $p0, $p1, (dup $p2))>; def OP_FMLS_N : Op<(call "vfma", $p0, (op "-", $p1), (dup $p2))>; def OP_MLAL_N : Op<(op "+", $p0, (call "vmull", $p1, (dup $p2)))>; def OP_MLSL_N : Op<(op "-", $p0, (call "vmull", $p1, (dup $p2)))>; -def OP_MUL_LN : Op<(op "*", $p0, (splat $p1, $p2))>; -def OP_MULX_LN : Op<(call "vmulx", $p0, (splat $p1, $p2))>; +def OP_MUL_LN : Op<(op "*", $p0, (call_mangled "splat_lane", $p1, $p2))>; +def OP_MULX_LN : Op<(call "vmulx", $p0, (call_mangled "splat_lane", $p1, $p2))>; def OP_MULL_N : Op<(call "vmull", $p0, (dup $p1))>; -def OP_MULL_LN : Op<(call "vmull", $p0, (splat $p1, $p2))>; -def OP_MULLHi_LN: Op<(call "vmull", (call "vget_high", $p0), (splat $p1, $p2))>; -def OP_MLA_LN : Op<(op "+", $p0, (op "*", $p1, (splat $p2, $p3)))>; -def OP_MLS_LN : Op<(op "-", $p0, (op "*", $p1, (splat $p2, $p3)))>; -def OP_MLAL_LN : Op<(op "+", $p0, (call "vmull", $p1, (splat $p2, $p3)))>; +def OP_MULL_LN : Op<(call "vmull", $p0, (call_mangled "splat_lane", $p1, $p2))>; +def OP_MULLHi_LN: Op<(call "vmull", (call "vget_high", $p0), (call_mangled "splat_lane", $p1, $p2))>; +def OP_MLA_LN : Op<(op "+", $p0, (op "*", $p1, (call_mangled "splat_lane", $p2, $p3)))>; +def OP_MLS_LN : Op<(op "-", $p0, (op "*", $p1, (call_mangled "splat_lane", $p2, $p3)))>; +def OP_MLAL_LN : Op<(op "+", $p0, (call "vmull", $p1, (call_mangled "splat_lane", $p2, $p3)))>; def OP_MLALHi_LN: Op<(op "+", $p0, (call "vmull", (call "vget_high", $p1), - (splat $p2, $p3)))>; 
-def OP_MLSL_LN : Op<(op "-", $p0, (call "vmull", $p1, (splat $p2, $p3)))>; + (call_mangled "splat_lane", $p2, $p3)))>; +def OP_MLSL_LN : Op<(op "-", $p0, (call "vmull", $p1, (call_mangled "splat_lane", $p2, $p3)))>; def OP_MLSLHi_LN : Op<(op "-", $p0, (call "vmull", (call "vget_high", $p1), - (splat $p2, $p3)))>; + (call_mangled "splat_lane", $p2, $p3)))>; def OP_QDMULL_N : Op<(call "vqdmull", $p0, (dup $p1))>; -def OP_QDMULL_LN : Op<(call "vqdmull", $p0, (splat $p1, $p2))>; +def OP_QDMULL_LN : Op<(call "vqdmull", $p0, (call_mangled "splat_lane", $p1, $p2))>; def OP_QDMULLHi_LN : Op<(call "vqdmull", (call "vget_high", $p0), - (splat $p1, $p2))>; + (call_mangled "splat_lane", $p1, $p2))>; def OP_QDMLAL_N : Op<(call "vqdmlal", $p0, $p1, (dup $p2))>; -def OP_QDMLAL_LN : Op<(call "vqdmlal", $p0, $p1, (splat $p2, $p3))>; +def OP_QDMLAL_LN : Op<(call "vqdmlal", $p0, $p1, (call_mangled "splat_lane", $p2, $p3))>; def OP_QDMLALHi_LN : Op<(call "vqdmlal", $p0, (call "vget_high", $p1), - (splat $p2, $p3))>; + (call_mangled "splat_lane", $p2, $p3))>; def OP_QDMLSL_N : Op<(call "vqdmlsl", $p0, $p1, (dup $p2))>; -def OP_QDMLSL_LN : Op<(call "vqdmlsl", $p0, $p1, (splat $p2, $p3))>; +def OP_QDMLSL_LN : Op<(call "vqdmlsl", $p0, $p1, (call_mangled "splat_lane", $p2, $p3))>; def OP_QDMLSLHi_LN : Op<(call "vqdmlsl", $p0, (call "vget_high", $p1), - (splat $p2, $p3))>; + (call_mangled "splat_lane", $p2, $p3))>; def OP_QDMULH_N : Op<(call "vqdmulh", $p0, (dup $p1))>; -def OP_QDMULH_LN : Op<(call "vqdmulh", $p0, (splat $p1, $p2))>; -def OP_QRDMULH_LN : Op<(call "vqrdmulh", $p0, (splat $p1, $p2))>; +def OP_QDMULH_LN : Op<(call "vqdmulh", $p0, (call_mangled "splat_lane", $p1, $p2))>; +def OP_QRDMULH_LN : Op<(call "vqrdmulh", $p0, (call_mangled "splat_lane", $p1, $p2))>; def OP_QRDMULH_N : Op<(call "vqrdmulh", $p0, (dup $p1))>; def OP_QRDMLAH : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, $p2))>; def OP_QRDMLSH : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, $p2))>; -def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>; -def OP_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, (splat $p2, $p3)))>; +def OP_QRDMLAH_LN : Op<(call "vqadd", $p0, (call "vqrdmulh", $p1, (call_mangled "splat_lane", $p2, $p3)))>; +def OP_QRDMLSH_LN : Op<(call "vqsub", $p0, (call "vqrdmulh", $p1, (call_mangled "splat_lane", $p2, $p3)))>; def OP_FMS_LN : Op<(call "vfma_lane", $p0, (op "-", $p1), $p2, $p3)>; def OP_FMS_LNQ : Op<(call "vfma_laneq", $p0, (op "-", $p1), $p2, $p3)>; def OP_TRN1 : Op<(shuffle $p0, $p1, (interleave (decimate mask0, 2), @@ -115,7 +115,7 @@ def OP_HI : Op<(shuffle $p0, $p0, (highhalf mask0))>; def OP_LO : Op<(shuffle $p0, $p0, (lowhalf mask0))>; def OP_CONC : Op<(shuffle $p0, $p1, (add mask0, mask1))>; def OP_DUP : Op<(dup $p0)>; -def OP_DUP_LN : Op<(splat $p0, $p1)>; +def OP_DUP_LN : Op<(call_mangled "splat_lane", $p0, $p1)>; def OP_SEL : Op<(cast "R", (op "|", (op "&", $p0, (cast $p0, $p1)), (op "&", (op "~", $p0), (cast $p0, $p2))))>; @@ -207,10 +207,10 @@ def OP_SCALAR_HALF_SET_LNQ : Op<(bitcast "float16x8_t", def OP_DOT_LN : Op<(call "vdot", $p0, $p1, - (bitcast $p1, (splat(bitcast "uint32x2_t", $p2), $p3)))>; + (bitcast $p1, (call_mangled "splat_lane", (bitcast "32", $p2), $p3)))>; def OP_DOT_LNQ : Op<(call "vdot", $p0, $p1, - (bitcast $p1, (splat(bitcast "uint32x4_t", $p2), $p3)))>; + (bitcast $p1, (call_mangled "splat_lane", (bitcast "32", $p2), $p3)))>; def OP_FMLAL_LN : Op<(call "vfmlal_low", $p0, $p1, (dup_typed $p1, (call "vget_lane", $p2, $p3)))>; @@ -222,7 
+222,19 @@ def OP_FMLSL_LN_Hi : Op<(call "vfmlsl_high", $p0, $p1, (dup_typed $p1, (call "vget_lane", $p2, $p3)))>; //===----------------------------------------------------------------------===// -// Instructions +// Auxiliary Instructions +//===----------------------------------------------------------------------===// + +// Splat operation - performs a range-checked splat over a vector +def SPLAT : WInst<"splat_lane", ".(!q)I", + "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl">; +def SPLATQ : WInst<"splat_laneq", ".(!Q)I", + "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl"> { + let isLaneQ = 1; +} + +//===----------------------------------------------------------------------===// +// Intrinsics //===----------------------------------------------------------------------===// //////////////////////////////////////////////////////////////////////////////// @@ -535,8 +547,8 @@ def VQDMULH_LANE : SOpInst<"vqdmulh_lane", "..qI", "siQsQi", OP_QDMULH_LN>; def VQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "..qI", "siQsQi", OP_QRDMULH_LN>; } let ArchGuard = "defined(__aarch64__)" in { -def A64_VQDMULH_LANE : SInst<"vqdmulh_lane", "..qI", "siQsQi">; -def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..qI", "siQsQi">; +def A64_VQDMULH_LANE : SInst<"vqdmulh_lane", "..(!q)I", "siQsQi">; +def A64_VQRDMULH_LANE : SInst<"vqrdmulh_lane", "..(!q)I", "siQsQi">; } let ArchGuard = "defined(__ARM_FEATURE_QRDMX)" in { @@ -881,16 +893,22 @@ def COPY_LANE : IOpInst<"vcopy_lane", "..I.I", def COPYQ_LANE : IOpInst<"vcopy_lane", "..IqI", "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>; def COPY_LANEQ : IOpInst<"vcopy_laneq", "..IQI", - "csilPcPsPlUcUsUiUlfd", OP_COPY_LN>; + "csilPcPsPlUcUsUiUlfd", OP_COPY_LN> { + let isLaneQ = 1; +} def COPYQ_LANEQ : IOpInst<"vcopy_laneq", "..I.I", - "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>; + "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN> { + let isLaneQ = 1; +} //////////////////////////////////////////////////////////////////////////////// // Set all lanes to same value def VDUP_LANE1: WOpInst<"vdup_lane", ".qI", "hdQhQdPlQPl", OP_DUP_LN>; def VDUP_LANE2: WOpInst<"vdup_laneq", ".QI", "csilUcUsUiUlPcPshfdQcQsQiQlQPcQPsQUcQUsQUiQUlQhQfQdPlQPl", - OP_DUP_LN>; + OP_DUP_LN> { + let isLaneQ = 1; +} def DUP_N : WOpInst<"vdup_n", ".1", "dQdPlQPl", OP_DUP>; def MOV_N : WOpInst<"vmov_n", ".1", "dQdPlQPl", OP_DUP>; @@ -906,38 +924,60 @@ def CREATE : NoTestOpInst<"vcreate", ".(IU>)", "dPl", OP_CAST> { //////////////////////////////////////////////////////////////////////////////// def VMLA_LANEQ : IOpInst<"vmla_laneq", "...QI", - "siUsUifQsQiQUsQUiQf", OP_MLA_LN>; + "siUsUifQsQiQUsQUiQf", OP_MLA_LN> { + let isLaneQ = 1; +} def VMLS_LANEQ : IOpInst<"vmls_laneq", "...QI", - "siUsUifQsQiQUsQUiQf", OP_MLS_LN>; + "siUsUifQsQiQUsQUiQf", OP_MLS_LN> { + let isLaneQ = 1; +} def VFMA_LANE : IInst<"vfma_lane", "...qI", "fdQfQd">; def VFMA_LANEQ : IInst<"vfma_laneq", "...QI", "fdQfQd"> { let isLaneQ = 1; } def VFMS_LANE : IOpInst<"vfms_lane", "...qI", "fdQfQd", OP_FMS_LN>; -def VFMS_LANEQ : IOpInst<"vfms_laneq", "...QI", "fdQfQd", OP_FMS_LNQ>; +def VFMS_LANEQ : IOpInst<"vfms_laneq", "...QI", "fdQfQd", OP_FMS_LNQ> { + let isLaneQ = 1; +} -def VMLAL_LANEQ : SOpInst<"vmlal_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLAL_LN>; +def VMLAL_LANEQ : SOpInst<"vmlal_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLAL_LN> { + let isLaneQ = 1; +} def VMLAL_HIGH_LANE : SOpInst<"vmlal_high_lane", "(>Q)(>Q)Q.I", "siUsUi", OP_MLALHi_LN>; def VMLAL_HIGH_LANEQ : SOpInst<"vmlal_high_laneq", "(>Q)(>Q)QQI", 
"siUsUi", - OP_MLALHi_LN>; -def VMLSL_LANEQ : SOpInst<"vmlsl_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLSL_LN>; + OP_MLALHi_LN> { + let isLaneQ = 1; +} +def VMLSL_LANEQ : SOpInst<"vmlsl_laneq", "(>Q)(>Q).QI", "siUsUi", OP_MLSL_LN> { + let isLaneQ = 1; +} def VMLSL_HIGH_LANE : SOpInst<"vmlsl_high_lane", "(>Q)(>Q)Q.I", "siUsUi", OP_MLSLHi_LN>; def VMLSL_HIGH_LANEQ : SOpInst<"vmlsl_high_laneq", "(>Q)(>Q)QQI", "siUsUi", - OP_MLSLHi_LN>; + OP_MLSLHi_LN> { + let isLaneQ = 1; +} -def VQDMLAL_LANEQ : SOpInst<"vqdmlal_laneq", "(>Q)(>Q).QI", "si", OP_QDMLAL_LN>; +def VQDMLAL_LANEQ : SOpInst<"vqdmlal_laneq", "(>Q)(>Q).QI", "si", OP_QDMLAL_LN> { + let isLaneQ = 1; +} def VQDMLAL_HIGH_LANE : SOpInst<"vqdmlal_high_lane", "(>Q)(>Q)Q.I", "si", OP_QDMLALHi_LN>; def VQDMLAL_HIGH_LANEQ : SOpInst<"vqdmlal_high_laneq", "(>Q)(>Q)QQI", "si", - OP_QDMLALHi_LN>; -def VQDMLSL_LANEQ : SOpInst<"vqdmlsl_laneq", "(>Q)(>Q).QI", "si", OP_QDMLSL_LN>; + OP_QDMLALHi_LN> { + let isLaneQ = 1; +} +def VQDMLSL_LANEQ : SOpInst<"vqdmlsl_laneq", "(>Q)(>Q).QI", "si", OP_QDMLSL_LN> { + let isLaneQ = 1; +} def VQDMLSL_HIGH_LANE : SOpInst<"vqdmlsl_high_lane", "(>Q)(>Q)Q.I", "si", OP_QDMLSLHi_LN>; def VQDMLSL_HIGH_LANEQ : SOpInst<"vqdmlsl_high_laneq", "(>Q)(>Q)QQI", "si", - OP_QDMLSLHi_LN>; + OP_QDMLSLHi_LN> { + let isLaneQ = 1; +} // Newly add double parameter for vmul_lane in aarch64 // Note: d type is handled by SCALAR_VMUL_LANE @@ -945,32 +985,48 @@ def VMUL_LANE_A64 : IOpInst<"vmul_lane", "..qI", "Qd", OP_MUL_LN>; // Note: d type is handled by SCALAR_VMUL_LANEQ def VMUL_LANEQ : IOpInst<"vmul_laneq", "..QI", - "sifUsUiQsQiQUsQUiQfQd", OP_MUL_LN>; -def VMULL_LANEQ : SOpInst<"vmull_laneq", "(>Q).QI", "siUsUi", OP_MULL_LN>; + "sifUsUiQsQiQUsQUiQfQd", OP_MUL_LN> { + let isLaneQ = 1; +} +def VMULL_LANEQ : SOpInst<"vmull_laneq", "(>Q).QI", "siUsUi", OP_MULL_LN> { + let isLaneQ = 1; +} def VMULL_HIGH_LANE : SOpInst<"vmull_high_lane", "(>Q)Q.I", "siUsUi", OP_MULLHi_LN>; def VMULL_HIGH_LANEQ : SOpInst<"vmull_high_laneq", "(>Q)QQI", "siUsUi", - OP_MULLHi_LN>; + OP_MULLHi_LN> { + let isLaneQ = 1; +} -def VQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(>Q).QI", "si", OP_QDMULL_LN>; +def VQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(>Q).QI", "si", OP_QDMULL_LN> { + let isLaneQ = 1; +} def VQDMULL_HIGH_LANE : SOpInst<"vqdmull_high_lane", "(>Q)Q.I", "si", OP_QDMULLHi_LN>; def VQDMULL_HIGH_LANEQ : SOpInst<"vqdmull_high_laneq", "(>Q)QQI", "si", - OP_QDMULLHi_LN>; + OP_QDMULLHi_LN> { + let isLaneQ = 1; +} let isLaneQ = 1 in { def VQDMULH_LANEQ : SInst<"vqdmulh_laneq", "..QI", "siQsQi">; def VQRDMULH_LANEQ : SInst<"vqrdmulh_laneq", "..QI", "siQsQi">; } let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && defined(__aarch64__)" in { -def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN>; -def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN>; +def VQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "...QI", "siQsQi", OP_QRDMLAH_LN> { + let isLaneQ = 1; +} +def VQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "...QI", "siQsQi", OP_QRDMLSH_LN> { + let isLaneQ = 1; +} } // Note: d type implemented by SCALAR_VMULX_LANE def VMULX_LANE : IOpInst<"vmulx_lane", "..qI", "fQfQd", OP_MULX_LN>; // Note: d type is implemented by SCALAR_VMULX_LANEQ -def VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "fQfQd", OP_MULX_LN>; +def VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "fQfQd", OP_MULX_LN> { + let isLaneQ = 1; +} //////////////////////////////////////////////////////////////////////////////// // Across vectors class @@ -1380,11 +1436,15 @@ def 
SCALAR_UQXTN : SInst<"vqmovn", "(1<)1", "SUsSUiSUl">; // Scalar Floating Point multiply (scalar, by element) def SCALAR_FMUL_LANE : IOpInst<"vmul_lane", "11.I", "SfSd", OP_SCALAR_MUL_LN>; -def SCALAR_FMUL_LANEQ : IOpInst<"vmul_laneq", "11QI", "SfSd", OP_SCALAR_MUL_LN>; +def SCALAR_FMUL_LANEQ : IOpInst<"vmul_laneq", "11QI", "SfSd", OP_SCALAR_MUL_LN> { + let isLaneQ = 1; +} // Scalar Floating Point multiply extended (scalar, by element) def SCALAR_FMULX_LANE : IOpInst<"vmulx_lane", "11.I", "SfSd", OP_SCALAR_MULX_LN>; -def SCALAR_FMULX_LANEQ : IOpInst<"vmulx_laneq", "11QI", "SfSd", OP_SCALAR_MULX_LN>; +def SCALAR_FMULX_LANEQ : IOpInst<"vmulx_laneq", "11QI", "SfSd", OP_SCALAR_MULX_LN> { + let isLaneQ = 1; +} def SCALAR_VMUL_N : IInst<"vmul_n", "..1", "d">; @@ -1400,48 +1460,70 @@ def SCALAR_VMUL_LANEQ : IInst<"vmul_laneq", "..QI", "d"> { def SCALAR_VMULX_LANE : IOpInst<"vmulx_lane", "..qI", "d", OP_SCALAR_VMULX_LN>; // VMULX_LANEQ d type implemented using scalar vmulx_laneq -def SCALAR_VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "d", OP_SCALAR_VMULX_LNQ>; +def SCALAR_VMULX_LANEQ : IOpInst<"vmulx_laneq", "..QI", "d", OP_SCALAR_VMULX_LNQ> { + let isLaneQ = 1; +} // Scalar Floating Point fused multiply-add (scalar, by element) def SCALAR_FMLA_LANE : IInst<"vfma_lane", "111.I", "SfSd">; -def SCALAR_FMLA_LANEQ : IInst<"vfma_laneq", "111QI", "SfSd">; +def SCALAR_FMLA_LANEQ : IInst<"vfma_laneq", "111QI", "SfSd"> { + let isLaneQ = 1; +} // Scalar Floating Point fused multiply-subtract (scalar, by element) def SCALAR_FMLS_LANE : IOpInst<"vfms_lane", "111.I", "SfSd", OP_FMS_LN>; -def SCALAR_FMLS_LANEQ : IOpInst<"vfms_laneq", "111QI", "SfSd", OP_FMS_LNQ>; +def SCALAR_FMLS_LANEQ : IOpInst<"vfms_laneq", "111QI", "SfSd", OP_FMS_LNQ> { + let isLaneQ = 1; +} // Signed Saturating Doubling Multiply Long (scalar by element) def SCALAR_SQDMULL_LANE : SOpInst<"vqdmull_lane", "(1>)1.I", "SsSi", OP_SCALAR_QDMULL_LN>; -def SCALAR_SQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(1>)1QI", "SsSi", OP_SCALAR_QDMULL_LN>; +def SCALAR_SQDMULL_LANEQ : SOpInst<"vqdmull_laneq", "(1>)1QI", "SsSi", OP_SCALAR_QDMULL_LN> { + let isLaneQ = 1; +} // Signed Saturating Doubling Multiply-Add Long (scalar by element) def SCALAR_SQDMLAL_LANE : SInst<"vqdmlal_lane", "(1>)(1>)1.I", "SsSi">; -def SCALAR_SQDMLAL_LANEQ : SInst<"vqdmlal_laneq", "(1>)(1>)1QI", "SsSi">; +def SCALAR_SQDMLAL_LANEQ : SInst<"vqdmlal_laneq", "(1>)(1>)1QI", "SsSi"> { + let isLaneQ = 1; +} // Signed Saturating Doubling Multiply-Subtract Long (scalar by element) def SCALAR_SQDMLS_LANE : SInst<"vqdmlsl_lane", "(1>)(1>)1.I", "SsSi">; -def SCALAR_SQDMLS_LANEQ : SInst<"vqdmlsl_laneq", "(1>)(1>)1QI", "SsSi">; +def SCALAR_SQDMLS_LANEQ : SInst<"vqdmlsl_laneq", "(1>)(1>)1QI", "SsSi"> { + let isLaneQ = 1; +} // Scalar Integer Saturating Doubling Multiply Half High (scalar by element) def SCALAR_SQDMULH_LANE : SOpInst<"vqdmulh_lane", "11.I", "SsSi", OP_SCALAR_QDMULH_LN>; -def SCALAR_SQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QDMULH_LN>; +def SCALAR_SQDMULH_LANEQ : SOpInst<"vqdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QDMULH_LN> { + let isLaneQ = 1; +} // Scalar Integer Saturating Rounding Doubling Multiply Half High def SCALAR_SQRDMULH_LANE : SOpInst<"vqrdmulh_lane", "11.I", "SsSi", OP_SCALAR_QRDMULH_LN>; -def SCALAR_SQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QRDMULH_LN>; +def SCALAR_SQRDMULH_LANEQ : SOpInst<"vqrdmulh_laneq", "11QI", "SsSi", OP_SCALAR_QRDMULH_LN> { + let isLaneQ = 1; +} let ArchGuard = "defined(__ARM_FEATURE_QRDMX) && 
defined(__aarch64__)" in { // Signed Saturating Rounding Doubling Multiply Accumulate Returning High Half def SCALAR_SQRDMLAH_LANE : SOpInst<"vqrdmlah_lane", "111.I", "SsSi", OP_SCALAR_QRDMLAH_LN>; -def SCALAR_SQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLAH_LN>; +def SCALAR_SQRDMLAH_LANEQ : SOpInst<"vqrdmlah_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLAH_LN> { + let isLaneQ = 1; +} // Signed Saturating Rounding Doubling Multiply Subtract Returning High Half def SCALAR_SQRDMLSH_LANE : SOpInst<"vqrdmlsh_lane", "111.I", "SsSi", OP_SCALAR_QRDMLSH_LN>; -def SCALAR_SQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLSH_LN>; +def SCALAR_SQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLSH_LN> { + let isLaneQ = 1; +} } def SCALAR_VDUP_LANE : IInst<"vdup_lane", "1.I", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">; -def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs">; +def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs"> { + let isLaneQ = 1; +} } // ARMv8.2-A FP16 vector intrinsics for A32/A64. @@ -1605,36 +1687,52 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarc // FMA lane def VFMA_LANEH : IInst<"vfma_lane", "...qI", "hQh">; - def VFMA_LANEQH : IInst<"vfma_laneq", "...QI", "hQh">; + def VFMA_LANEQH : IInst<"vfma_laneq", "...QI", "hQh"> { + let isLaneQ = 1; + } // FMA lane with scalar argument def FMLA_NH : SOpInst<"vfma_n", "...1", "hQh", OP_FMLA_N>; // Scalar floating point fused multiply-add (scalar, by element) def SCALAR_FMLA_LANEH : IInst<"vfma_lane", "111.I", "Sh">; - def SCALAR_FMLA_LANEQH : IInst<"vfma_laneq", "111QI", "Sh">; + def SCALAR_FMLA_LANEQH : IInst<"vfma_laneq", "111QI", "Sh"> { + let isLaneQ = 1; + } // FMS lane def VFMS_LANEH : IOpInst<"vfms_lane", "...qI", "hQh", OP_FMS_LN>; - def VFMS_LANEQH : IOpInst<"vfms_laneq", "...QI", "hQh", OP_FMS_LNQ>; + def VFMS_LANEQH : IOpInst<"vfms_laneq", "...QI", "hQh", OP_FMS_LNQ> { + let isLaneQ = 1; + } // FMS lane with scalar argument def FMLS_NH : SOpInst<"vfms_n", "...1", "hQh", OP_FMLS_N>; // Scalar floating foint fused multiply-subtract (scalar, by element) def SCALAR_FMLS_LANEH : IOpInst<"vfms_lane", "111.I", "Sh", OP_FMS_LN>; - def SCALAR_FMLS_LANEQH : IOpInst<"vfms_laneq", "111QI", "Sh", OP_FMS_LNQ>; + def SCALAR_FMLS_LANEQH : IOpInst<"vfms_laneq", "111QI", "Sh", OP_FMS_LNQ> { + let isLaneQ = 1; + } // Mul lane - def VMUL_LANEQH : IOpInst<"vmul_laneq", "..QI", "hQh", OP_MUL_LN>; + def VMUL_LANEQH : IOpInst<"vmul_laneq", "..QI", "hQh", OP_MUL_LN> { + let isLaneQ = 1; + } // Scalar floating point multiply (scalar, by element) def SCALAR_FMUL_LANEH : IOpInst<"vmul_lane", "11.I", "Sh", OP_SCALAR_MUL_LN>; - def SCALAR_FMUL_LANEQH : IOpInst<"vmul_laneq", "11QI", "Sh", OP_SCALAR_MUL_LN>; + def SCALAR_FMUL_LANEQH : IOpInst<"vmul_laneq", "11QI", "Sh", OP_SCALAR_MUL_LN> { + let isLaneQ = 1; + } // Mulx lane def VMULX_LANEH : IOpInst<"vmulx_lane", "..qI", "hQh", OP_MULX_LN>; - def VMULX_LANEQH : IOpInst<"vmulx_laneq", "..QI", "hQh", OP_MULX_LN>; + def VMULX_LANEQH : IOpInst<"vmulx_laneq", "..QI", "hQh", OP_MULX_LN> { + let isLaneQ = 1; + } def VMULX_NH : IOpInst<"vmulx_n", "..1", "hQh", OP_MULX_N>; // Scalar floating point mulx (scalar, by element) def SCALAR_FMULX_LANEH : IInst<"vmulx_lane", "11.I", "Sh">; - def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "11QI", "Sh">; + def SCALAR_FMULX_LANEQH : IInst<"vmulx_laneq", "11QI", "Sh"> { + let isLaneQ = 1; + } // ARMv8.2-A FP16 reduction 
vector intrinsics. def VMAXVH : SInst<"vmaxv", "1.", "hQh">; @@ -1651,7 +1749,9 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(__aarc def VUZP2H : SOpInst<"vuzp2", "...", "hQh", OP_UZP2>; def SCALAR_VDUP_LANEH : IInst<"vdup_lane", "1.I", "Sh">; - def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "1QI", "Sh">; + def SCALAR_VDUP_LANEQH : IInst<"vdup_laneq", "1QI", "Sh"> { + let isLaneQ = 1; + } } // v8.2-A dot product instructions. @@ -1661,7 +1761,9 @@ let ArchGuard = "defined(__ARM_FEATURE_DOTPROD)" in { } let ArchGuard = "defined(__ARM_FEATURE_DOTPROD) && defined(__aarch64__)" in { // Variants indexing into a 128-bit vector are A64 only. - def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(<; + def UDOT_LANEQ : SOpInst<"vdot_laneq", "..(<<)(< { + let isLaneQ = 1; + } } // v8.2-A FP16 fused multiply-add long instructions. @@ -1676,10 +1778,18 @@ let ArchGuard = "defined(__ARM_FEATURE_FP16FML) && defined(__aarch64__)" in { def VFMLAL_LANE_HIGH : SOpInst<"vfmlal_lane_high", "(F>)(F>)F(Fq)I", "hQh", OP_FMLAL_LN_Hi>; def VFMLSL_LANE_HIGH : SOpInst<"vfmlsl_lane_high", "(F>)(F>)F(Fq)I", "hQh", OP_FMLSL_LN_Hi>; - def VFMLAL_LANEQ_LOW : SOpInst<"vfmlal_laneq_low", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN>; - def VFMLSL_LANEQ_LOW : SOpInst<"vfmlsl_laneq_low", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN>; - def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN_Hi>; - def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>; + def VFMLAL_LANEQ_LOW : SOpInst<"vfmlal_laneq_low", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN> { + let isLaneQ = 1; + } + def VFMLSL_LANEQ_LOW : SOpInst<"vfmlsl_laneq_low", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN> { + let isLaneQ = 1; + } + def VFMLAL_LANEQ_HIGH : SOpInst<"vfmlal_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLAL_LN_Hi> { + let isLaneQ = 1; + } + def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi> { + let isLaneQ = 1; + } } // v8.3-A Vector complex addition intrinsics diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td index 28b00d162a00d..d817e7acb9135 100644 --- a/clang/include/clang/Basic/arm_neon_incl.td +++ b/clang/include/clang/Basic/arm_neon_incl.td @@ -60,6 +60,15 @@ def op; // example: (call "vget_high", $p0) -> "vgetq_high_s16(__p0)" // (assuming $p0 has type int16x8_t). def call; +// call_mangled - Invoke another intrinsic matching the mangled name variation +// of the caller's base type. If there is no intrinsic defined +// that has the variation and takes the given types, an error +// is generated at tblgen time. +// example: (call_mangled "vfma_lane", $p0, $p1) -> "vfma_lane(__p0, __p1)" +// (assuming non-LaneQ caller) +// (call_mangled "vfma_lane", $p0, $p1) -> "vfma_laneq(__p0, __p1)" +// (assuming LaneQ caller) +def call_mangled; // cast - Perform a cast to a different type. This gets emitted as a static // C-style cast. For a pure reinterpret cast (T x = *(T*)&y), use // "bitcast". @@ -79,6 +88,7 @@ def call; // - "D" - Double the number of lanes in the type. // - "8" - Convert type to an equivalent vector of 8-bit signed // integers. +// - "32" - Convert type to an equivalent vector of 32-bit integers. // example: (cast "R", "U", $p0) -> "(uint32x4_t)__p0" (assuming the return // value is of type "int32x4_t". 
// (cast $p0, "D", "8", $p1) -> "(int8x16_t)__p1" (assuming __p0 @@ -100,12 +110,6 @@ def dup; // example: (dup_typed $p1, $p2) -> "(float16x4_t) {__p2, __p2, __p2, __p2}" // (assuming __p1 is float16x4_t, and __p2 is a compatible scalar). def dup_typed; -// splat - Take a vector and a lane index, and return a vector of the same type -// containing repeated instances of the source vector at the lane index. -// example: (splat $p0, $p1) -> -// "__builtin_shufflevector(__p0, __p0, __p1, __p1, __p1, __p1)" -// (assuming __p0 has four elements). -def splat; // save_temp - Create a temporary (local) variable. The variable takes a name // based on the zero'th parameter and can be referenced using // using that name in subsequent DAGs in the same diff --git a/clang/include/clang/CodeGen/CGFunctionInfo.h b/clang/include/clang/CodeGen/CGFunctionInfo.h index 588c96afe402f..eaf5a3d5aad71 100644 --- a/clang/include/clang/CodeGen/CGFunctionInfo.h +++ b/clang/include/clang/CodeGen/CGFunctionInfo.h @@ -508,6 +508,9 @@ class CGFunctionInfo final /// Whether this is a chain call. unsigned ChainCall : 1; + /// Whether this function is a CMSE nonsecure call + unsigned CmseNSCall : 1; + /// Whether this function is noreturn. unsigned NoReturn : 1; @@ -598,6 +601,8 @@ class CGFunctionInfo final bool isChainCall() const { return ChainCall; } + bool isCmseNSCall() const { return CmseNSCall; } + bool isNoReturn() const { return NoReturn; } /// In ARC, whether this function retains its return value. This @@ -635,7 +640,8 @@ class CGFunctionInfo final FunctionType::ExtInfo getExtInfo() const { return FunctionType::ExtInfo(isNoReturn(), getHasRegParm(), getRegParm(), getASTCallingConvention(), isReturnsRetained(), - isNoCallerSavedRegs(), isNoCfCheck()); + isNoCallerSavedRegs(), isNoCfCheck(), + isCmseNSCall()); } CanQualType getReturnType() const { return getArgsBuffer()[0].type; } @@ -676,6 +682,7 @@ class CGFunctionInfo final ID.AddBoolean(HasRegParm); ID.AddInteger(RegParm); ID.AddBoolean(NoCfCheck); + ID.AddBoolean(CmseNSCall); ID.AddInteger(Required.getOpaqueData()); ID.AddBoolean(HasExtParameterInfos); if (HasExtParameterInfos) { @@ -703,6 +710,7 @@ class CGFunctionInfo final ID.AddBoolean(info.getHasRegParm()); ID.AddInteger(info.getRegParm()); ID.AddBoolean(info.getNoCfCheck()); + ID.AddBoolean(info.getCmseNSCall()); ID.AddInteger(required.getOpaqueData()); ID.AddBoolean(!paramInfos.empty()); if (!paramInfos.empty()) { diff --git a/clang/include/clang/Driver/CC1Options.td b/clang/include/clang/Driver/CC1Options.td index bb1e7fba10e55..2224c152f6266 100644 --- a/clang/include/clang/Driver/CC1Options.td +++ b/clang/include/clang/Driver/CC1Options.td @@ -313,6 +313,8 @@ def mno_zero_initialized_in_bss : Flag<["-"], "mno-zero-initialized-in-bss">, HelpText<"Do not put zero initialized data in the BSS">; def mregparm : Separate<["-"], "mregparm">, HelpText<"Limit the number of registers available for integer arguments">; +def msmall_data_limit : Separate<["-"], "msmall-data-limit">, + HelpText<"Put global and static data smaller than the limit into a special section">; def munwind_tables : Flag<["-"], "munwind-tables">, HelpText<"Generate unwinding tables for all functions">; def mconstructor_aliases : Flag<["-"], "mconstructor-aliases">, @@ -388,8 +390,6 @@ def flto_visibility_public_std: def flto_unit: Flag<["-"], "flto-unit">, HelpText<"Emit IR to support LTO unit features (CFI, whole program vtable opt)">; def fno_lto_unit: Flag<["-"], "fno-lto-unit">; -def femit_debug_entry_values : Flag<["-"], 
"femit-debug-entry-values">, - HelpText<"Enables debug info about call site parameter's entry values">; def fdebug_pass_manager : Flag<["-"], "fdebug-pass-manager">, HelpText<"Prints debug information for the new pass manager">; def fno_debug_pass_manager : Flag<["-"], "fno-debug-pass-manager">, @@ -568,6 +568,11 @@ def fno_concept_satisfaction_caching : Flag<["-"], "fno-concept-satisfaction-caching">, HelpText<"Disable satisfaction caching for C++2a Concepts.">; +def frecovery_ast : Flag<["-"], "frecovery-ast">, + HelpText<"Preserve expressions in AST rather than dropping them when " + "encountering semantic errors">; +def fno_recovery_ast : Flag<["-"], "fno-recovery-ast">; + let Group = Action_Group in { def Eonly : Flag<["-"], "Eonly">, diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 563a894d5439c..383a37bf57a1a 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -468,6 +468,10 @@ def Xanalyzer : Separate<["-"], "Xanalyzer">, HelpText<"Pass to the static analyzer">, MetaVarName<"">, Group; def Xarch__ : JoinedAndSeparate<["-"], "Xarch_">, Flags<[DriverOption]>; +def Xarch_host : Separate<["-"], "Xarch_host">, Flags<[DriverOption]>, + HelpText<"Pass to the CUDA/HIP host compilation">, MetaVarName<"">; +def Xarch_device : Separate<["-"], "Xarch_device">, Flags<[DriverOption]>, + HelpText<"Pass to the CUDA/HIP device compilation">, MetaVarName<"">; def Xassembler : Separate<["-"], "Xassembler">, HelpText<"Pass to the assembler">, MetaVarName<"">, Group; @@ -2357,6 +2361,9 @@ def mrelax : Flag<["-"], "mrelax">, Group, HelpText<"Enable linker relaxation">; def mno_relax : Flag<["-"], "mno-relax">, Group, HelpText<"Disable linker relaxation">; +def msmall_data_limit_EQ : Joined<["-"], "msmall-data-limit=">, Group, + Alias, + HelpText<"Put global and static data smaller than the limit into a special section">; def msave_restore : Flag<["-"], "msave-restore">, Group, HelpText<"Enable using library calls for save and restore">; def mno_save_restore : Flag<["-"], "mno-save-restore">, Group, diff --git a/clang/include/clang/Driver/ToolChain.h b/clang/include/clang/Driver/ToolChain.h index 9423b9296f7a3..4e0bda105fa47 100644 --- a/clang/include/clang/Driver/ToolChain.h +++ b/clang/include/clang/Driver/ToolChain.h @@ -310,6 +310,22 @@ class ToolChain { SmallVectorImpl &AllocatedArgs, Action::OffloadKind DeviceOffloadKind) const; + /// Append the argument following \p A to \p DAL assuming \p A is an Xarch + /// argument. If \p AllocatedArgs is null pointer, synthesized arguments are + /// added to \p DAL, otherwise they are appended to \p AllocatedArgs. + virtual void TranslateXarchArgs( + const llvm::opt::DerivedArgList &Args, llvm::opt::Arg *&A, + llvm::opt::DerivedArgList *DAL, + SmallVectorImpl *AllocatedArgs = nullptr) const; + + /// Translate -Xarch_ arguments. If there are no such arguments, return + /// a null pointer, otherwise return a DerivedArgList containing the + /// translated arguments. + virtual llvm::opt::DerivedArgList * + TranslateXarchArgs(const llvm::opt::DerivedArgList &Args, StringRef BoundArch, + Action::OffloadKind DeviceOffloadKind, + SmallVectorImpl *AllocatedArgs) const; + /// Choose a tool to use to handle the action \p JA. 
/// /// This can be overridden when a particular ToolChain needs to use diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index f62f2a3fee40a..b6f65227ab2eb 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -1815,7 +1815,9 @@ class Parser : public CodeCompletionHandler { bool EnteringContext, IdentifierInfo &II, CXXScopeSpec &SS); - bool ParseOptionalCXXScopeSpecifier(CXXScopeSpec &SS, ParsedType ObjectType, + bool ParseOptionalCXXScopeSpecifier(CXXScopeSpec &SS, + ParsedType ObjectType, + bool ObjectHasErrors, bool EnteringContext, bool *MayBePseudoDestructor = nullptr, bool IsTypename = false, @@ -2917,11 +2919,12 @@ class Parser : public CodeCompletionHandler { AccessSpecifier getAccessSpecifierIfPresent() const; bool ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS, + ParsedType ObjectType, + bool ObjectHadErrors, SourceLocation TemplateKWLoc, IdentifierInfo *Name, SourceLocation NameLoc, bool EnteringContext, - ParsedType ObjectType, UnqualifiedId &Id, bool AssumeTemplateId); bool ParseUnqualifiedIdOperator(CXXScopeSpec &SS, bool EnteringContext, @@ -3083,20 +3086,19 @@ class Parser : public CodeCompletionHandler { SmallVector MapTypeModifiersLoc; bool IsMapTypeImplicit = false; - SourceLocation DepLinMapLastLoc; + SourceLocation ExtraModifierLoc; }; /// Parses clauses with list. bool ParseOpenMPVarList(OpenMPDirectiveKind DKind, OpenMPClauseKind Kind, SmallVectorImpl &Vars, OpenMPVarListDataTy &Data); - bool ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext, - bool AllowDestructorName, - bool AllowConstructorName, + bool ParseUnqualifiedId(CXXScopeSpec &SS, ParsedType ObjectType, + bool ObjectHadErrors, bool EnteringContext, + bool AllowDestructorName, bool AllowConstructorName, bool AllowDeductionGuide, - ParsedType ObjectType, - SourceLocation *TemplateKWLoc, - UnqualifiedId &Result); + SourceLocation *TemplateKWLoc, UnqualifiedId &Result); + /// Parses the mapper modifier in map, to, and from clauses. bool parseMapperModifier(OpenMPVarListDataTy &Data); /// Parses map-type-modifiers in map clause. diff --git a/clang/include/clang/Sema/ExternalSemaSource.h b/clang/include/clang/Sema/ExternalSemaSource.h index c79ca0e71df5e..2854b4893484d 100644 --- a/clang/include/clang/Sema/ExternalSemaSource.h +++ b/clang/include/clang/Sema/ExternalSemaSource.h @@ -193,6 +193,15 @@ class ExternalSemaSource : public ExternalASTSource { llvm::MapVector> &LPTMap) {} + /// Read the set of decls to be checked for deferred diags. + /// + /// The external source should append its own potentially emitted function + /// and variable decls which may cause deferred diags. Note that this routine + /// may be invoked multiple times; the external source should take care not to + /// introduce the same declarations repeatedly. + virtual void ReadDeclsToCheckForDeferredDiags( + llvm::SmallVector &Decls) {} + /// \copydoc Sema::CorrectTypo /// \note LookupKind must correspond to a valid Sema::LookupNameKind /// diff --git a/clang/include/clang/Sema/MultiplexExternalSemaSource.h b/clang/include/clang/Sema/MultiplexExternalSemaSource.h index dcbac9f0ba105..e94dd5d468711 100644 --- a/clang/include/clang/Sema/MultiplexExternalSemaSource.h +++ b/clang/include/clang/Sema/MultiplexExternalSemaSource.h @@ -332,6 +332,15 @@ class MultiplexExternalSemaSource : public ExternalSemaSource { llvm::MapVector> &LPTMap) override; + /// Read the set of decls to be checked for deferred diags. 
+ /// + /// The external source should append its own potentially emitted function + /// and variable decls which may cause deferred diags. Note that this routine + /// may be invoked multiple times; the external source should take care not to + /// introduce the same declarations repeatedly. + void ReadDeclsToCheckForDeferredDiags( + llvm::SmallVector &Decls) override; + /// \copydoc ExternalSemaSource::CorrectTypo /// \note Returns the first nonempty correction. TypoCorrection CorrectTypo(const DeclarationNameInfo &Typo, diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index 6944b0b5756e0..5023525aa41bd 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -983,6 +983,14 @@ class Sema; return CRK; } + /// Determines whether this operator could be implemented by a function + /// with reversed parameter order. + bool isReversible() { + return AllowRewrittenCandidates && OriginalOperator && + (getRewrittenOverloadedOperator(OriginalOperator) != OO_None || + shouldAddReversed(OriginalOperator)); + } + /// Determine whether we should consider looking for and adding reversed /// candidates for operator Op. bool shouldAddReversed(OverloadedOperatorKind Op); diff --git a/clang/include/clang/Sema/ParsedAttr.h b/clang/include/clang/Sema/ParsedAttr.h index 27e9e66d8d843..e1e63ed87896b 100644 --- a/clang/include/clang/Sema/ParsedAttr.h +++ b/clang/include/clang/Sema/ParsedAttr.h @@ -68,7 +68,7 @@ struct ParsedAttrInfo { std::vector Spellings; ParsedAttrInfo(AttributeCommonInfo::Kind AttrKind = - AttributeCommonInfo::UnknownAttribute) + AttributeCommonInfo::NoSemaHandlerAttribute) : AttrKind(AttrKind), NumArgs(0), OptArgs(0), HasCustomParsing(0), IsTargetSpecific(0), IsType(0), IsStmt(0), IsKnownToGCC(0), IsSupportedByPragmaAttribute(0) {} @@ -99,6 +99,18 @@ struct ParsedAttrInfo { llvm::SmallVectorImpl> &Rules, const LangOptions &LangOpts) const { } + enum AttrHandling { + NotHandled, + AttributeApplied, + AttributeNotApplied + }; + /// If this ParsedAttrInfo knows how to handle this ParsedAttr applied to this + /// Decl then do so and return either AttributeApplied if it was applied or + /// AttributeNotApplied if it wasn't. Otherwise return NotHandled. + virtual AttrHandling handleDeclAttribute(Sema &S, Decl *D, + const ParsedAttr &Attr) const { + return NotHandled; + } static const ParsedAttrInfo &get(const AttributeCommonInfo &A); }; diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 29c9b840407cd..5e7f20cb4e9c1 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -1632,6 +1632,18 @@ class Sema final { void emitAndClearUnusedLocalTypedefWarnings(); + private: + /// Function or variable declarations to be checked for whether the deferred + /// diagnostics should be emitted. + SmallVector DeclsToCheckForDeferredDiags; + + public: + // Emit all deferred diagnostics. + void emitDeferredDiags(); + // Emit any deferred diagnostics for FD and erase them from the map in which + // they're stored. + void emitDeferredDiags(FunctionDecl *FD, bool ShowCallStack); + enum TUFragmentKind { /// The global module fragment, between 'module;' and a module-declaration. 
Global, @@ -3918,7 +3930,8 @@ class Sema final { TemplateDiscarded, // Discarded due to uninstantiated templates Unknown, }; - FunctionEmissionStatus getEmissionStatus(FunctionDecl *Decl); + FunctionEmissionStatus getEmissionStatus(FunctionDecl *Decl, + bool Final = false); // Whether the callee should be ignored in CUDA/HIP/OpenMP host/device check. bool shouldIgnoreInHostDeviceCheck(FunctionDecl *Callee); @@ -4023,6 +4036,10 @@ class Sema final { void DiagnoseAmbiguousLookup(LookupResult &Result); //@} + /// Attempts to produce a RecoveryExpr after some AST node cannot be created. + ExprResult CreateRecoveryExpr(SourceLocation Begin, SourceLocation End, + ArrayRef SubExprs); + ObjCInterfaceDecl *getObjCInterfaceDecl(IdentifierInfo *&Id, SourceLocation IdLoc, bool TypoCorrection = false); @@ -4952,13 +4969,14 @@ class Sema final { ExprResult ActOnPredefinedExpr(SourceLocation Loc, tok::TokenKind Kind); ExprResult ActOnIntegerConstant(SourceLocation Loc, uint64_t Val); - ExprResult BuildUniqueStableName(SourceLocation OpLoc, - TypeSourceInfo *Operand); - ExprResult BuildUniqueStableName(SourceLocation OpLoc, Expr *E); - ExprResult ActOnUniqueStableNameExpr(SourceLocation OpLoc, SourceLocation L, - SourceLocation R, ParsedType Ty); - ExprResult ActOnUniqueStableNameExpr(SourceLocation OpLoc, SourceLocation L, - SourceLocation R, Expr *Operand); + ExprResult BuildUniqueStableName(SourceLocation Loc, TypeSourceInfo *Operand); + ExprResult BuildUniqueStableName(SourceLocation Loc, Expr *E); + ExprResult ActOnUniqueStableNameExpr(SourceLocation OpLoc, + SourceLocation LParen, + SourceLocation RParen, ParsedType Ty); + ExprResult ActOnUniqueStableNameExpr(SourceLocation OpLoc, + SourceLocation LParen, + SourceLocation RParen, Expr *Operand); bool CheckLoopHintExpr(Expr *E, SourceLocation Loc); @@ -9942,22 +9960,10 @@ class Sema final { /// Pop OpenMP function region for non-capturing function. void popOpenMPFunctionRegion(const sema::FunctionScopeInfo *OldFSI); - /// Check whether we're allowed to call Callee from the current function. - void checkOpenMPDeviceFunction(SourceLocation Loc, FunctionDecl *Callee, - bool CheckForDelayedContext = true); - - /// Check whether we're allowed to call Callee from the current function. - void checkOpenMPHostFunction(SourceLocation Loc, FunctionDecl *Callee, - bool CheckCaller = true); - /// Check if the expression is allowed to be used in expressions for the /// OpenMP devices. void checkOpenMPDeviceExpr(const Expr *E); - /// Finishes analysis of the deferred functions calls that may be declared as - /// host/nohost during device/host compilation. - void finalizeOpenMPDelayedAnalysis(); - /// Checks if a type or a declaration is disabled due to the owning extension /// being disabled, and emits diagnostic messages if it is disabled. /// \param D type or declaration to be checked. @@ -10148,6 +10154,11 @@ class Sema final { void checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, SourceLocation IdLoc = SourceLocation()); + /// Finishes analysis of the deferred functions calls that may be declared as + /// host/nohost during device/host compilation. + void finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, + const FunctionDecl *Callee, + SourceLocation Loc); /// Return true inside OpenMP declare target region. 
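The DeclsToCheckForDeferredDiags field and the new Final parameter of getEmissionStatus replace the old DeviceCallGraph bookkeeping with a single end-of-TU pass. A simplified sketch of that flow, not the literal implementation in this patch (which uses a dedicated visitor), might read:

// Conceptual sketch only.
void Sema::emitDeferredDiags() {
  // Let an external AST source (e.g. the AST reader) contribute the decls it
  // recorded via ReadDeclsToCheckForDeferredDiags.
  if (ExternalSource)
    ExternalSource->ReadDeclsToCheckForDeferredDiags(
        DeclsToCheckForDeferredDiags);
  for (Decl *D : DeclsToCheckForDeferredDiags)
    if (auto *FD = dyn_cast<FunctionDecl>(D))
      // Final = true: the whole TU has been seen, so the emission status is
      // definitive rather than a conservative guess.
      if (getEmissionStatus(FD, /*Final=*/true) ==
          FunctionEmissionStatus::Emitted)
        emitDeferredDiags(FD, /*ShowCallStack=*/true);
}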
bool isInOpenMPDeclareTargetContext() const { return DeclareTargetNestingLevel > 0; @@ -10269,6 +10280,10 @@ class Sema final { StmtResult ActOnOpenMPDepobjDirective(ArrayRef Clauses, SourceLocation StartLoc, SourceLocation EndLoc); + /// Called on well-formed '\#pragma omp scan'. + StmtResult ActOnOpenMPScanDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc); /// Called on well-formed '\#pragma omp ordered' after parsing of the /// associated statement. StmtResult ActOnOpenMPOrderedDirective(ArrayRef Clauses, @@ -10665,7 +10680,17 @@ class Sema final { DeclarationNameInfo &ReductionOrMapperId, int ExtraModifier, ArrayRef MapTypeModifiers, ArrayRef MapTypeModifiersLoc, bool IsMapTypeImplicit, - SourceLocation DepLinMapLastLoc); + SourceLocation ExtraModifierLoc); + /// Called on well-formed 'inclusive' clause. + OMPClause *ActOnOpenMPInclusiveClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); + /// Called on well-formed 'exclusive' clause. + OMPClause *ActOnOpenMPExclusiveClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc); /// Called on well-formed 'allocate' clause. OMPClause * ActOnOpenMPAllocateClause(Expr *Allocator, ArrayRef VarList, @@ -10693,9 +10718,10 @@ class Sema final { SourceLocation EndLoc); /// Called on well-formed 'reduction' clause. OMPClause *ActOnOpenMPReductionClause( - ArrayRef VarList, SourceLocation StartLoc, - SourceLocation LParenLoc, SourceLocation ColonLoc, SourceLocation EndLoc, - CXXScopeSpec &ReductionIdScopeSpec, + ArrayRef VarList, OpenMPReductionClauseModifier Modifier, + SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation ModifierLoc, SourceLocation ColonLoc, + SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId, ArrayRef UnresolvedReductions = llvm::None); /// Called on well-formed 'task_reduction' clause. @@ -11525,18 +11551,6 @@ class Sema final { /* Caller = */ FunctionDeclAndLoc> DeviceKnownEmittedFns; - /// A partial call graph maintained during CUDA/OpenMP device code compilation - /// to support deferred diagnostics. - /// - /// Functions are only added here if, at the time they're considered, they are - /// not known-emitted. As soon as we discover that a function is - /// known-emitted, we remove it and everything it transitively calls from this - /// set and add those functions to DeviceKnownEmittedFns. - llvm::DenseMap, - /* Callees = */ llvm::MapVector, - SourceLocation>> - DeviceCallGraph; - /// Diagnostic builder for CUDA/OpenMP devices errors which may or may not be /// deferred. /// @@ -11611,14 +11625,6 @@ class Sema final { llvm::Optional PartialDiagId; }; - /// Indicate that this function (and thus everything it transtively calls) - /// will be codegen'ed, and emit any deferred diagnostics on this function and - /// its (transitive) callees. - void markKnownEmitted( - Sema &S, FunctionDecl *OrigCaller, FunctionDecl *OrigCallee, - SourceLocation OrigLoc, - const llvm::function_ref IsKnownEmitted); - /// Creates a DeviceDiagBuilder that emits the diagnostic if the current context /// is "used as device code". /// @@ -12486,10 +12492,11 @@ class Sema final { /// codegen'ed yet. bool checkSYCLDeviceFunction(SourceLocation Loc, FunctionDecl *Callee); - /// Emit diagnostic that can't be emitted with deferred diagnostics mechanism. - /// At this step we imply that all device functions are marked with - /// sycl_device attribute. 
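The new ActOnOpenMPScanDirective, the inclusive/exclusive clause handlers, and the reduction-modifier parameter above correspond to OpenMP 5.0 scan reductions. An illustrative piece of user code (not from this patch) that exercises them:

void prefix_sum(const int *in, int *out, int n) {
  int sum = 0;
#pragma omp simd reduction(inscan, + : sum)
  for (int i = 0; i < n; ++i) {
    sum += in[i];                 // input phase of the scan
#pragma omp scan inclusive(sum)   // '#pragma omp scan' with 'inclusive' clause
    out[i] = sum;                 // scan phase: 'sum' holds the inclusive sum
  }
}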
- void finalizeSYCLDelayedAnalysis(); + /// Finishes analysis of the deferred functions calls that may be not + /// properly declared for device compilation. + void finalizeSYCLDelayedAnalysis(const FunctionDecl *Caller, + const FunctionDecl *Callee, + SourceLocation Loc); }; template diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 31226fb0516cb..f185c1a16834b 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -650,7 +650,10 @@ namespace serialization { PP_CONDITIONAL_STACK = 62, /// A table of skipped ranges within the preprocessing record. - PPD_SKIPPED_RANGES = 63 + PPD_SKIPPED_RANGES = 63, + + /// Record code for the Decls to be checked for deferred diags. + DECLS_TO_CHECK_FOR_DEFERRED_DIAGS = 64, }; /// Record types used within a source manager block. @@ -1631,6 +1634,9 @@ namespace serialization { /// An AtomicExpr record. EXPR_ATOMIC, + /// A RecoveryExpr record. + EXPR_RECOVERY, + // Objective-C /// An ObjCStringLiteral record. @@ -1826,6 +1832,7 @@ namespace serialization { STMT_OMP_TASKWAIT_DIRECTIVE, STMT_OMP_FLUSH_DIRECTIVE, STMT_OMP_DEPOBJ_DIRECTIVE, + STMT_OMP_SCAN_DIRECTIVE, STMT_OMP_ORDERED_DIRECTIVE, STMT_OMP_ATOMIC_DIRECTIVE, STMT_OMP_TARGET_DIRECTIVE, diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h index e74bf00e08727..94645fff9f932 100644 --- a/clang/include/clang/Serialization/ASTReader.h +++ b/clang/include/clang/Serialization/ASTReader.h @@ -890,6 +890,12 @@ class ASTReader // A list of late parsed template function data. SmallVector LateParsedTemplates; + /// The IDs of all decls to be checked for deferred diags. + /// + /// Sema tracks these to emit deferred diags. + SmallVector DeclsToCheckForDeferredDiags; + + public: struct ImportedSubmodule { serialization::SubmoduleID ID; @@ -1983,6 +1989,9 @@ class ASTReader void ReadUnusedLocalTypedefNameCandidates( llvm::SmallSetVector &Decls) override; + void ReadDeclsToCheckForDeferredDiags( + llvm::SmallVector &Decls) override; + void ReadReferencedSelectors( SmallVectorImpl> &Sels) override; diff --git a/clang/include/clang/Serialization/ASTRecordReader.h b/clang/include/clang/Serialization/ASTRecordReader.h index 7d4ef7c43a9d9..5bdc9ca2ddbfc 100644 --- a/clang/include/clang/Serialization/ASTRecordReader.h +++ b/clang/include/clang/Serialization/ASTRecordReader.h @@ -22,7 +22,7 @@ #include "llvm/ADT/APSInt.h" namespace clang { -struct OMPTraitInfo; +class OMPTraitInfo; /// An object for streaming information from a record. class ASTRecordReader @@ -260,7 +260,7 @@ class ASTRecordReader } /// Read an OMPTraitInfo object, advancing Idx. - OMPTraitInfo readOMPTraitInfo(); + OMPTraitInfo *readOMPTraitInfo(); /// Read an OpenMP clause, advancing Idx. OMPClause *readOMPClause(); diff --git a/clang/include/clang/Serialization/ASTRecordWriter.h b/clang/include/clang/Serialization/ASTRecordWriter.h index 924aa5d4b758b..491207c9de906 100644 --- a/clang/include/clang/Serialization/ASTRecordWriter.h +++ b/clang/include/clang/Serialization/ASTRecordWriter.h @@ -267,7 +267,7 @@ class ASTRecordWriter void AddCXXDefinitionData(const CXXRecordDecl *D); /// Write an OMPTraitInfo object. 
- void writeOMPTraitInfo(const OMPTraitInfo &TI); + void writeOMPTraitInfo(const OMPTraitInfo *TI); void writeOMPClause(OMPClause *C); diff --git a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td index 91c0fc88f6e7b..a21107cd4c2df 100644 --- a/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td +++ b/clang/include/clang/StaticAnalyzer/Checkers/Checkers.td @@ -295,6 +295,13 @@ def StdCLibraryFunctionsChecker : Checker<"StdCLibraryFunctions">, HelpText<"Improve modeling of the C standard library functions">, Documentation; +def StdCLibraryFunctionArgsChecker : Checker<"StdCLibraryFunctionArgs">, + HelpText<"Check constraints of arguments of C standard library functions, " + "such as whether the parameter of isalpha is in the range [0, 255] " + "or is EOF.">, + Dependencies<[StdCLibraryFunctionsChecker]>, + Documentation; + def TrustNonnullChecker : Checker<"TrustNonnull">, HelpText<"Trust that returns from framework methods annotated with _Nonnull " "are not null">, diff --git a/clang/include/clang/StaticAnalyzer/Core/Analyses.def b/clang/include/clang/StaticAnalyzer/Core/Analyses.def index 377451576148c..c4e5f5be6fd7d 100644 --- a/clang/include/clang/StaticAnalyzer/Core/Analyses.def +++ b/clang/include/clang/StaticAnalyzer/Core/Analyses.def @@ -14,41 +14,80 @@ #define ANALYSIS_STORE(NAME, CMDFLAG, DESC, CREATFN) #endif -ANALYSIS_STORE(RegionStore, "region", "Use region-based analyzer store", CreateRegionStoreManager) +ANALYSIS_STORE(RegionStore, "region", "Use region-based analyzer store", + CreateRegionStoreManager) #ifndef ANALYSIS_CONSTRAINTS #define ANALYSIS_CONSTRAINTS(NAME, CMDFLAG, DESC, CREATFN) #endif -ANALYSIS_CONSTRAINTS(RangeConstraints, "range", "Use constraint tracking of concrete value ranges", CreateRangeConstraintManager) -ANALYSIS_CONSTRAINTS(Z3Constraints, "z3", "Use Z3 contraint solver", CreateZ3ConstraintManager) +ANALYSIS_CONSTRAINTS(RangeConstraints, "range", + "Use constraint tracking of concrete value ranges", + CreateRangeConstraintManager) + +ANALYSIS_CONSTRAINTS(Z3Constraints, "z3", "Use Z3 contraint solver", + CreateZ3ConstraintManager) #ifndef ANALYSIS_DIAGNOSTICS #define ANALYSIS_DIAGNOSTICS(NAME, CMDFLAG, DESC, CREATEFN) #endif -ANALYSIS_DIAGNOSTICS(HTML, "html", "Output analysis results using HTML", createHTMLDiagnosticConsumer) -ANALYSIS_DIAGNOSTICS(HTML_SINGLE_FILE, "html-single-file", "Output analysis results using HTML (not allowing for multi-file bugs)", createHTMLSingleFileDiagnosticConsumer) -ANALYSIS_DIAGNOSTICS(PLIST, "plist", "Output analysis results using Plists", createPlistDiagnosticConsumer) -ANALYSIS_DIAGNOSTICS(PLIST_MULTI_FILE, "plist-multi-file", "Output analysis results using Plists (allowing for multi-file bugs)", createPlistMultiFileDiagnosticConsumer) -ANALYSIS_DIAGNOSTICS(PLIST_HTML, "plist-html", "Output analysis results using HTML wrapped with Plists", createPlistHTMLDiagnosticConsumer) -ANALYSIS_DIAGNOSTICS(SARIF, "sarif", "Output analysis results in a SARIF file", createSarifDiagnosticConsumer) -ANALYSIS_DIAGNOSTICS(TEXT, "text", "Text output of analysis results", createTextPathDiagnosticConsumer) +ANALYSIS_DIAGNOSTICS(HTML, "html", "Output analysis results using HTML", + createHTMLDiagnosticConsumer) + +ANALYSIS_DIAGNOSTICS( + HTML_SINGLE_FILE, "html-single-file", + "Output analysis results using HTML (not allowing for multi-file bugs)", + createHTMLSingleFileDiagnosticConsumer) + +ANALYSIS_DIAGNOSTICS(PLIST, "plist", "Output analysis results using Plists", + 
createPlistDiagnosticConsumer) + +ANALYSIS_DIAGNOSTICS( + PLIST_MULTI_FILE, "plist-multi-file", + "Output analysis results using Plists (allowing for multi-file bugs)", + createPlistMultiFileDiagnosticConsumer) + +ANALYSIS_DIAGNOSTICS(PLIST_HTML, "plist-html", + "Output analysis results using HTML wrapped with Plists", + createPlistHTMLDiagnosticConsumer) + +ANALYSIS_DIAGNOSTICS(SARIF, "sarif", "Output analysis results in a SARIF file", + createSarifDiagnosticConsumer) + +ANALYSIS_DIAGNOSTICS(TEXT, "text", "Text output of analysis results to stderr", + createTextPathDiagnosticConsumer) + +ANALYSIS_DIAGNOSTICS(TEXT_MINIMAL, "text-minimal", + "Emits minimal diagnostics to stderr, stating only the " + "warning message and the associated notes. Usually " + "used in addition to other analysis types", + createTextMinimalPathDiagnosticConsumer) #ifndef ANALYSIS_PURGE #define ANALYSIS_PURGE(NAME, CMDFLAG, DESC) #endif -ANALYSIS_PURGE(PurgeStmt, "statement", "Purge symbols, bindings, and constraints before every statement") -ANALYSIS_PURGE(PurgeBlock, "block", "Purge symbols, bindings, and constraints before every basic block") -ANALYSIS_PURGE(PurgeNone, "none", "Do not purge symbols, bindings, or constraints") +ANALYSIS_PURGE( + PurgeStmt, "statement", + "Purge symbols, bindings, and constraints before every statement") + +ANALYSIS_PURGE( + PurgeBlock, "block", + "Purge symbols, bindings, and constraints before every basic block") + +ANALYSIS_PURGE(PurgeNone, "none", + "Do not purge symbols, bindings, or constraints") #ifndef ANALYSIS_INLINING_MODE #define ANALYSIS_INLINING_MODE(NAME, CMDFLAG, DESC) #endif -ANALYSIS_INLINING_MODE(All, "all", "Analyze all functions as top level") -ANALYSIS_INLINING_MODE(NoRedundancy, "noredundancy", "Do not analyze a function which has been previously inlined") +ANALYSIS_INLINING_MODE(All, "all", "Analyze all functions as top level") + +ANALYSIS_INLINING_MODE( + NoRedundancy, "noredundancy", + "Do not analyze a function which has been previously inlined") #undef ANALYSIS_STORE #undef ANALYSIS_CONSTRAINTS @@ -56,4 +95,3 @@ ANALYSIS_INLINING_MODE(NoRedundancy, "noredundancy", "Do not analyze a function #undef ANALYSIS_PURGE #undef ANALYSIS_INLINING_MODE #undef ANALYSIS_IPA - diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ConstraintManager.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ConstraintManager.h index f85c373791585..935b2bb7b937d 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ConstraintManager.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ConstraintManager.h @@ -96,11 +96,7 @@ class ConstraintManager { // If StTrue is infeasible, asserting the falseness of Cond is unnecessary // because the existing constraints already establish this. if (!StTrue) { -#ifndef __OPTIMIZE__ - // This check is expensive and should be disabled even in Release+Asserts - // builds. - // FIXME: __OPTIMIZE__ is a GNU extension that Clang implements but MSVC - // does not. Is there a good equivalent there? 
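The StdCLibraryFunctionArgs checker registered above constrains arguments of C standard library calls; an illustrative translation unit (not from this patch) containing a call it is meant to flag:

#include <ctype.h>

int bad_classify(void) {
  // isalpha requires an argument representable as unsigned char ([0, 255]) or
  // equal to EOF; 512 is neither, so the checker can warn on this call.
  return isalpha(512);
}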
+#ifdef EXPENSIVE_CHECKS assert(assume(State, Cond, false) && "System is over constrained."); #endif return ProgramStatePair((ProgramStateRef)nullptr, State); diff --git a/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h b/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h index 8f0c7edc58b43..8830542f27d82 100644 --- a/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h +++ b/clang/include/clang/StaticAnalyzer/Frontend/CheckerRegistry.h @@ -167,7 +167,7 @@ class CheckerRegistry { } bool isDisabled(const LangOptions &LO) const { - return State == StateFromCmdLine::State_Disabled && ShouldRegister(LO); + return State == StateFromCmdLine::State_Disabled || !ShouldRegister(LO); } // Since each checker must have a different full name, we can identify diff --git a/clang/include/clang/Tooling/Syntax/Nodes.h b/clang/include/clang/Tooling/Syntax/Nodes.h index 82fcac33f99bd..f4d482bb848c6 100644 --- a/clang/include/clang/Tooling/Syntax/Nodes.h +++ b/clang/include/clang/Tooling/Syntax/Nodes.h @@ -64,6 +64,8 @@ enum class NodeKind : uint16_t { StaticAssertDeclaration, LinkageSpecificationDeclaration, SimpleDeclaration, + TemplateDeclaration, + ExplicitTemplateInstantiation, NamespaceDefinition, NamespaceAliasDefinition, UsingNamespaceDirective, @@ -112,6 +114,9 @@ enum class NodeRole : uint8_t { StaticAssertDeclaration_condition, StaticAssertDeclaration_message, SimpleDeclaration_declarator, + TemplateDeclaration_declaration, + ExplicitTemplateInstantiation_externKeyword, + ExplicitTemplateInstantiation_declaration, ArraySubscript_sizeExpression, TrailingReturnType_arrow, TrailingReturnType_declarator, @@ -396,6 +401,34 @@ class SimpleDeclaration final : public Declaration { std::vector declarators(); }; +/// template +class TemplateDeclaration final : public Declaration { +public: + TemplateDeclaration() : Declaration(NodeKind::TemplateDeclaration) {} + static bool classof(const Node *N) { + return N->kind() == NodeKind::TemplateDeclaration; + } + syntax::Leaf *templateKeyword(); + syntax::Declaration *declaration(); +}; + +/// template +/// Examples: +/// template struct X +/// template void foo() +/// template int var +class ExplicitTemplateInstantiation final : public Declaration { +public: + ExplicitTemplateInstantiation() + : Declaration(NodeKind::ExplicitTemplateInstantiation) {} + static bool classof(const Node *N) { + return N->kind() == NodeKind::ExplicitTemplateInstantiation; + } + syntax::Leaf *templateKeyword(); + syntax::Leaf *externKeyword(); + syntax::Declaration *declaration(); +}; + /// namespace { } class NamespaceDefinition final : public Declaration { public: diff --git a/clang/include/clang/Tooling/Syntax/Tree.h b/clang/include/clang/Tooling/Syntax/Tree.h index 8702fe60ce1b2..bc581004c46e6 100644 --- a/clang/include/clang/Tooling/Syntax/Tree.h +++ b/clang/include/clang/Tooling/Syntax/Tree.h @@ -126,6 +126,8 @@ class Node { // FactoryImpl sets CanModify flag. friend class FactoryImpl; + void setRole(NodeRole NR); + Tree *Parent; Node *NextSibling; unsigned Kind : 16; @@ -171,8 +173,11 @@ class Tree : public Node { /// Prepend \p Child to the list of children and and sets the parent pointer. /// A very low-level operation that does not check any invariants, only used /// by TreeBuilder and FactoryImpl. - /// EXPECTS: Role != NodeRoleDetached. + /// EXPECTS: Role != Detached. void prependChildLowLevel(Node *Child, NodeRole Role); + /// Like the previous overload, but does not set role for \p Child. 
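Spelled out, the C++ constructs covered by the two new syntax tree node kinds introduced above are (illustrative):

template <class T> struct X {};   // TemplateDeclaration wrapping a struct
template <class T> void foo() {}  // TemplateDeclaration wrapping a function
template struct X<int>;           // ExplicitTemplateInstantiation (definition)
extern template struct X<double>; // ExplicitTemplateInstantiation; the
                                  // 'extern' is exposed via externKeyword()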
+ /// EXPECTS: Child->Role != Detached + void prependChildLowLevel(Node *Child); friend class TreeBuilder; friend class FactoryImpl; diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index b4c24bbf30ec8..29f028bfc2020 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -29,6 +29,7 @@ #include "clang/AST/DeclOpenMP.h" #include "clang/AST/DeclTemplate.h" #include "clang/AST/DeclarationName.h" +#include "clang/AST/DependenceFlags.h" #include "clang/AST/Expr.h" #include "clang/AST/ExprCXX.h" #include "clang/AST/ExprConcepts.h" @@ -1010,6 +1011,9 @@ ASTContext::~ASTContext() { for (APValue *Value : APValueCleanups) Value->~APValue(); + + // Destroy the OMPTraitInfo objects that life here. + llvm::DeleteContainerPointers(OMPTraitInfoVector); } void ASTContext::setTraversalScope(const std::vector &TopLevelDecls) { @@ -5127,8 +5131,12 @@ ASTContext::getAutoType(QualType DeducedType, AutoTypeKeyword Keyword, void *Mem = Allocate(sizeof(AutoType) + sizeof(TemplateArgument) * TypeConstraintArgs.size(), TypeAlignment); - auto *AT = new (Mem) AutoType(DeducedType, Keyword, IsDependent, IsPack, - TypeConstraintConcept, TypeConstraintArgs); + auto *AT = new (Mem) AutoType( + DeducedType, Keyword, + (IsDependent ? TypeDependence::DependentInstantiation + : TypeDependence::None) | + (IsPack ? TypeDependence::UnexpandedPack : TypeDependence::None), + TypeConstraintConcept, TypeConstraintArgs); Types.push_back(AT); if (InsertPos) AutoTypes.InsertNode(AT, InsertPos); @@ -5188,11 +5196,11 @@ QualType ASTContext::getAtomicType(QualType T) const { /// getAutoDeductType - Get type pattern for deducing against 'auto'. QualType ASTContext::getAutoDeductType() const { if (AutoDeductTy.isNull()) - AutoDeductTy = QualType( - new (*this, TypeAlignment) AutoType(QualType(), AutoTypeKeyword::Auto, - /*dependent*/false, /*pack*/false, - /*concept*/nullptr, /*args*/{}), - 0); + AutoDeductTy = QualType(new (*this, TypeAlignment) + AutoType(QualType(), AutoTypeKeyword::Auto, + TypeDependence::None, + /*concept*/ nullptr, /*args*/ {}), + 0); return AutoDeductTy; } @@ -10861,3 +10869,8 @@ void ASTContext::getFunctionFeatureMap(llvm::StringMap &FeatureMap, Target->getTargetOpts().Features); } } + +OMPTraitInfo &ASTContext::getNewOMPTraitInfo() { + OMPTraitInfoVector.push_back(new OMPTraitInfo()); + return *OMPTraitInfoVector.back(); +} diff --git a/clang/lib/AST/AttrImpl.cpp b/clang/lib/AST/AttrImpl.cpp index 2c76f86713fbd..a5ff68c187788 100644 --- a/clang/lib/AST/AttrImpl.cpp +++ b/clang/lib/AST/AttrImpl.cpp @@ -159,7 +159,7 @@ void OMPDeclareVariantAttr::printPrettyPragma( OS << ")"; } OS << " match("; - traitInfos.print(OS, Policy); + traitInfos->print(OS, Policy); OS << ")"; } diff --git a/clang/lib/AST/ComputeDependence.cpp b/clang/lib/AST/ComputeDependence.cpp index 4ca4eacde8b77..a6ccf9aad321e 100644 --- a/clang/lib/AST/ComputeDependence.cpp +++ b/clang/lib/AST/ComputeDependence.cpp @@ -120,9 +120,9 @@ ExprDependence clang::computeDependence(BinaryConditionalOperator *E) { } ExprDependence clang::computeDependence(StmtExpr *E, unsigned TemplateDepth) { - auto D = ExprDependence::None; - if (E->getType()->isDependentType()) - D |= ExprDependence::Type; + // FIXME: why is unexpanded-pack not propagated? + auto D = toExprDependence(E->getType()->getDependence()) & + ~ExprDependence::UnexpandedPack; // Note: we treat a statement-expression in a dependent context as always // being value- and instantiation-dependent. This matches the behavior of // lambda-expressions and GCC. 
@@ -172,7 +172,7 @@ ExprDependence clang::computeDependence(VAArgExpr *E) { ExprDependence clang::computeDependence(NoInitExpr *E) { return toExprDependence(E->getType()->getDependence()) & - ExprDependence::Instantiation; + (ExprDependence::Instantiation & ExprDependence::Error); } ExprDependence clang::computeDependence(ArrayInitLoopExpr *E) { @@ -213,8 +213,8 @@ ExprDependence clang::computeDependence(CXXRewrittenBinaryOperator *E) { ExprDependence clang::computeDependence(CXXStdInitializerListExpr *E) { auto D = turnTypeToValueDependence(E->getSubExpr()->getDependence()); - if (E->getType()->isDependentType()) - D |= ExprDependence::Type; + D |= toExprDependence(E->getType()->getDependence()) & + (ExprDependence::Type | ExprDependence::Error); return D; } @@ -296,13 +296,19 @@ ExprDependence clang::computeDependence(CXXNoexceptExpr *E, CanThrowResult CT) { return D; } +ExprDependence clang::computeDependence(PackExpansionExpr *E) { + return (E->getPattern()->getDependence() & ~ExprDependence::UnexpandedPack) | + ExprDependence::TypeValueInstantiation; +} + ExprDependence clang::computeDependence(SubstNonTypeTemplateParmExpr *E) { return E->getReplacement()->getDependence(); } ExprDependence clang::computeDependence(CoroutineSuspendExpr *E) { if (auto *Resume = E->getResumeExpr()) - return (Resume->getDependence() & ExprDependence::TypeValue) | + return (Resume->getDependence() & + (ExprDependence::TypeValue | ExprDependence::Error)) | (E->getCommonExpr()->getDependence() & ~ExprDependence::TypeValue); return E->getCommonExpr()->getDependence() | ExprDependence::TypeValueInstantiation; @@ -377,6 +383,7 @@ ExprDependence clang::computeDependence(DeclRefExpr *E, const ASTContext &Ctx) { if (Decl->isParameterPack()) Deps |= ExprDependence::UnexpandedPack; + Deps |= toExprDependence(Type->getDependence()) & ExprDependence::Error; // (TD) C++ [temp.dep.expr]p3: // An id-expression is type-dependent if it contains: @@ -449,6 +456,15 @@ ExprDependence clang::computeDependence(DeclRefExpr *E, const ASTContext &Ctx) { return Deps; } +ExprDependence clang::computeDependence(RecoveryExpr *E) { + // FIXME: drop type+value+instantiation once Error is sufficient to suppress + // bogus dianostics. + auto D = ExprDependence::TypeValueInstantiation | ExprDependence::Error; + for (auto *S : E->subExpressions()) + D |= S->getDependence(); + return D; +} + ExprDependence clang::computeDependence(PredefinedExpr *E) { return toExprDependence(E->getType()->getDependence()) & ~ExprDependence::UnexpandedPack; @@ -475,7 +491,25 @@ ExprDependence clang::computeDependence(OffsetOfExpr *E) { } ExprDependence clang::computeDependence(MemberExpr *E) { - return E->getBase()->getDependence(); + auto *MemberDecl = E->getMemberDecl(); + auto D = E->getBase()->getDependence(); + if (FieldDecl *FD = dyn_cast(MemberDecl)) { + DeclContext *DC = MemberDecl->getDeclContext(); + // dyn_cast_or_null is used to handle objC variables which do not + // have a declaration context. + CXXRecordDecl *RD = dyn_cast_or_null(DC); + if (RD && RD->isDependentContext() && RD->isCurrentInstantiation(DC)) { + if (!E->getType()->isDependentType()) + D &= ~ExprDependence::Type; + } + + // Bitfield with value-dependent width is type-dependent. 
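An illustrative case (not from this patch) for the bit-field rule mentioned just above, where the member's width depends on a template parameter:

template <int N> struct S {
  int b : N;              // bit-field width is value-dependent on N
  int get() { return b; } // the MemberExpr for 'b' is therefore type-dependent
};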
+ if (FD && FD->isBitField() && FD->getBitWidth()->isValueDependent()) { + D |= ExprDependence::Type; + } + } + // FIXME: move remaining dependence computation from MemberExpr::Create() + return D; } ExprDependence clang::computeDependence(InitListExpr *E) { @@ -496,6 +530,10 @@ ExprDependence clang::computeDependence(GenericSelectionExpr *E, bool ContainsUnexpandedPack) { auto D = ContainsUnexpandedPack ? ExprDependence::UnexpandedPack : ExprDependence::None; + for (auto *AE : E->getAssocExprs()) + D |= AE->getDependence() & ExprDependence::Error; + D |= E->getControllingExpr()->getDependence() & ExprDependence::Error; + if (E->isResultDependent()) return D | ExprDependence::TypeValueInstantiation; return D | (E->getResultExpr()->getDependence() & @@ -623,7 +661,8 @@ ExprDependence clang::computeDependence(CXXUnresolvedConstructExpr *E) { if (E->getType()->getContainedDeducedType()) D |= ExprDependence::Type; for (auto *A : E->arguments()) - D |= A->getDependence() & ExprDependence::UnexpandedPack; + D |= A->getDependence() & + (ExprDependence::UnexpandedPack | ExprDependence::Error); return D; } @@ -643,6 +682,15 @@ ExprDependence clang::computeDependence(MaterializeTemporaryExpr *E) { return E->getSubExpr()->getDependence(); } +ExprDependence clang::computeDependence(CXXFoldExpr *E) { + auto D = ExprDependence::TypeValueInstantiation; + for (const auto *C : {E->getLHS(), E->getRHS()}) { + if (C) + D |= C->getDependence() & ~ExprDependence::UnexpandedPack; + } + return D; +} + ExprDependence clang::computeDependence(TypeTraitExpr *E) { auto D = ExprDependence::None; for (const auto *A : E->getArgs()) diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 97eb3c5300055..b603d2ab29eed 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -357,6 +357,8 @@ llvm::APSInt ConstantExpr::getResultAsAPSInt() const { } APValue ConstantExpr::getAPValueResult() const { + assert(hasAPValueResult()); + switch (ConstantExprBits.ResultKind) { case ConstantExpr::RSK_APValue: return APValueResult(); @@ -505,12 +507,12 @@ PredefinedExpr::PredefinedExpr(SourceLocation L, QualType FNTy, IdentKind IK, setDependence(computeDependence(this)); } -PredefinedExpr::PredefinedExpr(SourceLocation L, QualType FNTy, IdentKind IK, +PredefinedExpr::PredefinedExpr(SourceLocation L, QualType FnTy, IdentKind IK, TypeSourceInfo *Info) - : Expr(PredefinedExprClass, FNTy, VK_LValue, OK_Ordinary) { + : Expr(PredefinedExprClass, FnTy, VK_LValue, OK_Ordinary) { PredefinedExprBits.Kind = IK; assert((getIdentKind() == IK) && - "IdentKind do not fit in PredefinedExprBitfields!"); + "IdentKind do not fit in PredefinedExprBitFields!"); assert(IK == UniqueStableNameType && "Constructor only valid with UniqueStableNameType"); PredefinedExprBits.HasFunctionName = false; @@ -519,12 +521,12 @@ PredefinedExpr::PredefinedExpr(SourceLocation L, QualType FNTy, IdentKind IK, setDependence(computeDependence(this)); } -PredefinedExpr::PredefinedExpr(SourceLocation L, QualType FNTy, IdentKind IK, +PredefinedExpr::PredefinedExpr(SourceLocation L, QualType FnTy, IdentKind IK, Expr *Info) - : Expr(PredefinedExprClass, FNTy, VK_LValue, OK_Ordinary) { + : Expr(PredefinedExprClass, FnTy, VK_LValue, OK_Ordinary) { PredefinedExprBits.Kind = IK; assert((getIdentKind() == IK) && - "IdentKind do not fit in PredefinedExprBitfields!"); + "IdentKind do not fit in PredefinedExprBitFields!"); assert(IK == UniqueStableNameExpr && "Constructor only valid with UniqueStableNameExpr"); PredefinedExprBits.HasFunctionName = false; @@ -542,9 
+544,9 @@ PredefinedExpr *PredefinedExpr::Create(const ASTContext &Ctx, SourceLocation L, QualType FNTy, IdentKind IK, StringLiteral *SL) { bool HasFunctionName = SL != nullptr; - void *Mem = - Ctx.Allocate(totalSizeToAlloc(HasFunctionName), - alignof(PredefinedExpr)); + void *Mem = Ctx.Allocate( + totalSizeToAlloc(HasFunctionName, 0, 0), + alignof(PredefinedExpr)); return new (Mem) PredefinedExpr(L, FNTy, IK, SL); } @@ -552,12 +554,11 @@ PredefinedExpr *PredefinedExpr::Create(const ASTContext &Ctx, SourceLocation L, QualType FNTy, IdentKind IK, StringLiteral *SL, TypeSourceInfo *Info) { - assert(IK == UniqueStableNameType && "Wrong Type"); + assert(IK == UniqueStableNameType && "Only valid with UniqueStableNameType"); bool HasFunctionName = SL != nullptr; - void *Mem = - Ctx.Allocate(totalSizeToAlloc(1), - alignof(PredefinedExpr)); - + void *Mem = Ctx.Allocate(totalSizeToAlloc( + HasFunctionName, 0, !HasFunctionName), + alignof(PredefinedExpr)); if (HasFunctionName) return new (Mem) PredefinedExpr(L, FNTy, IK, SL); return new (Mem) PredefinedExpr(L, FNTy, IK, Info); @@ -566,11 +567,11 @@ PredefinedExpr *PredefinedExpr::Create(const ASTContext &Ctx, SourceLocation L, PredefinedExpr *PredefinedExpr::Create(const ASTContext &Ctx, SourceLocation L, QualType FNTy, IdentKind IK, StringLiteral *SL, Expr *E) { - assert(IK == UniqueStableNameExpr && "Wrong Type"); + assert(IK == UniqueStableNameExpr && "Only valid with UniqueStableNameExpr"); bool HasFunctionName = SL != nullptr; - void *Mem = - Ctx.Allocate(totalSizeToAlloc(1), - alignof(PredefinedExpr)); + void *Mem = Ctx.Allocate(totalSizeToAlloc( + HasFunctionName, !HasFunctionName, 0), + alignof(PredefinedExpr)); if (HasFunctionName) return new (Mem) PredefinedExpr(L, FNTy, IK, SL); return new (Mem) PredefinedExpr(L, FNTy, IK, E); @@ -578,8 +579,9 @@ PredefinedExpr *PredefinedExpr::Create(const ASTContext &Ctx, SourceLocation L, PredefinedExpr *PredefinedExpr::CreateEmpty(const ASTContext &Ctx, bool HasFunctionName) { - void *Mem = Ctx.Allocate(totalSizeToAlloc(HasFunctionName), - alignof(PredefinedExpr)); + void *Mem = Ctx.Allocate( + totalSizeToAlloc(HasFunctionName, 0, 0), + alignof(PredefinedExpr)); return new (Mem) PredefinedExpr(EmptyShell(), HasFunctionName); } @@ -601,7 +603,7 @@ StringRef PredefinedExpr::getIdentKindName(PredefinedExpr::IdentKind IK) { return "L__FUNCSIG__"; case UniqueStableNameType: case UniqueStableNameExpr: - return "__unique_stable_name"; + return "__builtin_unique_stable_name"; case PrettyFunctionNoVirtual: break; } @@ -610,8 +612,9 @@ StringRef PredefinedExpr::getIdentKindName(PredefinedExpr::IdentKind IK) { std::string PredefinedExpr::ComputeName(ASTContext &Context, IdentKind IK, QualType Ty) { - std::unique_ptr Ctx{ - ItaniumMangleContext::create(Context, Context.getDiagnostics(), true)}; + std::unique_ptr Ctx{ItaniumMangleContext::create( + Context, Context.getDiagnostics(), /*IsUniqueNameMangler*/ true)}; + Ty = Ty.getCanonicalType(); SmallString<256> Buffer; @@ -1569,28 +1572,15 @@ MemberExpr *MemberExpr::Create( MemberExpr *E = new (Mem) MemberExpr(Base, IsArrow, OperatorLoc, MemberDecl, NameInfo, T, VK, OK, NOUR); - if (isa(MemberDecl)) { - DeclContext *DC = MemberDecl->getDeclContext(); - // dyn_cast_or_null is used to handle objC variables which do not - // have a declaration context. 
- CXXRecordDecl *RD = dyn_cast_or_null(DC); - if (RD && RD->isDependentContext() && RD->isCurrentInstantiation(DC)) { - if (E->isTypeDependent() && !T->isDependentType()) - E->removeDependence(ExprDependence::Type); - } - // Bitfield with value-dependent width is type-dependent. - FieldDecl *FD = dyn_cast(MemberDecl); - if (FD && FD->isBitField() && FD->getBitWidth()->isValueDependent()) - E->addDependence(ExprDependence::Type); - } - + // FIXME: remove remaining dependence computation to computeDependence(). + auto Deps = E->getDependence(); if (HasQualOrFound) { // FIXME: Wrong. We should be looking at the member declaration we found. if (QualifierLoc && QualifierLoc.getNestedNameSpecifier()->isDependent()) - E->addDependence(ExprDependence::TypeValueInstantiation); + Deps |= ExprDependence::TypeValueInstantiation; else if (QualifierLoc && QualifierLoc.getNestedNameSpecifier()->isInstantiationDependent()) - E->addDependence(ExprDependence::Instantiation); + Deps |= ExprDependence::Instantiation; E->MemberExprBits.HasQualifierOrFoundDecl = true; @@ -1604,16 +1594,17 @@ MemberExpr *MemberExpr::Create( TemplateArgs || TemplateKWLoc.isValid(); if (TemplateArgs) { - auto Deps = TemplateArgumentDependence::None; + auto TemplateArgDeps = TemplateArgumentDependence::None; E->getTrailingObjects()->initializeFrom( TemplateKWLoc, *TemplateArgs, - E->getTrailingObjects(), Deps); - if (Deps & TemplateArgumentDependence::Instantiation) - E->addDependence(ExprDependence::Instantiation); + E->getTrailingObjects(), TemplateArgDeps); + if (TemplateArgDeps & TemplateArgumentDependence::Instantiation) + Deps |= ExprDependence::Instantiation; } else if (TemplateKWLoc.isValid()) { E->getTrailingObjects()->initializeFrom( TemplateKWLoc); } + E->setDependence(Deps); return E; } @@ -2457,6 +2448,7 @@ bool Expr::isUnusedResultAWarning(const Expr *&WarnE, SourceLocation &Loc, // If we don't know precisely what we're looking at, let's not warn. case UnresolvedLookupExprClass: case CXXUnresolvedConstructExprClass: + case RecoveryExprClass: return false; case CXXTemporaryObjectExprClass: @@ -2806,9 +2798,6 @@ static Expr *IgnoreParensSingleStep(Expr *E) { return CE->getChosenSubExpr(); } - else if (auto *CE = dyn_cast(E)) - return CE->getSubExpr(); - return E; } @@ -3094,6 +3083,9 @@ bool Expr::isConstantInitializer(ASTContext &Ctx, bool IsForRef, switch (getStmtClass()) { default: break; + case Stmt::ExprWithCleanupsClass: + return cast(this)->getSubExpr()->isConstantInitializer( + Ctx, IsForRef, Culprit); case StringLiteralClass: case ObjCEncodeExprClass: return true; @@ -3309,6 +3301,7 @@ bool Expr::HasSideEffects(const ASTContext &Ctx, case SubstNonTypeTemplateParmPackExprClass: case FunctionParmPackExprClass: case TypoExprClass: + case RecoveryExprClass: case CXXFoldExprClass: llvm_unreachable("shouldn't see dependent / unresolved nodes here"); @@ -4303,6 +4296,7 @@ DesignatedInitUpdateExpr::DesignatedInitUpdateExpr(const ASTContext &C, ILE->setType(baseExpr->getType()); BaseAndUpdaterExprs[1] = ILE; + // FIXME: this is wrong, set it correctly. 
setDependence(ExprDependence::None); } @@ -4548,3 +4542,30 @@ QualType OMPArraySectionExpr::getBaseOriginalType(const Expr *Base) { } return OriginalTy; } + +RecoveryExpr::RecoveryExpr(ASTContext &Ctx, SourceLocation BeginLoc, + SourceLocation EndLoc, ArrayRef SubExprs) + : Expr(RecoveryExprClass, Ctx.DependentTy, VK_LValue, OK_Ordinary), + BeginLoc(BeginLoc), EndLoc(EndLoc), NumExprs(SubExprs.size()) { +#ifndef NDEBUG + for (auto *E : SubExprs) + assert(E != nullptr); +#endif + + llvm::copy(SubExprs, getTrailingObjects()); + setDependence(computeDependence(this)); +} + +RecoveryExpr *RecoveryExpr::Create(ASTContext &Ctx, SourceLocation BeginLoc, + SourceLocation EndLoc, + ArrayRef SubExprs) { + void *Mem = Ctx.Allocate(totalSizeToAlloc(SubExprs.size()), + alignof(RecoveryExpr)); + return new (Mem) RecoveryExpr(Ctx, BeginLoc, EndLoc, SubExprs); +} + +RecoveryExpr *RecoveryExpr::CreateEmpty(ASTContext &Ctx, unsigned NumSubExprs) { + void *Mem = Ctx.Allocate(totalSizeToAlloc(NumSubExprs), + alignof(RecoveryExpr)); + return new (Mem) RecoveryExpr(EmptyShell()); +} diff --git a/clang/lib/AST/ExprClassification.cpp b/clang/lib/AST/ExprClassification.cpp index d201af31f521e..58e70347292c7 100644 --- a/clang/lib/AST/ExprClassification.cpp +++ b/clang/lib/AST/ExprClassification.cpp @@ -129,6 +129,7 @@ static Cl::Kinds ClassifyInternal(ASTContext &Ctx, const Expr *E) { case Expr::UnresolvedLookupExprClass: case Expr::UnresolvedMemberExprClass: case Expr::TypoExprClass: + case Expr::RecoveryExprClass: case Expr::DependentCoawaitExprClass: case Expr::CXXDependentScopeMemberExprClass: case Expr::DependentScopeDeclRefExprClass: diff --git a/clang/lib/AST/ExprConcepts.cpp b/clang/lib/AST/ExprConcepts.cpp index b3a4bd9215d5c..d00d8329095c0 100644 --- a/clang/lib/AST/ExprConcepts.cpp +++ b/clang/lib/AST/ExprConcepts.cpp @@ -175,13 +175,13 @@ RequiresExpr::RequiresExpr(ASTContext &C, SourceLocation RequiresKWLoc, RequiresExprBits.IsSatisfied |= Dependent; // FIXME: move the computing dependency logic to ComputeDependence.h if (ContainsUnexpandedParameterPack) - addDependence(ExprDependence::UnexpandedPack); + setDependence(getDependence() | ExprDependence::UnexpandedPack); // FIXME: this is incorrect for cases where we have a non-dependent // requirement, but its parameters are instantiation-dependent. RequiresExpr // should be instantiation-dependent if it has instantiation-dependent // parameters. 
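RecoveryExpr, constructed above, is the AST's way of preserving the pieces of an expression that failed semantic analysis; an illustrative input (not from this patch) where one can appear:

int foo(int);

int bar() {
  // Overload resolution fails, but instead of dropping the whole call the
  // arguments can be kept inside a RecoveryExpr (dependent type, marked as
  // containing errors) so that later diagnostics and tooling still see them.
  return foo("not an int", 3);
}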
if (Dependent) - addDependence(ExprDependence::ValueInstantiation); + setDependence(getDependence() | ExprDependence::ValueInstantiation); } RequiresExpr::RequiresExpr(ASTContext &C, EmptyShell Empty, diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index f5c37ad44ebd1..06f4885e47d6c 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -6776,8 +6776,13 @@ class ExprEvaluatorBase return Error(E); } - bool VisitConstantExpr(const ConstantExpr *E) - { return StmtVisitorTy::Visit(E->getSubExpr()); } + bool VisitConstantExpr(const ConstantExpr *E) { + if (E->hasAPValueResult()) + return DerivedSuccess(E->getAPValueResult(), E); + + return StmtVisitorTy::Visit(E->getSubExpr()); + } + bool VisitParenExpr(const ParenExpr *E) { return StmtVisitorTy::Visit(E->getSubExpr()); } bool VisitUnaryExtension(const UnaryOperator *E) @@ -14184,6 +14189,7 @@ static ICEDiag CheckICE(const Expr* E, const ASTContext &Ctx) { case Expr::CXXPseudoDestructorExprClass: case Expr::UnresolvedLookupExprClass: case Expr::TypoExprClass: + case Expr::RecoveryExprClass: case Expr::DependentScopeDeclRefExprClass: case Expr::CXXConstructExprClass: case Expr::CXXInheritedCtorInitExprClass: diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 5fb71b48a6538..823c085c4aea8 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -126,9 +126,6 @@ class ItaniumMangleContextImpl : public ItaniumMangleContext { llvm::DenseMap Uniquifier; public: - explicit ItaniumMangleContextImpl(ASTContext &Context, - DiagnosticsEngine &Diags) - : ItaniumMangleContext(Context, Diags) {} explicit ItaniumMangleContextImpl(ASTContext &Context, DiagnosticsEngine &Diags, bool IsUniqueNameMangler) @@ -1773,10 +1770,10 @@ void CXXNameMangler::mangleTemplateParamDecl(const NamedDecl *Decl) { } } -// Handles the __unique_stable_name feature for lambdas. Instead of the ordinal -// of the lambda in its function, this does line/column to uniquely and reliably -// identify the lambda. Additionally, Macro expansions are expanded as well to -// prevent macros causing duplicates. +// Handles the __builtin_unique_stable_name feature for lambdas. Instead of the +// ordinal of the lambda in its mangling, this does line/column to uniquely and +// reliably identify the lambda. Additionally, macro expansions are expressed +// as well to prevent macros causing duplicates. 
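For the __builtin_unique_stable_name handling touched above, an illustrative use on a lambda (SYCL-oriented example code, not from this patch; the builtin is assumed here to yield a string usable as const char*, like other predefined names):

void use() {
  auto Kernel = [](int X) { return X + 1; };
  // With the unique-name mangler, the string encodes the lambda's line/column
  // (and enclosing macro expansions) rather than its ordinal in the function.
  const char *Name = __builtin_unique_stable_name(Kernel);
  (void)Name;
}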
static void mangleUniqueNameLambda(CXXNameMangler &Mangler, SourceManager &SM, raw_ostream &Out, const CXXRecordDecl *Lambda) { @@ -1787,20 +1784,20 @@ static void mangleUniqueNameLambda(CXXNameMangler &Mangler, SourceManager &SM, Out << "->"; Mangler.mangleNumber(PLoc.getColumn()); - while (Loc.isMacroID()) { - SourceLocation ToPrint = Loc; + while(Loc.isMacroID()) { + SourceLocation SLToPrint = Loc; if (SM.isMacroArgExpansion(Loc)) - ToPrint = SM.getImmediateExpansionRange(Loc).getBegin(); + SLToPrint = SM.getImmediateExpansionRange(Loc).getBegin(); - Loc = SM.getImmediateMacroCallerLoc(Loc); - if (Loc.isFileID()) - Loc = SM.getImmediateMacroCallerLoc(ToPrint); - - PresumedLoc PLoc = SM.getPresumedLoc(SM.getSpellingLoc(ToPrint)); - Out << '~'; + PLoc = SM.getPresumedLoc(SM.getSpellingLoc(SLToPrint)); + Out << "~"; Mangler.mangleNumber(PLoc.getLine()); Out << "->"; Mangler.mangleNumber(PLoc.getColumn()); + + Loc = SM.getImmediateMacroCallerLoc(Loc); + if (Loc.isFileID()) + Loc = SM.getImmediateMacroCallerLoc(SLToPrint); } } @@ -1835,8 +1832,8 @@ void CXXNameMangler::mangleLambda(const CXXRecordDecl *Lambda) { Out << "E"; if (Context.isUniqueNameMangler()) { - mangleUniqueNameLambda(*this, Context.getASTContext().getSourceManager(), - Out, Lambda); + mangleUniqueNameLambda( + *this, Context.getASTContext().getSourceManager(), Out, Lambda); return; } @@ -3714,7 +3711,8 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity) { case Expr::LambdaExprClass: case Expr::MSPropertyRefExprClass: case Expr::MSPropertySubscriptExprClass: - case Expr::TypoExprClass: // This should no longer exist in the AST by now. + case Expr::TypoExprClass: // This should no longer exist in the AST by now. + case Expr::RecoveryExprClass: case Expr::OMPArraySectionExprClass: case Expr::CXXInheritedCtorInitExprClass: llvm_unreachable("unexpected statement kind"); @@ -5244,11 +5242,6 @@ void ItaniumMangleContextImpl::mangleLambdaSig(const CXXRecordDecl *Lambda, Mangler.mangleLambdaSig(Lambda); } -ItaniumMangleContext * -ItaniumMangleContext::create(ASTContext &Context, DiagnosticsEngine &Diags) { - return new ItaniumMangleContextImpl(Context, Diags); -} - ItaniumMangleContext *ItaniumMangleContext::create(ASTContext &Context, DiagnosticsEngine &Diags, bool IsUniqueNameMangler) { diff --git a/clang/lib/AST/OpenMPClause.cpp b/clang/lib/AST/OpenMPClause.cpp index a957890670562..fc7912d6fdcac 100644 --- a/clang/lib/AST/OpenMPClause.cpp +++ b/clang/lib/AST/OpenMPClause.cpp @@ -146,6 +146,8 @@ const OMPClauseWithPreInit *OMPClauseWithPreInit::get(const OMPClause *C) { case OMPC_order: case OMPC_destroy: case OMPC_detach: + case OMPC_inclusive: + case OMPC_exclusive: break; } @@ -233,6 +235,8 @@ const OMPClauseWithPostUpdate *OMPClauseWithPostUpdate::get(const OMPClause *C) case OMPC_order: case OMPC_destroy: case OMPC_detach: + case OMPC_inclusive: + case OMPC_exclusive: break; } @@ -699,14 +703,16 @@ void OMPReductionClause::setReductionOps(ArrayRef ReductionOps) { OMPReductionClause *OMPReductionClause::Create( const ASTContext &C, SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation EndLoc, SourceLocation ColonLoc, ArrayRef VL, + SourceLocation ModifierLoc, SourceLocation EndLoc, SourceLocation ColonLoc, + OpenMPReductionClauseModifier Modifier, ArrayRef VL, NestedNameSpecifierLoc QualifierLoc, const DeclarationNameInfo &NameInfo, ArrayRef Privates, ArrayRef LHSExprs, ArrayRef RHSExprs, ArrayRef ReductionOps, Stmt *PreInit, Expr *PostUpdate) { void *Mem = C.Allocate(totalSizeToAlloc(5 * 
VL.size())); - OMPReductionClause *Clause = new (Mem) OMPReductionClause( - StartLoc, LParenLoc, EndLoc, ColonLoc, VL.size(), QualifierLoc, NameInfo); + auto *Clause = new (Mem) + OMPReductionClause(StartLoc, LParenLoc, ModifierLoc, EndLoc, ColonLoc, + Modifier, VL.size(), QualifierLoc, NameInfo); Clause->setVarRefs(VL); Clause->setPrivates(Privates); Clause->setLHSExprs(LHSExprs); @@ -1248,6 +1254,42 @@ void OMPNontemporalClause::setPrivateRefs(ArrayRef VL) { std::copy(VL.begin(), VL.end(), varlist_end()); } +OMPInclusiveClause *OMPInclusiveClause::Create(const ASTContext &C, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc, + ArrayRef VL) { + void *Mem = C.Allocate(totalSizeToAlloc(VL.size())); + auto *Clause = + new (Mem) OMPInclusiveClause(StartLoc, LParenLoc, EndLoc, VL.size()); + Clause->setVarRefs(VL); + return Clause; +} + +OMPInclusiveClause *OMPInclusiveClause::CreateEmpty(const ASTContext &C, + unsigned N) { + void *Mem = C.Allocate(totalSizeToAlloc(N)); + return new (Mem) OMPInclusiveClause(N); +} + +OMPExclusiveClause *OMPExclusiveClause::Create(const ASTContext &C, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc, + ArrayRef VL) { + void *Mem = C.Allocate(totalSizeToAlloc(VL.size())); + auto *Clause = + new (Mem) OMPExclusiveClause(StartLoc, LParenLoc, EndLoc, VL.size()); + Clause->setVarRefs(VL); + return Clause; +} + +OMPExclusiveClause *OMPExclusiveClause::CreateEmpty(const ASTContext &C, + unsigned N) { + void *Mem = C.Allocate(totalSizeToAlloc(N)); + return new (Mem) OMPExclusiveClause(N); +} + //===----------------------------------------------------------------------===// // OpenMP clauses printing methods //===----------------------------------------------------------------------===// @@ -1555,6 +1597,9 @@ void OMPClausePrinter::VisitOMPSharedClause(OMPSharedClause *Node) { void OMPClausePrinter::VisitOMPReductionClause(OMPReductionClause *Node) { if (!Node->varlist_empty()) { OS << "reduction("; + if (Node->getModifierLoc().isValid()) + OS << getOpenMPSimpleClauseTypeName(OMPC_reduction, Node->getModifier()) + << ", "; NestedNameSpecifier *QualifierLoc = Node->getQualifierLoc().getNestedNameSpecifier(); OverloadedOperatorKind OOK = @@ -1805,6 +1850,22 @@ void OMPClausePrinter::VisitOMPOrderClause(OMPOrderClause *Node) { << ")"; } +void OMPClausePrinter::VisitOMPInclusiveClause(OMPInclusiveClause *Node) { + if (!Node->varlist_empty()) { + OS << "inclusive"; + VisitOMPClauseList(Node, '('); + OS << ")"; + } +} + +void OMPClausePrinter::VisitOMPExclusiveClause(OMPExclusiveClause *Node) { + if (!Node->varlist_empty()) { + OS << "exclusive"; + VisitOMPClauseList(Node, '('); + OS << ")"; + } +} + void OMPTraitInfo::getAsVariantMatchInfo( ASTContext &ASTCtx, llvm::omp::VariantMatchInfo &VMI) const { for (const OMPTraitSet &Set : Sets) { @@ -1908,3 +1969,7 @@ llvm::raw_ostream &clang::operator<<(llvm::raw_ostream &OS, TI.print(OS, Policy); return OS; } +llvm::raw_ostream &clang::operator<<(llvm::raw_ostream &OS, + const OMPTraitInfo *TI) { + return TI ? 
OS << *TI : OS; +} diff --git a/clang/lib/AST/StmtOpenMP.cpp b/clang/lib/AST/StmtOpenMP.cpp index 153d492598d3f..995f710876af5 100644 --- a/clang/lib/AST/StmtOpenMP.cpp +++ b/clang/lib/AST/StmtOpenMP.cpp @@ -781,6 +781,27 @@ OMPDepobjDirective *OMPDepobjDirective::CreateEmpty(const ASTContext &C, return new (Mem) OMPDepobjDirective(NumClauses); } +OMPScanDirective *OMPScanDirective::Create(const ASTContext &C, + SourceLocation StartLoc, + SourceLocation EndLoc, + ArrayRef Clauses) { + unsigned Size = llvm::alignTo(sizeof(OMPScanDirective), alignof(OMPClause *)); + void *Mem = C.Allocate(Size + sizeof(OMPClause *) * Clauses.size(), + alignof(OMPScanDirective)); + auto *Dir = new (Mem) OMPScanDirective(StartLoc, EndLoc, Clauses.size()); + Dir->setClauses(Clauses); + return Dir; +} + +OMPScanDirective *OMPScanDirective::CreateEmpty(const ASTContext &C, + unsigned NumClauses, + EmptyShell) { + unsigned Size = llvm::alignTo(sizeof(OMPScanDirective), alignof(OMPClause *)); + void *Mem = C.Allocate(Size + sizeof(OMPClause *) * NumClauses, + alignof(OMPScanDirective)); + return new (Mem) OMPScanDirective(NumClauses); +} + OMPOrderedDirective *OMPOrderedDirective::Create(const ASTContext &C, SourceLocation StartLoc, SourceLocation EndLoc, diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index f7a97c2743c19..80fdc09a8a6cf 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -758,6 +758,11 @@ void StmtPrinter::VisitOMPDepobjDirective(OMPDepobjDirective *Node) { PrintOMPExecutableDirective(Node); } +void StmtPrinter::VisitOMPScanDirective(OMPScanDirective *Node) { + Indent() << "#pragma omp scan"; + PrintOMPExecutableDirective(Node); +} + void StmtPrinter::VisitOMPOrderedDirective(OMPOrderedDirective *Node) { Indent() << "#pragma omp ordered"; PrintOMPExecutableDirective(Node, Node->hasClausesOfKind()); @@ -2501,6 +2506,17 @@ void StmtPrinter::VisitTypoExpr(TypoExpr *Node) { llvm_unreachable("Cannot print TypoExpr nodes"); } +void StmtPrinter::VisitRecoveryExpr(RecoveryExpr *Node) { + OS << "("; + const char *Sep = ""; + for (Expr *E : Node->subExpressions()) { + OS << Sep; + PrintExpr(E); + Sep = ", "; + } + OS << ')'; +} + void StmtPrinter::VisitAsTypeExpr(AsTypeExpr *Node) { OS << "__builtin_astype("; PrintExpr(Node->getSrcExpr()); diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index 8d43491ea8f4c..5e87eb3e237c3 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -795,6 +795,12 @@ void OMPClauseProfiler::VisitOMPNontemporalClause( for (auto *E : C->private_refs()) Profiler->VisitStmt(E); } +void OMPClauseProfiler::VisitOMPInclusiveClause(const OMPInclusiveClause *C) { + VisitOMPClauseList(C); +} +void OMPClauseProfiler::VisitOMPExclusiveClause(const OMPExclusiveClause *C) { + VisitOMPClauseList(C); +} void OMPClauseProfiler::VisitOMPOrderClause(const OMPOrderClause *C) {} } // namespace @@ -900,6 +906,10 @@ void StmtProfiler::VisitOMPDepobjDirective(const OMPDepobjDirective *S) { VisitOMPExecutableDirective(S); } +void StmtProfiler::VisitOMPScanDirective(const OMPScanDirective *S) { + VisitOMPExecutableDirective(S); +} + void StmtProfiler::VisitOMPOrderedDirective(const OMPOrderedDirective *S) { VisitOMPExecutableDirective(S); } @@ -2015,6 +2025,8 @@ void StmtProfiler::VisitSourceLocExpr(const SourceLocExpr *E) { VisitExpr(E); } +void StmtProfiler::VisitRecoveryExpr(const RecoveryExpr *E) { VisitExpr(E); } + void StmtProfiler::VisitObjCStringLiteral(const ObjCStringLiteral *S) { 
VisitExpr(S); } diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 0b86dbb873475..6a6d8692228af 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -126,6 +126,11 @@ void TextNodeDumper::Visit(const Stmt *Node) { if (const auto *E = dyn_cast(Node)) { dumpType(E->getType()); + if (E->containsErrors()) { + ColorScope Color(OS, ShowColors, ErrorsColor); + OS << " contains-errors"; + } + { ColorScope Color(OS, ShowColors, ValueKindColor); switch (E->getValueKind()) { diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 6e1c70f952629..69c942e46f729 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -20,6 +20,7 @@ #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclObjC.h" #include "clang/AST/DeclTemplate.h" +#include "clang/AST/DependenceFlags.h" #include "clang/AST/Expr.h" #include "clang/AST/NestedNameSpecifier.h" #include "clang/AST/NonTrivialTypeVisitor.h" @@ -123,14 +124,15 @@ ArrayType::ArrayType(TypeClass tc, QualType et, QualType can, // // template int arr[] = {N...}; : Type(tc, can, - et->isDependentType() || (sz && sz->isValueDependent()) || - tc == DependentSizedArray, - et->isInstantiationDependentType() || - (sz && sz->isInstantiationDependent()) || - tc == DependentSizedArray, - (tc == VariableArray || et->isVariablyModifiedType()), - et->containsUnexpandedParameterPack() || - (sz && sz->containsUnexpandedParameterPack())), + et->getDependence() | + (sz ? toTypeDependence( + turnValueToTypeDependence(sz->getDependence())) + : TypeDependence::None) | + (tc == VariableArray ? TypeDependence::VariablyModified + : TypeDependence::None) | + (tc == DependentSizedArray + ? TypeDependence::DependentInstantiation + : TypeDependence::None)), ElementType(et) { ArrayTypeBits.IndexTypeQuals = tq; ArrayTypeBits.SizeModifier = sm; @@ -217,14 +219,16 @@ void DependentSizedArrayType::Profile(llvm::FoldingSetNodeID &ID, E->Profile(ID, Context, true); } -DependentVectorType::DependentVectorType( - const ASTContext &Context, QualType ElementType, QualType CanonType, - Expr *SizeExpr, SourceLocation Loc, VectorType::VectorKind VecKind) - : Type(DependentVector, CanonType, /*Dependent=*/true, - /*InstantiationDependent=*/true, - ElementType->isVariablyModifiedType(), - ElementType->containsUnexpandedParameterPack() || - (SizeExpr && SizeExpr->containsUnexpandedParameterPack())), +DependentVectorType::DependentVectorType(const ASTContext &Context, + QualType ElementType, + QualType CanonType, Expr *SizeExpr, + SourceLocation Loc, + VectorType::VectorKind VecKind) + : Type(DependentVector, CanonType, + TypeDependence::DependentInstantiation | + ElementType->getDependence() | + (SizeExpr ? 
toTypeDependence(SizeExpr->getDependence()) + : TypeDependence::None)), Context(Context), ElementType(ElementType), SizeExpr(SizeExpr), Loc(Loc) { VectorTypeBits.VecKind = VecKind; } @@ -238,19 +242,16 @@ void DependentVectorType::Profile(llvm::FoldingSetNodeID &ID, SizeExpr->Profile(ID, Context, true); } -DependentSizedExtVectorType::DependentSizedExtVectorType(const - ASTContext &Context, - QualType ElementType, - QualType can, - Expr *SizeExpr, - SourceLocation loc) - : Type(DependentSizedExtVector, can, /*Dependent=*/true, - /*InstantiationDependent=*/true, - ElementType->isVariablyModifiedType(), - (ElementType->containsUnexpandedParameterPack() || - (SizeExpr && SizeExpr->containsUnexpandedParameterPack()))), - Context(Context), SizeExpr(SizeExpr), ElementType(ElementType), - loc(loc) {} +DependentSizedExtVectorType::DependentSizedExtVectorType( + const ASTContext &Context, QualType ElementType, QualType can, + Expr *SizeExpr, SourceLocation loc) + : Type(DependentSizedExtVector, can, + TypeDependence::DependentInstantiation | + ElementType->getDependence() | + (SizeExpr ? toTypeDependence(SizeExpr->getDependence()) + : TypeDependence::None)), + Context(Context), SizeExpr(SizeExpr), ElementType(ElementType), loc(loc) { +} void DependentSizedExtVectorType::Profile(llvm::FoldingSetNodeID &ID, @@ -260,15 +261,16 @@ DependentSizedExtVectorType::Profile(llvm::FoldingSetNodeID &ID, SizeExpr->Profile(ID, Context, true); } -DependentAddressSpaceType::DependentAddressSpaceType( - const ASTContext &Context, QualType PointeeType, QualType can, - Expr *AddrSpaceExpr, SourceLocation loc) - : Type(DependentAddressSpace, can, /*Dependent=*/true, - /*InstantiationDependent=*/true, - PointeeType->isVariablyModifiedType(), - (PointeeType->containsUnexpandedParameterPack() || - (AddrSpaceExpr && - AddrSpaceExpr->containsUnexpandedParameterPack()))), +DependentAddressSpaceType::DependentAddressSpaceType(const ASTContext &Context, + QualType PointeeType, + QualType can, + Expr *AddrSpaceExpr, + SourceLocation loc) + : Type(DependentAddressSpace, can, + TypeDependence::DependentInstantiation | + PointeeType->getDependence() | + (AddrSpaceExpr ? 
toTypeDependence(AddrSpaceExpr->getDependence()) + : TypeDependence::None)), Context(Context), AddrSpaceExpr(AddrSpaceExpr), PointeeType(PointeeType), loc(loc) {} @@ -286,11 +288,7 @@ VectorType::VectorType(QualType vecType, unsigned nElements, QualType canonType, VectorType::VectorType(TypeClass tc, QualType vecType, unsigned nElements, QualType canonType, VectorKind vecKind) - : Type(tc, canonType, vecType->isDependentType(), - vecType->isInstantiationDependentType(), - vecType->isVariablyModifiedType(), - vecType->containsUnexpandedParameterPack()), - ElementType(vecType) { + : Type(tc, canonType, vecType->getDependence()), ElementType(vecType) { VectorTypeBits.VecKind = vecKind; VectorTypeBits.NumElements = nElements; } @@ -652,14 +650,11 @@ bool Type::isObjCClassOrClassKindOfType() const { return OPT->isObjCClassType() || OPT->isObjCQualifiedClassType(); } -ObjCTypeParamType::ObjCTypeParamType(const ObjCTypeParamDecl *D, - QualType can, +ObjCTypeParamType::ObjCTypeParamType(const ObjCTypeParamDecl *D, QualType can, ArrayRef protocols) - : Type(ObjCTypeParam, can, can->isDependentType(), - can->isInstantiationDependentType(), - can->isVariablyModifiedType(), - /*ContainsUnexpandedParameterPack=*/false), - OTPDecl(const_cast(D)) { + : Type(ObjCTypeParam, can, + can->getDependence() & ~TypeDependence::UnexpandedPack), + OTPDecl(const_cast(D)) { initialize(protocols); } @@ -667,11 +662,7 @@ ObjCObjectType::ObjCObjectType(QualType Canonical, QualType Base, ArrayRef typeArgs, ArrayRef protocols, bool isKindOf) - : Type(ObjCObject, Canonical, Base->isDependentType(), - Base->isInstantiationDependentType(), - Base->isVariablyModifiedType(), - Base->containsUnexpandedParameterPack()), - BaseType(Base) { + : Type(ObjCObject, Canonical, Base->getDependence()), BaseType(Base) { ObjCObjectTypeBits.IsKindOf = isKindOf; ObjCObjectTypeBits.NumTypeArgs = typeArgs.size(); @@ -682,13 +673,7 @@ ObjCObjectType::ObjCObjectType(QualType Canonical, QualType Base, typeArgs.size() * sizeof(QualType)); for (auto typeArg : typeArgs) { - if (typeArg->isDependentType()) - setDependent(); - else if (typeArg->isInstantiationDependentType()) - setInstantiationDependent(); - - if (typeArg->containsUnexpandedParameterPack()) - setContainsUnexpandedParameterPack(); + addDependence(typeArg->getDependence() & ~TypeDependence::VariablyModified); } // Initialize the protocol qualifiers. The protocol storage is known // after we set number of type arguments. @@ -2715,21 +2700,20 @@ StringRef TypeWithKeyword::getKeywordName(ElaboratedTypeKeyword Keyword) { } DependentTemplateSpecializationType::DependentTemplateSpecializationType( - ElaboratedTypeKeyword Keyword, - NestedNameSpecifier *NNS, const IdentifierInfo *Name, - ArrayRef Args, - QualType Canon) - : TypeWithKeyword(Keyword, DependentTemplateSpecialization, Canon, true, true, - /*VariablyModified=*/false, - NNS && NNS->containsUnexpandedParameterPack()), - NNS(NNS), Name(Name) { + ElaboratedTypeKeyword Keyword, NestedNameSpecifier *NNS, + const IdentifierInfo *Name, ArrayRef Args, QualType Canon) + : TypeWithKeyword(Keyword, DependentTemplateSpecialization, Canon, + TypeDependence::DependentInstantiation | + (NNS ? 
toTypeDependence(NNS->getDependence()) + : TypeDependence::None)), + NNS(NNS), Name(Name) { DependentTemplateSpecializationTypeBits.NumArgs = Args.size(); assert((!NNS || NNS->isDependent()) && "DependentTemplateSpecializatonType requires dependent qualifier"); TemplateArgument *ArgBuffer = getArgBuffer(); for (const TemplateArgument &Arg : Args) { - if (Arg.containsUnexpandedParameterPack()) - setContainsUnexpandedParameterPack(); + addDependence(toTypeDependence(Arg.getDependence() & + TemplateArgumentDependence::UnexpandedPack)); new (ArgBuffer++) TemplateArgument(Arg); } @@ -2972,10 +2956,8 @@ StringRef FunctionType::getNameForCallConv(CallingConv CC) { FunctionProtoType::FunctionProtoType(QualType result, ArrayRef params, QualType canonical, const ExtProtoInfo &epi) - : FunctionType(FunctionProto, result, canonical, result->isDependentType(), - result->isInstantiationDependentType(), - result->isVariablyModifiedType(), - result->containsUnexpandedParameterPack(), epi.ExtInfo) { + : FunctionType(FunctionProto, result, canonical, result->getDependence(), + epi.ExtInfo) { FunctionTypeBits.FastTypeQuals = epi.TypeQuals.getFastQualifiers(); FunctionTypeBits.RefQualifier = epi.RefQualifier; FunctionTypeBits.NumParams = params.size(); @@ -2994,14 +2976,8 @@ FunctionProtoType::FunctionProtoType(QualType result, ArrayRef params, // Fill in the trailing argument array. auto *argSlot = getTrailingObjects(); for (unsigned i = 0; i != getNumParams(); ++i) { - if (params[i]->isDependentType()) - setDependent(); - else if (params[i]->isInstantiationDependentType()) - setInstantiationDependent(); - - if (params[i]->containsUnexpandedParameterPack()) - setContainsUnexpandedParameterPack(); - + addDependence(params[i]->getDependence() & + ~TypeDependence::VariablyModified); argSlot[i] = params[i]; } @@ -3015,11 +2991,9 @@ FunctionProtoType::FunctionProtoType(QualType result, ArrayRef params, // Note that, before C++17, a dependent exception specification does // *not* make a type dependent; it's not even part of the C++ type // system. - if (ExceptionType->isInstantiationDependentType()) - setInstantiationDependent(); - - if (ExceptionType->containsUnexpandedParameterPack()) - setContainsUnexpandedParameterPack(); + addDependence( + ExceptionType->getDependence() & + (TypeDependence::Instantiation | TypeDependence::UnexpandedPack)); exnSlot[I++] = ExceptionType; } @@ -3033,12 +3007,9 @@ FunctionProtoType::FunctionProtoType(QualType result, ArrayRef params, // Store the noexcept expression and context. *getTrailingObjects() = epi.ExceptionSpec.NoexceptExpr; - if (epi.ExceptionSpec.NoexceptExpr->isValueDependent() || - epi.ExceptionSpec.NoexceptExpr->isInstantiationDependent()) - setInstantiationDependent(); - - if (epi.ExceptionSpec.NoexceptExpr->containsUnexpandedParameterPack()) - setContainsUnexpandedParameterPack(); + addDependence( + toTypeDependence(epi.ExceptionSpec.NoexceptExpr->getDependence()) & + (TypeDependence::Instantiation | TypeDependence::UnexpandedPack)); } // Fill in the FunctionDecl * in the exception specification if present. 
else if (getExceptionSpecType() == EST_Uninstantiated) { @@ -3062,11 +3033,11 @@ FunctionProtoType::FunctionProtoType(QualType result, ArrayRef params, if (getExceptionSpecType() == EST_Dynamic || getExceptionSpecType() == EST_DependentNoexcept) { assert(hasDependentExceptionSpec() && "type should not be canonical"); - setDependent(); + addDependence(TypeDependence::DependentInstantiation); } } else if (getCanonicalTypeInternal()->isDependentType()) { // Ask our canonical type whether our exception specification was dependent. - setDependent(); + addDependence(TypeDependence::DependentInstantiation); } // Fill in the extra parameter info if present. @@ -3229,10 +3200,10 @@ QualType MacroQualifiedType::getModifiedType() const { } TypeOfExprType::TypeOfExprType(Expr *E, QualType can) - : Type(TypeOfExpr, can, E->isTypeDependent(), - E->isInstantiationDependent(), - E->getType()->isVariablyModifiedType(), - E->containsUnexpandedParameterPack()), + : Type(TypeOfExpr, can, + toTypeDependence(E->getDependence()) | + (E->getType()->getDependence() & + TypeDependence::VariablyModified)), TOExpr(E) {} bool TypeOfExprType::isSugared() const { @@ -3252,13 +3223,15 @@ void DependentTypeOfExprType::Profile(llvm::FoldingSetNodeID &ID, } DecltypeType::DecltypeType(Expr *E, QualType underlyingType, QualType can) - // C++11 [temp.type]p2: "If an expression e involves a template parameter, - // decltype(e) denotes a unique dependent type." Hence a decltype type is - // type-dependent even if its expression is only instantiation-dependent. - : Type(Decltype, can, E->isInstantiationDependent(), - E->isInstantiationDependent(), - E->getType()->isVariablyModifiedType(), - E->containsUnexpandedParameterPack()), + // C++11 [temp.type]p2: "If an expression e involves a template parameter, + // decltype(e) denotes a unique dependent type." Hence a decltype type is + // type-dependent even if its expression is only instantiation-dependent. + : Type(Decltype, can, + toTypeDependence(E->getDependence()) | + (E->isInstantiationDependent() ? TypeDependence::Dependent + : TypeDependence::None) | + (E->getType()->getDependence() & + TypeDependence::VariablyModified)), E(E), UnderlyingType(underlyingType) {} bool DecltypeType::isSugared() const { return !E->isInstantiationDependent(); } @@ -3279,13 +3252,9 @@ void DependentDecltypeType::Profile(llvm::FoldingSetNodeID &ID, } UnaryTransformType::UnaryTransformType(QualType BaseType, - QualType UnderlyingType, - UTTKind UKind, + QualType UnderlyingType, UTTKind UKind, QualType CanonicalType) - : Type(UnaryTransform, CanonicalType, BaseType->isDependentType(), - BaseType->isInstantiationDependentType(), - BaseType->isVariablyModifiedType(), - BaseType->containsUnexpandedParameterPack()), + : Type(UnaryTransform, CanonicalType, BaseType->getDependence()), BaseType(BaseType), UnderlyingType(UnderlyingType), UKind(UKind) {} DependentUnaryTransformType::DependentUnaryTransformType(const ASTContext &C, @@ -3294,11 +3263,10 @@ DependentUnaryTransformType::DependentUnaryTransformType(const ASTContext &C, : UnaryTransformType(BaseType, C.DependentTy, UKind, QualType()) {} TagType::TagType(TypeClass TC, const TagDecl *D, QualType can) - : Type(TC, can, D->isDependentType(), - /*InstantiationDependent=*/D->isDependentType(), - /*VariablyModified=*/false, - /*ContainsUnexpandedParameterPack=*/false), - decl(const_cast(D)) {} + : Type(TC, can, + D->isDependentType() ? 
TypeDependence::DependentInstantiation + : TypeDependence::None), + decl(const_cast(D)) {} static TagDecl *getInterestingTagDecl(TagDecl *decl) { for (auto I : decl->redecls()) { @@ -3407,11 +3375,12 @@ IdentifierInfo *TemplateTypeParmType::getIdentifier() const { return isCanonicalUnqualified() ? nullptr : getDecl()->getIdentifier(); } -SubstTemplateTypeParmPackType:: -SubstTemplateTypeParmPackType(const TemplateTypeParmType *Param, - QualType Canon, - const TemplateArgument &ArgPack) - : Type(SubstTemplateTypeParmPack, Canon, true, true, false, true), +SubstTemplateTypeParmPackType::SubstTemplateTypeParmPackType( + const TemplateTypeParmType *Param, QualType Canon, + const TemplateArgument &ArgPack) + : Type(SubstTemplateTypeParmPack, Canon, + TypeDependence::DependentInstantiation | + TypeDependence::UnexpandedPack), Replaced(Param), Arguments(ArgPack.pack_begin()) { SubstTemplateTypeParmPackTypeBits.NumArgs = ArgPack.pack_size(); } @@ -3455,16 +3424,17 @@ anyDependentTemplateArguments(ArrayRef Args, return false; } -TemplateSpecializationType:: -TemplateSpecializationType(TemplateName T, - ArrayRef Args, - QualType Canon, QualType AliasedType) - : Type(TemplateSpecialization, - Canon.isNull()? QualType(this, 0) : Canon, - Canon.isNull()? true : Canon->isDependentType(), - Canon.isNull()? true : Canon->isInstantiationDependentType(), - false, - T.containsUnexpandedParameterPack()), Template(T) { +TemplateSpecializationType::TemplateSpecializationType( + TemplateName T, ArrayRef Args, QualType Canon, + QualType AliasedType) + : Type(TemplateSpecialization, Canon.isNull() ? QualType(this, 0) : Canon, + (Canon.isNull() + ? TypeDependence::DependentInstantiation + : Canon->getDependence() & ~(TypeDependence::VariablyModified | + TypeDependence::UnexpandedPack)) | + (toTypeDependence(T.getDependence()) & + TypeDependence::UnexpandedPack)), + Template(T) { TemplateSpecializationTypeBits.NumArgs = Args.size(); TemplateSpecializationTypeBits.TypeAlias = !AliasedType.isNull(); @@ -3485,13 +3455,11 @@ TemplateSpecializationType(TemplateName T, // U is always non-dependent, irrespective of the type T. // However, U contains an unexpanded parameter pack, even though // its expansion (and thus its desugared type) doesn't. 
- if (Arg.isInstantiationDependent()) - setInstantiationDependent(); - if (Arg.getKind() == TemplateArgument::Type && - Arg.getAsType()->isVariablyModifiedType()) - setVariablyModified(); - if (Arg.containsUnexpandedParameterPack()) - setContainsUnexpandedParameterPack(); + addDependence(toTypeDependence(Arg.getDependence()) & + ~TypeDependence::Dependent); + if (Arg.getKind() == TemplateArgument::Type) + addDependence(Arg.getAsType()->getDependence() & + TypeDependence::VariablyModified); new (TemplateArgs++) TemplateArgument(Arg); } @@ -4178,19 +4146,18 @@ void clang::FixedPointValueToString(SmallVectorImpl &Str, } AutoType::AutoType(QualType DeducedAsType, AutoTypeKeyword Keyword, - bool IsDeducedAsDependent, bool IsDeducedAsPack, + TypeDependence ExtraDependence, ConceptDecl *TypeConstraintConcept, ArrayRef TypeConstraintArgs) - : DeducedType(Auto, DeducedAsType, IsDeducedAsDependent, - IsDeducedAsDependent, IsDeducedAsPack) { + : DeducedType(Auto, DeducedAsType, ExtraDependence) { AutoTypeBits.Keyword = (unsigned)Keyword; AutoTypeBits.NumArgs = TypeConstraintArgs.size(); this->TypeConstraintConcept = TypeConstraintConcept; if (TypeConstraintConcept) { TemplateArgument *ArgBuffer = getArgBuffer(); for (const TemplateArgument &Arg : TypeConstraintArgs) { - if (Arg.containsUnexpandedParameterPack()) - setContainsUnexpandedParameterPack(); + addDependence(toTypeDependence( + Arg.getDependence() & TemplateArgumentDependence::UnexpandedPack)); new (ArgBuffer++) TemplateArgument(Arg); } diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 1e8e814f2d906..28244fea55b24 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -917,6 +917,8 @@ void TypePrinter::printFunctionAfter(const FunctionType::ExtInfo &Info, if (Info.getNoReturn()) OS << " __attribute__((noreturn))"; + if (Info.getCmseNSCall()) + OS << " __attribute__((cmse_nonsecure_call))"; if (Info.getProducesResult()) OS << " __attribute__((ns_returns_retained))"; if (Info.getRegParm()) @@ -1536,6 +1538,7 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, case attr::SPtr: case attr::UPtr: case attr::AddressSpace: + case attr::CmseNSCall: llvm_unreachable("This attribute should have been handled already"); case attr::NSReturnsRetained: diff --git a/clang/lib/Analysis/CallGraph.cpp b/clang/lib/Analysis/CallGraph.cpp index 71c38ade13ee8..2299ba32db501 100644 --- a/clang/lib/Analysis/CallGraph.cpp +++ b/clang/lib/Analysis/CallGraph.cpp @@ -67,7 +67,7 @@ class CGBuilder : public StmtVisitor { } void addCalledDecl(Decl *D, Expr *CallExpr) { - if (G->includeInGraph(D)) { + if (G->includeCalleeInGraph(D)) { CallGraphNode *CalleeNode = G->getOrInsertNode(D); CallerNode->addCallee({CalleeNode, CallExpr}); } @@ -165,6 +165,10 @@ bool CallGraph::includeInGraph(const Decl *D) { if (!D->hasBody()) return false; + return includeCalleeInGraph(D); +} + +bool CallGraph::includeCalleeInGraph(const Decl *D) { if (const FunctionDecl *FD = dyn_cast(D)) { // We skip function template definitions, as their semantics is // only determined when they are instantiated. 
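The Type.cpp hunks above all make the same move: the four separate boolean constructor arguments (dependent, instantiation-dependent, variably modified, contains-unexpanded-pack) are replaced by a single TypeDependence bit-flag value that is composed with `|` and filtered with `&`/`~`. A minimal sketch of that pattern follows; the flag names and helper are illustrative stand-ins, not the exact enumerators from clang/AST/DependenceFlags.h, and the real code first converts an expression's dependence with toTypeDependence, which this single-enum sketch skips.

```cpp
#include <cstdint>
#include <cstdio>

// Illustrative stand-in for clang's TypeDependence bit-flags; the real
// enumerators and their values differ in detail.
enum class Dependence : uint8_t {
  None             = 0,
  UnexpandedPack   = 1 << 0,
  Instantiation    = 1 << 1,
  Dependent        = 1 << 2,
  VariablyModified = 1 << 3,
  // A dependent type is always instantiation-dependent as well.
  DependentInstantiation = Dependent | Instantiation,
};

constexpr Dependence operator|(Dependence A, Dependence B) {
  return Dependence(uint8_t(A) | uint8_t(B));
}
constexpr Dependence operator&(Dependence A, Dependence B) {
  return Dependence(uint8_t(A) & uint8_t(B));
}
constexpr Dependence operator~(Dependence A) {
  return Dependence(~uint8_t(A) & 0xFu);
}

// How a node like DependentVectorType now computes its dependence: start from
// the kind-specific bits and merge in the operands' dependence, instead of
// threading four booleans through the constructor.
Dependence vectorTypeDependence(Dependence ElementDep, bool HasSizeExpr,
                                Dependence SizeExprDep) {
  return Dependence::DependentInstantiation | ElementDep |
         (HasSizeExpr ? SizeExprDep : Dependence::None);
}

int main() {
  Dependence D = vectorTypeDependence(Dependence::VariablyModified,
                                      /*HasSizeExpr=*/true,
                                      Dependence::UnexpandedPack);
  // Masking works the same way as in the patch, e.g. ObjCTypeParamType
  // dropping the pack bit with `& ~UnexpandedPack`.
  std::printf("dependence bits: 0x%x\n",
              unsigned(D & ~Dependence::UnexpandedPack));
  return 0;
}
```

Folding the booleans into one flag set is what lets a constructor forward an operand's dependence wholesale and mask out only the bits that do not propagate, instead of re-deriving each bit by hand as the removed code did.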
diff --git a/clang/lib/Basic/OpenMPKinds.cpp b/clang/lib/Basic/OpenMPKinds.cpp index f2106531d6eb1..e8bf41dc07be2 100644 --- a/clang/lib/Basic/OpenMPKinds.cpp +++ b/clang/lib/Basic/OpenMPKinds.cpp @@ -159,6 +159,11 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, #define OPENMP_DEVICE_MODIFIER(Name) .Case(#Name, OMPC_DEVICE_##Name) #include "clang/Basic/OpenMPKinds.def" .Default(OMPC_DEVICE_unknown); + case OMPC_reduction: + return llvm::StringSwitch(Str) +#define OPENMP_REDUCTION_MODIFIER(Name) .Case(#Name, OMPC_REDUCTION_##Name) +#include "clang/Basic/OpenMPKinds.def" + .Default(OMPC_REDUCTION_unknown); case OMPC_unknown: case OMPC_threadprivate: case OMPC_if: @@ -172,7 +177,6 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, case OMPC_private: case OMPC_firstprivate: case OMPC_shared: - case OMPC_reduction: case OMPC_task_reduction: case OMPC_in_reduction: case OMPC_aligned: @@ -212,6 +216,8 @@ unsigned clang::getOpenMPSimpleClauseType(OpenMPClauseKind Kind, case OMPC_nontemporal: case OMPC_destroy: case OMPC_detach: + case OMPC_inclusive: + case OMPC_exclusive: break; } llvm_unreachable("Invalid OpenMP simple clause kind"); @@ -394,6 +400,16 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, #include "clang/Basic/OpenMPKinds.def" } llvm_unreachable("Invalid OpenMP 'device' clause modifier"); + case OMPC_reduction: + switch (Type) { + case OMPC_REDUCTION_unknown: + return "unknown"; +#define OPENMP_REDUCTION_MODIFIER(Name) \ + case OMPC_REDUCTION_##Name: \ + return #Name; +#include "clang/Basic/OpenMPKinds.def" + } + llvm_unreachable("Invalid OpenMP 'reduction' clause modifier"); case OMPC_unknown: case OMPC_threadprivate: case OMPC_if: @@ -407,7 +423,6 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, case OMPC_private: case OMPC_firstprivate: case OMPC_shared: - case OMPC_reduction: case OMPC_task_reduction: case OMPC_in_reduction: case OMPC_aligned: @@ -447,6 +462,8 @@ const char *clang::getOpenMPSimpleClauseTypeName(OpenMPClauseKind Kind, case OMPC_nontemporal: case OMPC_destroy: case OMPC_detach: + case OMPC_inclusive: + case OMPC_exclusive: break; } llvm_unreachable("Invalid OpenMP simple clause kind"); @@ -608,6 +625,18 @@ bool clang::isAllowedClauseForDirective(OpenMPDirectiveKind DKind, break; } break; + case OMPD_scan: + if (OpenMPVersion < 50) + return false; + switch (CKind) { +#define OPENMP_SCAN_CLAUSE(Name) \ + case OMPC_##Name: \ + return true; +#include "clang/Basic/OpenMPKinds.def" + default: + break; + } + break; case OMPD_atomic: if (OpenMPVersion < 50 && (CKind == OMPC_acq_rel || CKind == OMPC_acquire || @@ -1251,6 +1280,7 @@ void clang::getOpenMPCaptureRegions( case OMPD_cancel: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: diff --git a/clang/lib/Basic/Targets.cpp b/clang/lib/Basic/Targets.cpp index 07a9b7e30a245..db459a9ae5650 100644 --- a/clang/lib/Basic/Targets.cpp +++ b/clang/lib/Basic/Targets.cpp @@ -117,6 +117,9 @@ TargetInfo *AllocateTarget(const llvm::Triple &Triple, return new XCoreTargetInfo(Triple, Opts); case llvm::Triple::hexagon: + if (os == llvm::Triple::Linux && + Triple.getEnvironment() == llvm::Triple::Musl) + return new LinuxTargetInfo(Triple, Opts); return new HexagonTargetInfo(Triple, Opts); case llvm::Triple::lanai: diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 7e5db5f088dff..b8a59bcdab391 100644 --- 
a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -497,7 +497,6 @@ static void initTargetOptions(llvm::TargetOptions &Options, Options.DebuggerTuning = CodeGenOpts.getDebuggerTuning(); Options.EmitStackSizeSection = CodeGenOpts.StackSizeSection; Options.EmitAddrsig = CodeGenOpts.Addrsig; - Options.EnableDebugEntryValues = CodeGenOpts.EnableDebugEntryValues; Options.ForceDwarfFrameSection = CodeGenOpts.ForceDwarfFrameSection; Options.EmitCallSiteInfo = CodeGenOpts.EmitCallSiteInfo; @@ -586,8 +585,9 @@ void EmitAssemblyHelper::CreatePasses(legacy::PassManager &MPM, // At O0 and O1 we only run the always inliner which is more efficient. At // higher optimization levels we run the normal inliner. if (CodeGenOpts.OptimizationLevel <= 1) { - bool InsertLifetimeIntrinsics = (CodeGenOpts.OptimizationLevel != 0 && - !CodeGenOpts.DisableLifetimeMarkers); + bool InsertLifetimeIntrinsics = ((CodeGenOpts.OptimizationLevel != 0 && + !CodeGenOpts.DisableLifetimeMarkers) || + LangOpts.Coroutines); PMBuilder.Inliner = createAlwaysInlinerLegacyPass(InsertLifetimeIntrinsics); } else { // We do not want to inline hot callsites for SamplePGO module-summary build @@ -1186,7 +1186,10 @@ void EmitAssemblyHelper::EmitAssemblyWithNewPassManager( // which is just that always inlining occurs. Further, disable generating // lifetime intrinsics to avoid enabling further optimizations during // code generation. - MPM.addPass(AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/false)); + // However, we need to insert lifetime intrinsics to avoid invalid access + // caused by multithreaded coroutines. + MPM.addPass( + AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/LangOpts.Coroutines)); // At -O0, we can still do PGO. Add all the requested passes for // instrumentation PGO, if requested. 
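The OpenMPKinds.cpp hunks above wire the new reduction modifier through the usual .def-file X-macro pattern: one OPENMP_REDUCTION_MODIFIER list drives both the StringSwitch that parses a modifier and the switch that prints it. Below is a self-contained sketch of that round trip; it uses a local stand-in macro list rather than clang/Basic/OpenMPKinds.def (the modifier names follow OpenMP 5.0, everything else is illustrative) and assumes LLVM's ADT headers are on the include path.

```cpp
#include "llvm/ADT/StringSwitch.h"
#include <cstdio>

// Local stand-in for the OPENMP_REDUCTION_MODIFIER entries in OpenMPKinds.def.
#define REDUCTION_MODIFIERS(M) M(default) M(inscan) M(task)

enum OpenMPReductionClauseModifier {
#define MOD(Name) OMPC_REDUCTION_##Name,
  REDUCTION_MODIFIERS(MOD)
#undef MOD
  OMPC_REDUCTION_unknown
};

// Parsing direction: string -> enum, the same StringSwitch + X-macro shape
// used by getOpenMPSimpleClauseType for OMPC_reduction in the patch.
OpenMPReductionClauseModifier parseReductionModifier(llvm::StringRef Str) {
  return llvm::StringSwitch<OpenMPReductionClauseModifier>(Str)
#define MOD(Name) .Case(#Name, OMPC_REDUCTION_##Name)
      REDUCTION_MODIFIERS(MOD)
#undef MOD
      .Default(OMPC_REDUCTION_unknown);
}

// Printing direction: enum -> string, mirroring getOpenMPSimpleClauseTypeName.
const char *reductionModifierName(OpenMPReductionClauseModifier M) {
  switch (M) {
#define MOD(Name)                                                              \
  case OMPC_REDUCTION_##Name:                                                  \
    return #Name;
    REDUCTION_MODIFIERS(MOD)
#undef MOD
  case OMPC_REDUCTION_unknown:
    return "unknown";
  }
  return "unknown";
}

int main() {
  std::printf("%s\n", reductionModifierName(parseReductionModifier("inscan")));
}
```

Generating both directions from the one list is what keeps adding a modifier a one-line change in the .def file, with the parser and the printer staying in sync by construction.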
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index aa40aeab98942..b9327197c8423 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4501,10 +4501,15 @@ static llvm::VectorType *GetFloatNeonType(CodeGenFunction *CGF, } } +Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C, + const ElementCount &Count) { + Value *SV = llvm::ConstantVector::getSplat(Count, C); + return Builder.CreateShuffleVector(V, V, SV, "lane"); +} + Value *CodeGenFunction::EmitNeonSplat(Value *V, Constant *C) { ElementCount EC = V->getType()->getVectorElementCount(); - Value *SV = llvm::ConstantVector::getSplat(EC, C); - return Builder.CreateShuffleVector(V, V, SV, "lane"); + return EmitNeonSplat(V, C, EC); } Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl &Ops, @@ -4611,6 +4616,10 @@ struct ARMVectorIntrinsicInfo { TypeModifier } static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = { + NEONMAP0(splat_lane_v), + NEONMAP0(splat_laneq_v), + NEONMAP0(splatq_lane_v), + NEONMAP0(splatq_laneq_v), NEONMAP2(vabd_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts), NEONMAP2(vabdq_v, arm_neon_vabdu, arm_neon_vabds, Add1ArgType | UnsignedAlts), NEONMAP1(vabs_v, arm_neon_vabs, 0), @@ -4892,6 +4901,10 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = { }; static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = { + NEONMAP0(splat_lane_v), + NEONMAP0(splat_laneq_v), + NEONMAP0(splatq_lane_v), + NEONMAP0(splatq_laneq_v), NEONMAP1(vabs_v, aarch64_neon_abs, 0), NEONMAP1(vabsq_v, aarch64_neon_abs, 0), NEONMAP0(vaddhn_v), @@ -5466,6 +5479,19 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( switch (BuiltinID) { default: break; + case NEON::BI__builtin_neon_splat_lane_v: + case NEON::BI__builtin_neon_splat_laneq_v: + case NEON::BI__builtin_neon_splatq_lane_v: + case NEON::BI__builtin_neon_splatq_laneq_v: { + auto NumElements = VTy->getElementCount(); + if (BuiltinID == NEON::BI__builtin_neon_splatq_lane_v) + NumElements = NumElements * 2; + if (BuiltinID == NEON::BI__builtin_neon_splat_laneq_v) + NumElements = NumElements / 2; + + Ops[0] = Builder.CreateBitCast(Ops[0], VTy); + return EmitNeonSplat(Ops[0], cast(Ops[1]), NumElements); + } case NEON::BI__builtin_neon_vpadd_v: case NEON::BI__builtin_neon_vpaddq_v: // We don't allow fp/int overloading of intrinsics. 
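The EmitNeonSplat change in the hunk above lowers a lane splat to a shufflevector whose mask is a constant splat of the lane index; the new overload takes an explicit element count so the splatq_lane / splat_laneq builtins can widen or narrow the result relative to the source vector. Here is a standalone IRBuilder sketch of the same lowering, assuming an LLVM tree of roughly this vintage (exact signatures such as the ElementCount constructor and the shuffle-mask overloads have shifted across releases):

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Lane splat: the mask is a constant vector with every element equal to the
// lane index. The destination element count may differ from the source width,
// which is how splatq_lane doubles and splat_laneq halves the result.
static Value *emitLaneSplat(IRBuilder<> &B, Value *V, Constant *LaneIdx,
                            ElementCount DstElts) {
  Value *Mask = ConstantVector::getSplat(DstElts, LaneIdx);
  return B.CreateShuffleVector(V, V, Mask, "lane");
}

int main() {
  LLVMContext Ctx;
  Module M("splat-demo", Ctx);
  auto *EltTy = Type::getInt32Ty(Ctx);
  auto *V4 = VectorType::get(EltTy, 4);
  auto *FnTy = FunctionType::get(V4, {V4}, /*isVarArg=*/false);
  Function *F =
      Function::Create(FnTy, Function::ExternalLinkage, "splat_lane2", &M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
  // Broadcast lane 2 of the argument across all four result lanes.
  Value *Splat = emitLaneSplat(B, &*F->arg_begin(), ConstantInt::get(EltTy, 2),
                               ElementCount(4, /*Scalable=*/false));
  B.CreateRet(Splat);
  M.print(outs(), nullptr);
}
```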
@@ -5804,9 +5830,14 @@ Value *CodeGenFunction::EmitCommonNeonBuiltinExpr( case NEON::BI__builtin_neon_vqdmulh_lane_v: case NEON::BI__builtin_neon_vqrdmulhq_lane_v: case NEON::BI__builtin_neon_vqrdmulh_lane_v: { + llvm::Type *RTy = Ty; + if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v || + BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v) + RTy = llvm::VectorType::get(Ty->getVectorElementType(), + Ty->getVectorNumElements() * 2); llvm::Type *Tys[2] = { - Ty, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, - /*isQuad*/ false))}; + RTy, GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, + /*isQuad*/ false))}; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, NameHint); } case NEON::BI__builtin_neon_vqdmulhq_laneq_v: @@ -13584,6 +13615,13 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_y, 0, 1024); case AMDGPU::BI__builtin_r600_read_tidig_z: return emitRangedBuiltin(*this, Intrinsic::r600_read_tidig_z, 0, 1024); + case AMDGPU::BI__builtin_amdgcn_alignbit: { + llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); + llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); + Function *F = CGM.getIntrinsic(Intrinsic::fshr, Src0->getType()); + return Builder.CreateCall(F, { Src0, Src1, Src2 }); + } default: return nullptr; } @@ -15025,6 +15063,15 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, Function *Callee = CGM.getIntrinsic(IntNo, ConvertType(E->getType())); return Builder.CreateCall(Callee, {LHS, RHS}); } + case WebAssembly::BI__builtin_wasm_abs_i8x16: + case WebAssembly::BI__builtin_wasm_abs_i16x8: + case WebAssembly::BI__builtin_wasm_abs_i32x4: { + Value *Vec = EmitScalarExpr(E->getArg(0)); + Value *Neg = Builder.CreateNeg(Vec, "neg"); + Constant *Zero = llvm::Constant::getNullValue(Vec->getType()); + Value *ICmp = Builder.CreateICmpSLT(Vec, Zero, "abscond"); + return Builder.CreateSelect(ICmp, Neg, Vec, "abs"); + } case WebAssembly::BI__builtin_wasm_min_s_i8x16: case WebAssembly::BI__builtin_wasm_min_u_i8x16: case WebAssembly::BI__builtin_wasm_max_s_i8x16: @@ -15117,6 +15164,14 @@ Value *CodeGenFunction::EmitWebAssemblyBuiltinExpr(unsigned BuiltinID, Function *Callee = CGM.getIntrinsic(IntNo, Vec->getType()); return Builder.CreateCall(Callee, {Vec}); } + case WebAssembly::BI__builtin_wasm_bitmask_i8x16: + case WebAssembly::BI__builtin_wasm_bitmask_i16x8: + case WebAssembly::BI__builtin_wasm_bitmask_i32x4: { + Value *Vec = EmitScalarExpr(E->getArg(0)); + Function *Callee = + CGM.getIntrinsic(Intrinsic::wasm_bitmask, Vec->getType()); + return Builder.CreateCall(Callee, {Vec}); + } case WebAssembly::BI__builtin_wasm_abs_f32x4: case WebAssembly::BI__builtin_wasm_abs_f64x2: { Value *Vec = EmitScalarExpr(E->getArg(0)); diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 4e7f689a213fd..1eb5bb0eda18f 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -821,6 +821,7 @@ CGFunctionInfo *CGFunctionInfo::create(unsigned llvmCC, FI->ASTCallingConvention = info.getCC(); FI->InstanceMethod = instanceMethod; FI->ChainCall = chainCall; + FI->CmseNSCall = info.getCmseNSCall(); FI->NoReturn = info.getNoReturn(); FI->ReturnsRetained = info.getProducesResult(); FI->NoCallerSavedRegs = info.getNoCallerSavedRegs(); @@ -1883,6 +1884,9 @@ void CodeGenModule::ConstructAttributeList( if (FI.isNoReturn()) FuncAttrs.addAttribute(llvm::Attribute::NoReturn); + if (FI.isCmseNSCall()) + 
FuncAttrs.addAttribute("cmse_nonsecure_call"); + // If we have information about the function prototype, we can learn // attributes from there. AddAttributesFromFunctionProtoType(getContext(), FuncAttrs, @@ -2010,6 +2014,9 @@ void CodeGenModule::ConstructAttributeList( } if (!AttrOnCallSite) { + if (TargetDecl && TargetDecl->hasAttr()) + FuncAttrs.addAttribute("cmse_nonsecure_entry"); + bool DisableTailCalls = false; if (CodeGenOpts.DisableTailCalls) @@ -2083,6 +2090,7 @@ void CodeGenModule::ConstructAttributeList( hasUsedSRet = true; if (RetAI.getInReg()) SRETAttrs.addAttribute(llvm::Attribute::InReg); + SRETAttrs.addAlignmentAttr(RetAI.getIndirectAlign().getQuantity()); ArgAttrs[IRFunctionArgs.getSRetArgNo()] = llvm::AttributeSet::get(getLLVMContext(), SRETAttrs); } @@ -4839,6 +4847,11 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, for (CallLifetimeEnd &LifetimeEnd : CallLifetimeEndAfterCall) LifetimeEnd.Emit(*this, /*Flags=*/{}); + if (!ReturnValue.isExternallyDestructed() && + RetTy.isDestructedType() == QualType::DK_nontrivial_c_struct) + pushDestroy(QualType::DK_nontrivial_c_struct, Ret.getAggregateAddress(), + RetTy); + return Ret; } diff --git a/clang/lib/CodeGen/CGCall.h b/clang/lib/CodeGen/CGCall.h index 28121af946f9b..509ca43a97848 100644 --- a/clang/lib/CodeGen/CGCall.h +++ b/clang/lib/CodeGen/CGCall.h @@ -358,27 +358,26 @@ class FunctionArgList : public SmallVector {}; /// ReturnValueSlot - Contains the address where the return value of a /// function can be stored, and whether the address is volatile or not. class ReturnValueSlot { - llvm::PointerIntPair Value; - CharUnits Alignment; + Address Addr = Address::invalid(); // Return value slot flags - enum Flags { - IS_VOLATILE = 0x1, - IS_UNUSED = 0x2, - }; + unsigned IsVolatile : 1; + unsigned IsUnused : 1; + unsigned IsExternallyDestructed : 1; public: - ReturnValueSlot() {} - ReturnValueSlot(Address Addr, bool IsVolatile, bool IsUnused = false) - : Value(Addr.isValid() ? Addr.getPointer() : nullptr, - (IsVolatile ? IS_VOLATILE : 0) | (IsUnused ? IS_UNUSED : 0)), - Alignment(Addr.isValid() ? 
Addr.getAlignment() : CharUnits::Zero()) {} - - bool isNull() const { return !getValue().isValid(); } - - bool isVolatile() const { return Value.getInt() & IS_VOLATILE; } - Address getValue() const { return Address(Value.getPointer(), Alignment); } - bool isUnused() const { return Value.getInt() & IS_UNUSED; } + ReturnValueSlot() + : IsVolatile(false), IsUnused(false), IsExternallyDestructed(false) {} + ReturnValueSlot(Address Addr, bool IsVolatile, bool IsUnused = false, + bool IsExternallyDestructed = false) + : Addr(Addr), IsVolatile(IsVolatile), IsUnused(IsUnused), + IsExternallyDestructed(IsExternallyDestructed) {} + + bool isNull() const { return !Addr.isValid(); } + bool isVolatile() const { return IsVolatile; } + Address getValue() const { return Addr; } + bool isUnused() const { return IsUnused; } + bool isExternallyDestructed() const { return IsExternallyDestructed; } }; } // end namespace CodeGen diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp index 5db670cb52c1d..86b5455b4b8cf 100644 --- a/clang/lib/CodeGen/CGClass.cpp +++ b/clang/lib/CodeGen/CGClass.cpp @@ -2870,7 +2870,9 @@ void CodeGenFunction::EmitForwardingCallToLambda( if (!resultType->isVoidType() && calleeFnInfo.getReturnInfo().getKind() == ABIArgInfo::Indirect && !hasScalarEvaluationKind(calleeFnInfo.getReturnType())) - returnSlot = ReturnValueSlot(ReturnValue, resultType.isVolatileQualified()); + returnSlot = + ReturnValueSlot(ReturnValue, resultType.isVolatileQualified(), + /*IsUnused=*/false, /*IsExternallyDestructed=*/true); // We don't need to separately arrange the call arguments because // the call can't be variadic anyway --- it's impossible to forward diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index 94dab4c856146..da6cb458982ba 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -631,7 +631,7 @@ void CGDebugInfo::CreateCompileUnit() { ? llvm::DICompileUnit::DebugNameTableKind::None : static_cast( CGOpts.DebugNameTable), - CGOpts.DebugRangesBaseAddress, Sysroot, SDK); + CGOpts.DebugRangesBaseAddress, remapDIPath(Sysroot), SDK); } llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) { @@ -2492,12 +2492,20 @@ llvm::DIModule *CGDebugInfo::getOrCreateModuleRef(ASTSourceDescriptor Mod, ? (uint64_t)Mod.getSignature()[1] << 32 | Mod.getSignature()[0] : ~1ULL; llvm::DIBuilder DIB(CGM.getModule()); + SmallString<0> PCM; + if (!llvm::sys::path::is_absolute(Mod.getASTFile())) + PCM = Mod.getPath(); + llvm::sys::path::append(PCM, Mod.getASTFile()); + std::string RemappedPCM = remapDIPath(PCM); + StringRef RelativePCM(RemappedPCM); + StringRef CompDir = TheCU->getDirectory(); + if (RelativePCM.consume_front(CompDir)) + RelativePCM.consume_front(llvm::sys::path::get_separator()); DIB.createCompileUnit(TheCU->getSourceLanguage(), // TODO: Support "Source" from external AST providers? 
- DIB.createFile(Mod.getModuleName(), Mod.getPath()), - TheCU->getProducer(), true, StringRef(), 0, - Mod.getASTFile(), llvm::DICompileUnit::FullDebug, - Signature); + DIB.createFile(Mod.getModuleName(), CompDir), + TheCU->getProducer(), false, StringRef(), 0, RelativePCM, + llvm::DICompileUnit::FullDebug, Signature); DIB.finalize(); } @@ -4883,8 +4891,7 @@ llvm::DINode::DIFlags CGDebugInfo::getCallSiteRelatedAttrs() const { (CGM.getCodeGenOpts().getDebuggerTuning() == llvm::DebuggerKind::LLDB || CGM.getCodeGenOpts().getDebuggerTuning() == llvm::DebuggerKind::GDB); - if (!SupportsDWARFv4Ext && CGM.getCodeGenOpts().DwarfVersion < 5 && - !CGM.getCodeGenOpts().EnableDebugEntryValues) + if (!SupportsDWARFv4Ext && CGM.getCodeGenOpts().DwarfVersion < 5) return llvm::DINode::FlagZero; return llvm::DINode::FlagAllCallsDescribed; diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 726e3829793dd..74a63ea9b899b 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -1077,13 +1077,13 @@ static llvm::Constant *constWithPadding(CodeGenModule &CGM, IsPattern isPattern, llvm::Type *OrigTy = constant->getType(); if (const auto STy = dyn_cast(OrigTy)) return constStructWithPadding(CGM, isPattern, STy, constant); - if (auto *STy = dyn_cast(OrigTy)) { + if (auto *ArrayTy = dyn_cast(OrigTy)) { llvm::SmallVector Values; - unsigned Size = STy->getNumElements(); + uint64_t Size = ArrayTy->getNumElements(); if (!Size) return constant; - llvm::Type *ElemTy = STy->getElementType(); - bool ZeroInitializer = constant->isZeroValue(); + llvm::Type *ElemTy = ArrayTy->getElementType(); + bool ZeroInitializer = constant->isNullValue(); llvm::Constant *OpValue, *PaddedOp; if (ZeroInitializer) { OpValue = llvm::Constant::getNullValue(ElemTy); @@ -1099,13 +1099,10 @@ static llvm::Constant *constWithPadding(CodeGenModule &CGM, IsPattern isPattern, auto *NewElemTy = Values[0]->getType(); if (NewElemTy == ElemTy) return constant; - if (OrigTy->isArrayTy()) { - auto *ArrayTy = llvm::ArrayType::get(NewElemTy, Size); - return llvm::ConstantArray::get(ArrayTy, Values); - } else { - return llvm::ConstantVector::get(Values); - } + auto *NewArrayTy = llvm::ArrayType::get(NewElemTy, Size); + return llvm::ConstantArray::get(NewArrayTy, Values); } + // FIXME: Do we need to handle tail padding in vectors? return constant; } diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 694172f1012f8..4b5b2a5454062 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -867,8 +867,12 @@ void CodeGenFunction::EmitTypeCheck(TypeCheckKind TCK, SourceLocation Loc, static bool isFlexibleArrayMemberExpr(const Expr *E) { // For compatibility with existing code, we treat arrays of length 0 or // 1 as flexible array members. + // FIXME: This is inconsistent with the warning code in SemaChecking. Unify + // the two mechanisms. const ArrayType *AT = E->getType()->castAsArrayTypeUnsafe(); if (const auto *CAT = dyn_cast(AT)) { + // FIXME: Sema doesn't treat [1] as a flexible array member if the bound + // was produced by macro expansion. if (CAT->getSize().ugt(1)) return false; } else if (!isa(AT)) @@ -881,6 +885,10 @@ static bool isFlexibleArrayMemberExpr(const Expr *E) { // FIXME: If the base type of the member expr is not FD->getParent(), // this should not be treated as a flexible array member access. 
if (const auto *FD = dyn_cast(ME->getMemberDecl())) { + // FIXME: Sema doesn't treat a T[1] union member as a flexible array + // member, only a T[0] or T[] member gets that treatment. + if (FD->getParent()->isUnion()) + return true; RecordDecl::field_iterator FI( DeclContext::decl_iterator(const_cast(FD))); return ++FI == FD->getParent()->field_end(); diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp index 9881d28fe25ce..df576decd69d6 100644 --- a/clang/lib/CodeGen/CGExprAgg.cpp +++ b/clang/lib/CodeGen/CGExprAgg.cpp @@ -249,7 +249,7 @@ void AggExprEmitter::withReturnValueSlot( const Expr *E, llvm::function_ref EmitCall) { QualType RetTy = E->getType(); bool RequiresDestruction = - Dest.isIgnored() && + !Dest.isExternallyDestructed() && RetTy.isDestructedType() == QualType::DK_nontrivial_c_struct; // If it makes no observable difference, save a memcpy + temporary. @@ -287,10 +287,8 @@ void AggExprEmitter::withReturnValueSlot( } RValue Src = - EmitCall(ReturnValueSlot(RetAddr, Dest.isVolatile(), IsResultUnused)); - - if (RequiresDestruction) - CGF.pushDestroy(RetTy.isDestructedType(), Src.getAggregateAddress(), RetTy); + EmitCall(ReturnValueSlot(RetAddr, Dest.isVolatile(), IsResultUnused, + Dest.isExternallyDestructed())); if (!UseTemp) return; @@ -827,8 +825,19 @@ void AggExprEmitter::VisitCastExpr(CastExpr *E) { // If we're loading from a volatile type, force the destination // into existence. if (E->getSubExpr()->getType().isVolatileQualified()) { + bool Destruct = + !Dest.isExternallyDestructed() && + E->getType().isDestructedType() == QualType::DK_nontrivial_c_struct; + if (Destruct) + Dest.setExternallyDestructed(); EnsureDest(E->getType()); - return Visit(E->getSubExpr()); + Visit(E->getSubExpr()); + + if (Destruct) + CGF.pushDestroy(QualType::DK_nontrivial_c_struct, Dest.getAddress(), + E->getType()); + + return; } LLVM_FALLTHROUGH; diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp index aa9c8f96f9664..e17c1c5f7ac4f 100644 --- a/clang/lib/CodeGen/CGExprConstant.cpp +++ b/clang/lib/CodeGen/CGExprConstant.cpp @@ -1167,9 +1167,7 @@ class ConstExprEmitter : } llvm::Constant *VisitExprWithCleanups(ExprWithCleanups *E, QualType T) { - if (!E->cleanupsHaveSideEffects()) - return Visit(E->getSubExpr(), T); - return nullptr; + return Visit(E->getSubExpr(), T); } llvm::Constant *VisitMaterializeTemporaryExpr(MaterializeTemporaryExpr *E, diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index fa60221e8b593..58c5334776f3f 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -6992,6 +6992,7 @@ emitNumTeamsForTargetDirective(CodeGenFunction &CGF, case OMPD_atomic: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: @@ -7304,6 +7305,7 @@ emitNumThreadsForTargetDirective(CodeGenFunction &CGF, case OMPD_atomic: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: @@ -9089,6 +9091,7 @@ getNestedDistributeDirective(ASTContext &Ctx, const OMPExecutableDirective &D) { case OMPD_atomic: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: @@ -9483,7 +9486,7 @@ void CGOpenMPRuntime::emitTargetNumIterationsCall( void CGOpenMPRuntime::emitTargetCall( CodeGenFunction &CGF, const OMPExecutableDirective &D, llvm::Function *OutlinedFn, llvm::Value 
*OutlinedFnID, const Expr *IfCond, - const Expr *Device, + llvm::PointerIntPair Device, llvm::function_ref SizeEmitter) { @@ -9507,6 +9510,16 @@ void CGOpenMPRuntime::emitTargetCall( auto &&ThenGen = [this, Device, OutlinedFn, OutlinedFnID, &D, &InputInfo, &MapTypesArray, &CS, RequiresOuterTask, &CapturedVars, SizeEmitter](CodeGenFunction &CGF, PrePostActionTy &) { + if (Device.getInt() == OMPC_DEVICE_ancestor) { + // Reverse offloading is not supported, so just execute on the host. + if (RequiresOuterTask) { + CapturedVars.clear(); + CGF.GenerateOpenMPCapturedVars(CS, CapturedVars); + } + emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedFn, CapturedVars); + return; + } + // On top of the arrays that were filled up, the target offloading call // takes as arguments the device id as well as the host pointer. The host // pointer is used by the runtime library to identify the current target @@ -9521,9 +9534,13 @@ void CGOpenMPRuntime::emitTargetCall( // Emit device ID if any. llvm::Value *DeviceID; - if (Device) { - DeviceID = CGF.Builder.CreateIntCast(CGF.EmitScalarExpr(Device), - CGF.Int64Ty, /*isSigned=*/true); + if (Device.getPointer()) { + assert((Device.getInt() == OMPC_DEVICE_unknown || + Device.getInt() == OMPC_DEVICE_device_num) && + "Expected device_num modifier."); + llvm::Value *DevVal = CGF.EmitScalarExpr(Device.getPointer()); + DeviceID = + CGF.Builder.CreateIntCast(DevVal, CGF.Int64Ty, /*isSigned=*/true); } else { DeviceID = CGF.Builder.getInt64(OMP_DEVICEID_UNDEF); } @@ -9853,6 +9870,7 @@ void CGOpenMPRuntime::scanForTargetRegionsFunctions(const Stmt *S, case OMPD_atomic: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: @@ -10494,6 +10512,7 @@ void CGOpenMPRuntime::emitTargetDataStandAloneCall( case OMPD_atomic: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_teams: case OMPD_target_data: case OMPD_distribute: @@ -11374,7 +11393,7 @@ static const FunctionDecl *getDeclareVariantFunction(CodeGenModule &CGM, SmallVector VariantExprs; SmallVector VMIs; for (const auto *A : FD->specific_attrs()) { - const OMPTraitInfo &TI = A->getTraitInfos(); + const OMPTraitInfo &TI = *A->getTraitInfos(); VMIs.push_back(VariantMatchInfo()); TI.getAsVariantMatchInfo(CGM.getContext(), VMIs.back()); VariantExprs.push_back(A->getVariantFuncRef()); @@ -12135,7 +12154,7 @@ void CGOpenMPSIMDRuntime::emitTargetOutlinedFunction( void CGOpenMPSIMDRuntime::emitTargetCall( CodeGenFunction &CGF, const OMPExecutableDirective &D, llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, const Expr *IfCond, - const Expr *Device, + llvm::PointerIntPair Device, llvm::function_ref SizeEmitter) { diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index b9df5ee114735..99b201bcf6fea 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -20,6 +20,7 @@ #include "clang/Basic/OpenMPKinds.h" #include "clang/Basic/SourceLocation.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringSet.h" @@ -1504,16 +1505,16 @@ class CGOpenMPRuntime { /// \param IfCond Expression evaluated in if clause associated with the target /// directive, or null if no if clause is used. /// \param Device Expression evaluated in device clause associated with the - /// target directive, or null if no device clause is used. 
+ /// target directive, or null if no device clause is used and device modifier. /// \param SizeEmitter Callback to emit number of iterations for loop-based /// directives. - virtual void - emitTargetCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, - llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, - const Expr *IfCond, const Expr *Device, - llvm::function_ref - SizeEmitter); + virtual void emitTargetCall( + CodeGenFunction &CGF, const OMPExecutableDirective &D, + llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, const Expr *IfCond, + llvm::PointerIntPair Device, + llvm::function_ref + SizeEmitter); /// Emit the target regions enclosed in \a GD function definition or /// the function itself in case it is a valid device function. Returns true if @@ -2275,14 +2276,14 @@ class CGOpenMPSIMDRuntime final : public CGOpenMPRuntime { /// \param IfCond Expression evaluated in if clause associated with the target /// directive, or null if no if clause is used. /// \param Device Expression evaluated in device clause associated with the - /// target directive, or null if no device clause is used. - void - emitTargetCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, - llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, - const Expr *IfCond, const Expr *Device, - llvm::function_ref - SizeEmitter) override; + /// target directive, or null if no device clause is used and device modifier. + void emitTargetCall( + CodeGenFunction &CGF, const OMPExecutableDirective &D, + llvm::Function *OutlinedFn, llvm::Value *OutlinedFnID, const Expr *IfCond, + llvm::PointerIntPair Device, + llvm::function_ref + SizeEmitter) override; /// Emit the target regions enclosed in \a GD function definition or /// the function itself in case it is a valid device function. 
Returns true if diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index b139529d8eb34..4b5c85541a7a5 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -787,6 +787,7 @@ static bool hasNestedSPMDDirective(ASTContext &Ctx, case OMPD_atomic: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: @@ -864,6 +865,7 @@ static bool supportsSPMDExecutionMode(ASTContext &Ctx, case OMPD_atomic: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: @@ -1034,6 +1036,7 @@ static bool hasNestedLightweightDirective(ASTContext &Ctx, case OMPD_atomic: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: @@ -1117,6 +1120,7 @@ static bool supportsLightweightRuntime(ASTContext &Ctx, case OMPD_atomic: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_teams: case OMPD_target_data: case OMPD_target_exit_data: diff --git a/clang/lib/CodeGen/CGStmt.cpp b/clang/lib/CodeGen/CGStmt.cpp index 283942441b0e1..369450259d517 100644 --- a/clang/lib/CodeGen/CGStmt.cpp +++ b/clang/lib/CodeGen/CGStmt.cpp @@ -250,6 +250,9 @@ void CodeGenFunction::EmitStmt(const Stmt *S, ArrayRef Attrs) { case Stmt::OMPDepobjDirectiveClass: EmitOMPDepobjDirective(cast(*S)); break; + case Stmt::OMPScanDirectiveClass: + llvm_unreachable("Scan directive not supported yet."); + break; case Stmt::OMPOrderedDirectiveClass: EmitOMPOrderedDirective(cast(*S)); break; diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 2ffe79957ee85..bef36bf4693b2 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -4616,6 +4616,8 @@ static void emitOMPAtomicExpr(CodeGenFunction &CGF, OpenMPClauseKind Kind, case OMPC_order: case OMPC_destroy: case OMPC_detach: + case OMPC_inclusive: + case OMPC_exclusive: llvm_unreachable("Clause is not allowed in 'omp atomic'."); } } @@ -4724,12 +4726,10 @@ static void emitCommonOMPTargetDirective(CodeGenFunction &CGF, } // Check if we have any device clause associated with the directive. - const Expr *Device = nullptr; - if (auto *C = S.getSingleClause()) { - if (C->getModifier() == OMPC_DEVICE_unknown || - C->getModifier() == OMPC_DEVICE_device_num) - Device = C->getDevice(); - } + llvm::PointerIntPair Device( + nullptr, OMPC_DEVICE_unknown); + if (auto *C = S.getSingleClause()) + Device.setPointerAndInt(C->getDevice(), C->getModifier()); // Check if we have an if clause whose conditional always evaluates to false // or if we do not have any targets specified. If so the target region is not diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp index 403b9e25f7a39..6a0a848a49df9 100644 --- a/clang/lib/CodeGen/CGVTables.cpp +++ b/clang/lib/CodeGen/CGVTables.cpp @@ -364,7 +364,8 @@ void CodeGenFunction::EmitCallAndReturnForThunk(llvm::FunctionCallee Callee, ReturnValueSlot Slot; if (!ResultType->isVoidType() && CurFnInfo->getReturnInfo().getKind() == ABIArgInfo::Indirect) - Slot = ReturnValueSlot(ReturnValue, ResultType.isVolatileQualified()); + Slot = ReturnValueSlot(ReturnValue, ResultType.isVolatileQualified(), + /*IsUnused=*/false, /*IsExternallyDestructed=*/true); // Now emit our call. 
llvm::CallBase *CallOrInvoke; @@ -437,7 +438,8 @@ void CodeGenFunction::EmitMustTailThunk(GlobalDecl GD, // Finish the function to maintain CodeGenFunction invariants. // FIXME: Don't emit unreachable code. EmitBlock(createBasicBlock()); - FinishFunction(); + + FinishThunk(); } void CodeGenFunction::generateThunk(llvm::Function *Fn, @@ -564,7 +566,7 @@ llvm::Constant *CodeGenVTables::maybeEmitThunk(GlobalDecl GD, CGM.SetLLVMFunctionAttributesForDefinition(GD.getDecl(), ThunkFn); // Thunks for variadic methods are special because in general variadic - // arguments cannot be perferctly forwarded. In the general case, clang + // arguments cannot be perfectly forwarded. In the general case, clang // implements such thunks by cloning the original function body. However, for // thunks with no return adjustment on targets that support musttail, we can // use musttail to perfectly forward the variadic arguments. diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 0a4a4c3a71d42..0f346574ce81f 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3894,6 +3894,8 @@ class CodeGenFunction : public CodeGenTypeCache { SmallVectorImpl &O, const char *name, unsigned shift = 0, bool rightshift = false); + llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx, + const llvm::ElementCount &Count); llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx); llvm::Value *EmitNeonShiftVector(llvm::Value *V, llvm::Type *Ty, bool negateForRightShift); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 37b453de9b99a..b603bd037172e 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -695,6 +695,8 @@ void CodeGenModule::Release() { EmitCommandLineMetadata(); EmitTargetMetadata(); + + EmitBackendOptionsMetadata(getCodeGenOpts()); } void CodeGenModule::EmitOpenCLMetadata() { @@ -714,6 +716,19 @@ void CodeGenModule::EmitOpenCLMetadata() { OCLVerMD->addOperand(llvm::MDNode::get(Ctx, OCLVerElts)); } +void CodeGenModule::EmitBackendOptionsMetadata( + const CodeGenOptions CodeGenOpts) { + switch (getTriple().getArch()) { + default: + break; + case llvm::Triple::riscv32: + case llvm::Triple::riscv64: + getModule().addModuleFlag(llvm::Module::Error, "SmallDataLimit", + CodeGenOpts.SmallDataLimit); + break; + } +} + void CodeGenModule::UpdateCompletedType(const TagDecl *TD) { // Make sure that this type is translated. Types.UpdateCompletedType(TD); diff --git a/clang/lib/CodeGen/CodeGenModule.h b/clang/lib/CodeGen/CodeGenModule.h index af53d77b9358d..a538884a97f6e 100644 --- a/clang/lib/CodeGen/CodeGenModule.h +++ b/clang/lib/CodeGen/CodeGenModule.h @@ -1525,6 +1525,10 @@ class CodeGenModule : public CodeGenTypeCache { /// Emits target specific Metadata for global declarations. void EmitTargetMetadata(); + /// Emit the module flag metadata used to pass options controlling the + /// the backend to LLVM. + void EmitBackendOptionsMetadata(const CodeGenOptions CodeGenOpts); + /// Emits OpenCL specific Metadata e.g. OpenCL version. 
void EmitOpenCLMetadata(); diff --git a/clang/lib/CodeGen/EHScopeStack.h b/clang/lib/CodeGen/EHScopeStack.h index 4dd3da3e90e73..0ed67aabcd621 100644 --- a/clang/lib/CodeGen/EHScopeStack.h +++ b/clang/lib/CodeGen/EHScopeStack.h @@ -148,7 +148,7 @@ class EHScopeStack { virtual void anchor(); protected: - virtual ~Cleanup() = default; + ~Cleanup() = default; public: Cleanup(const Cleanup &) = default; diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp index 28eacdb055441..c24a6f3765f26 100644 --- a/clang/lib/Driver/Compilation.cpp +++ b/clang/lib/Driver/Compilation.cpp @@ -80,16 +80,29 @@ Compilation::getArgsForToolChain(const ToolChain *TC, StringRef BoundArch, *TranslatedArgs, SameTripleAsHost, AllocatedArgs, DeviceOffloadKind); } + DerivedArgList *NewDAL = nullptr; if (!OffloadArgs) { + NewDAL = TC->TranslateXarchArgs(*TranslatedArgs, BoundArch, + DeviceOffloadKind, &AllocatedArgs); + } else { + NewDAL = TC->TranslateXarchArgs(*OffloadArgs, BoundArch, DeviceOffloadKind, + &AllocatedArgs); + if (!NewDAL) + NewDAL = OffloadArgs; + else + delete OffloadArgs; + } + + if (!NewDAL) { Entry = TC->TranslateArgs(*TranslatedArgs, BoundArch, DeviceOffloadKind); if (!Entry) Entry = TranslatedArgs; } else { - Entry = TC->TranslateArgs(*OffloadArgs, BoundArch, DeviceOffloadKind); + Entry = TC->TranslateArgs(*NewDAL, BoundArch, DeviceOffloadKind); if (!Entry) - Entry = OffloadArgs; + Entry = NewDAL; else - delete OffloadArgs; + delete NewDAL; } // Add allocated arguments to the final DAL. diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 0a49fccb50005..dc8b05f2b1cce 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -1205,3 +1205,86 @@ llvm::opt::DerivedArgList *ToolChain::TranslateOffloadTargetArgs( delete DAL; return nullptr; } + +// TODO: Currently argument values separated by space e.g. +// -Xclang -mframe-pointer=no cannot be passed by -Xarch_. This should be +// fixed. +void ToolChain::TranslateXarchArgs( + const llvm::opt::DerivedArgList &Args, llvm::opt::Arg *&A, + llvm::opt::DerivedArgList *DAL, + SmallVectorImpl *AllocatedArgs) const { + const OptTable &Opts = getDriver().getOpts(); + unsigned ValuePos = 1; + if (A->getOption().matches(options::OPT_Xarch_device) || + A->getOption().matches(options::OPT_Xarch_host)) + ValuePos = 0; + + unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(ValuePos)); + unsigned Prev = Index; + std::unique_ptr XarchArg(Opts.ParseOneArg(Args, Index)); + + // If the argument parsing failed or more than one argument was + // consumed, the -Xarch_ argument's parameter tried to consume + // extra arguments. Emit an error and ignore. + // + // We also want to disallow any options which would alter the + // driver behavior; that isn't going to work in our model. We + // use isDriverOption() as an approximation, although things + // like -O4 are going to slip through. 
+ if (!XarchArg || Index > Prev + 1) { + getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args) + << A->getAsString(Args); + return; + } else if (XarchArg->getOption().hasFlag(options::DriverOption)) { + getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver) + << A->getAsString(Args); + return; + } + XarchArg->setBaseArg(A); + A = XarchArg.release(); + if (!AllocatedArgs) + DAL->AddSynthesizedArg(A); + else + AllocatedArgs->push_back(A); +} + +llvm::opt::DerivedArgList *ToolChain::TranslateXarchArgs( + const llvm::opt::DerivedArgList &Args, StringRef BoundArch, + Action::OffloadKind OFK, + SmallVectorImpl *AllocatedArgs) const { + DerivedArgList *DAL = new DerivedArgList(Args.getBaseArgs()); + bool Modified = false; + + bool IsGPU = OFK == Action::OFK_Cuda || OFK == Action::OFK_HIP; + for (Arg *A : Args) { + bool NeedTrans = false; + bool Skip = false; + if (A->getOption().matches(options::OPT_Xarch_device)) { + NeedTrans = IsGPU; + Skip = !IsGPU; + } else if (A->getOption().matches(options::OPT_Xarch_host)) { + NeedTrans = !IsGPU; + Skip = IsGPU; + } else if (A->getOption().matches(options::OPT_Xarch__) && IsGPU) { + // Do not translate -Xarch_ options for non CUDA/HIP toolchain since + // they may need special translation. + // Skip this argument unless the architecture matches BoundArch + if (BoundArch.empty() || A->getValue(0) != BoundArch) + Skip = true; + else + NeedTrans = true; + } + if (NeedTrans || Skip) + Modified = true; + if (NeedTrans) + TranslateXarchArgs(Args, A, DAL, AllocatedArgs); + if (!Skip) + DAL->append(A); + } + + if (Modified) + return DAL; + + delete DAL; + return nullptr; +} diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index b0679dee12286..12d5345a64af5 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1980,6 +1980,36 @@ void Clang::AddPPCTargetArgs(const ArgList &Args, } } +static void SetRISCVSmallDataLimit(const ToolChain &TC, const ArgList &Args, + ArgStringList &CmdArgs) { + const Driver &D = TC.getDriver(); + const llvm::Triple &Triple = TC.getTriple(); + // Default small data limitation is eight. + const char *SmallDataLimit = "8"; + // Get small data limitation. + if (Args.getLastArg(options::OPT_shared, options::OPT_fpic, + options::OPT_fPIC)) { + // Not support linker relaxation for PIC. + SmallDataLimit = "0"; + if (Args.hasArg(options::OPT_G)) { + D.Diag(diag::warn_drv_unsupported_sdata); + } + } else if (Args.getLastArgValue(options::OPT_mcmodel_EQ) + .equals_lower("large") && + (Triple.getArch() == llvm::Triple::riscv64)) { + // Not support linker relaxation for RV64 with large code model. + SmallDataLimit = "0"; + if (Args.hasArg(options::OPT_G)) { + D.Diag(diag::warn_drv_unsupported_sdata); + } + } else if (Arg *A = Args.getLastArg(options::OPT_G)) { + SmallDataLimit = A->getValue(); + } + // Forward the -msmall-data-limit= option. 
+ CmdArgs.push_back("-msmall-data-limit"); + CmdArgs.push_back(SmallDataLimit); +} + void Clang::AddRISCVTargetArgs(const ArgList &Args, ArgStringList &CmdArgs) const { const llvm::Triple &Triple = getToolChain().getTriple(); @@ -1987,6 +2017,8 @@ void Clang::AddRISCVTargetArgs(const ArgList &Args, CmdArgs.push_back("-target-abi"); CmdArgs.push_back(ABIName.data()); + + SetRISCVSmallDataLimit(getToolChain(), Args, CmdArgs); } void Clang::AddSparcTargetArgs(const ArgList &Args, @@ -4487,14 +4519,24 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, bool IsPIE; std::tie(RelocationModel, PICLevel, IsPIE) = ParsePICArgs(TC, Args); - const char *RMName = RelocationModelName(RelocationModel); + bool IsROPI = RelocationModel == llvm::Reloc::ROPI || + RelocationModel == llvm::Reloc::ROPI_RWPI; + bool IsRWPI = RelocationModel == llvm::Reloc::RWPI || + RelocationModel == llvm::Reloc::ROPI_RWPI; - if ((RelocationModel == llvm::Reloc::ROPI || - RelocationModel == llvm::Reloc::ROPI_RWPI) && - types::isCXX(Input.getType()) && + if (Args.hasArg(options::OPT_mcmse) && + !Args.hasArg(options::OPT_fallow_unsupported)) { + if (IsROPI) + D.Diag(diag::err_cmse_pi_are_incompatible) << IsROPI; + if (IsRWPI) + D.Diag(diag::err_cmse_pi_are_incompatible) << !IsRWPI; + } + + if (IsROPI && types::isCXX(Input.getType()) && !Args.hasArg(options::OPT_fallow_unsupported)) D.Diag(diag::err_drv_ropi_incompatible_with_cxx); + const char *RMName = RelocationModelName(RelocationModel); if (RMName) { CmdArgs.push_back("-mrelocation-model"); CmdArgs.push_back(RMName); diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index 59933c94d0f83..54cde5cc28425 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -848,36 +848,6 @@ CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, } for (Arg *A : Args) { - if (A->getOption().matches(options::OPT_Xarch__)) { - // Skip this argument unless the architecture matches BoundArch - if (BoundArch.empty() || A->getValue(0) != BoundArch) - continue; - - unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1)); - unsigned Prev = Index; - std::unique_ptr XarchArg(Opts.ParseOneArg(Args, Index)); - - // If the argument parsing failed or more than one argument was - // consumed, the -Xarch_ argument's parameter tried to consume - // extra arguments. Emit an error and ignore. - // - // We also want to disallow any options which would alter the - // driver behavior; that isn't going to work in our model. We - // use isDriverOption() as an approximation, although things - // like -O4 are going to slip through. 
- if (!XarchArg || Index > Prev + 1) { - getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args) - << A->getAsString(Args); - continue; - } else if (XarchArg->getOption().hasFlag(options::DriverOption)) { - getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver) - << A->getAsString(Args); - continue; - } - XarchArg->setBaseArg(A); - A = XarchArg.release(); - DAL->AddSynthesizedArg(A); - } DAL->append(A); } diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index ab3a68b70f5d4..d1aec649cdc71 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -1068,8 +1068,8 @@ StringRef Darwin::getPlatformFamily() const { StringRef Darwin::getSDKName(StringRef isysroot) { // Assume SDK has path: SOME_PATH/SDKs/PlatformXX.YY.sdk - auto BeginSDK = llvm::sys::path::begin(isysroot); - auto EndSDK = llvm::sys::path::end(isysroot); + auto BeginSDK = llvm::sys::path::rbegin(isysroot); + auto EndSDK = llvm::sys::path::rend(isysroot); for (auto IT = BeginSDK; IT != EndSDK; ++IT) { StringRef SDK = *IT; if (SDK.endswith(".sdk")) @@ -2132,32 +2132,7 @@ DerivedArgList *MachO::TranslateArgs(const DerivedArgList &Args, continue; Arg *OriginalArg = A; - unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1)); - unsigned Prev = Index; - std::unique_ptr XarchArg(Opts.ParseOneArg(Args, Index)); - - // If the argument parsing failed or more than one argument was - // consumed, the -Xarch_ argument's parameter tried to consume - // extra arguments. Emit an error and ignore. - // - // We also want to disallow any options which would alter the - // driver behavior; that isn't going to work in our model. We - // use isDriverOption() as an approximation, although things - // like -O4 are going to slip through. - if (!XarchArg || Index > Prev + 1) { - getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args) - << A->getAsString(Args); - continue; - } else if (XarchArg->getOption().hasFlag(options::DriverOption)) { - getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver) - << A->getAsString(Args); - continue; - } - - XarchArg->setBaseArg(A); - - A = XarchArg.release(); - DAL->AddSynthesizedArg(A); + TranslateXarchArgs(Args, A, DAL); // Linker input arguments require custom handling. The problem is that we // have already constructed the phase actions, so we can not treat them as diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp index f761659c9cc76..157dca7e0c8db 100644 --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -378,36 +378,6 @@ HIPToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args, const OptTable &Opts = getDriver().getOpts(); for (Arg *A : Args) { - if (A->getOption().matches(options::OPT_Xarch__)) { - // Skip this argument unless the architecture matches BoundArch. - if (BoundArch.empty() || A->getValue(0) != BoundArch) - continue; - - unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1)); - unsigned Prev = Index; - std::unique_ptr XarchArg(Opts.ParseOneArg(Args, Index)); - - // If the argument parsing failed or more than one argument was - // consumed, the -Xarch_ argument's parameter tried to consume - // extra arguments. Emit an error and ignore. - // - // We also want to disallow any options which would alter the - // driver behavior; that isn't going to work in our model. We - // use isDriverOption() as an approximation, although things - // like -O4 are going to slip through. 
- if (!XarchArg || Index > Prev + 1) { - getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args) - << A->getAsString(Args); - continue; - } else if (XarchArg->getOption().hasFlag(options::DriverOption)) { - getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver) - << A->getAsString(Args); - continue; - } - XarchArg->setBaseArg(A); - A = XarchArg.release(); - DAL->AddSynthesizedArg(A); - } DAL->append(A); } diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 9a6d7877efaa6..d2397dbfeb87b 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -634,6 +634,7 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, State.Stack.back().NoLineBreak = true; if (Style.AlignAfterOpenBracket != FormatStyle::BAS_DontAlign && + !State.Stack.back().IsCSharpGenericTypeConstraint && Previous.opensScope() && Previous.isNot(TT_ObjCMethodExpr) && (Current.isNot(TT_LineComment) || Previous.BlockKind == BK_BracedInit)) State.Stack.back().Indent = State.Column + Spaces; @@ -715,6 +716,8 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, } else if (Previous.is(TT_InheritanceColon)) { State.Stack.back().Indent = State.Column; State.Stack.back().LastSpace = State.Column; + } else if (Current.is(TT_CSharpGenericTypeConstraintColon)) { + State.Stack.back().ColonPos = State.Column; } else if (Previous.opensScope()) { // If a function has a trailing call, indent all parameters from the // opening parenthesis. This avoids confusing indents like: @@ -924,7 +927,13 @@ unsigned ContinuationIndenter::addTokenOnNewLine(LineState &State, unsigned ContinuationIndenter::getNewLineColumn(const LineState &State) { if (!State.NextToken || !State.NextToken->Previous) return 0; + FormatToken &Current = *State.NextToken; + + if (State.Stack.back().IsCSharpGenericTypeConstraint && + Current.isNot(TT_CSharpGenericTypeConstraint)) + return State.Stack.back().ColonPos + 2; + const FormatToken &Previous = *Current.Previous; // If we are continuing an expression, we want to use the continuation indent. unsigned ContinuationIndent = @@ -1106,9 +1115,11 @@ unsigned ContinuationIndenter::moveStateToNextToken(LineState &State, assert(State.Stack.size()); const FormatToken &Current = *State.NextToken; + if (Current.is(TT_CSharpGenericTypeConstraint)) + State.Stack.back().IsCSharpGenericTypeConstraint = true; if (Current.isOneOf(tok::comma, TT_BinaryOperator)) State.Stack.back().NoLineBreakInOperand = false; - if (Current.is(TT_InheritanceColon)) + if (Current.isOneOf(TT_InheritanceColon, TT_CSharpGenericTypeConstraintColon)) State.Stack.back().AvoidBinPacking = true; if (Current.is(tok::lessless) && Current.isNot(TT_OverloadedOperator)) { if (State.Stack.back().FirstLessLess == 0) @@ -1329,6 +1340,11 @@ void ContinuationIndenter::moveStatePastScopeOpener(LineState &State, if (!Current.opensScope()) return; + // Don't allow '<' or '(' in C# generic type constraints to start new scopes. 
+ if (Current.isOneOf(tok::less, tok::l_paren) && + State.Stack.back().IsCSharpGenericTypeConstraint) + return; + if (Current.MatchingParen && Current.BlockKind == BK_Block) { moveStateToNewBlock(State); return; @@ -1393,6 +1409,7 @@ void ContinuationIndenter::moveStatePastScopeOpener(LineState &State, (State.Line->Type == LT_ObjCDecl && ObjCBinPackProtocolList); AvoidBinPacking = + (State.Stack.back().IsCSharpGenericTypeConstraint) || (Style.Language == FormatStyle::LK_JavaScript && EndsInComma) || (State.Line->MustBeDeclaration && !BinPackDeclaration) || (!State.Line->MustBeDeclaration && !Style.BinPackArguments) || diff --git a/clang/lib/Format/ContinuationIndenter.h b/clang/lib/Format/ContinuationIndenter.h index 11df619e0f40b..ab116d5468e8c 100644 --- a/clang/lib/Format/ContinuationIndenter.h +++ b/clang/lib/Format/ContinuationIndenter.h @@ -208,7 +208,8 @@ struct ParenState { LastOperatorWrapped(true), ContainsLineBreak(false), ContainsUnwrappedBuilder(false), AlignColons(true), ObjCSelectorNameFound(false), HasMultipleNestedBlocks(false), - NestedBlockInlined(false), IsInsideObjCArrayLiteral(false) {} + NestedBlockInlined(false), IsInsideObjCArrayLiteral(false), + IsCSharpGenericTypeConstraint(false) {} /// \brief The token opening this parenthesis level, or nullptr if this level /// is opened by fake parenthesis. @@ -329,6 +330,8 @@ struct ParenState { /// array literal. bool IsInsideObjCArrayLiteral : 1; + bool IsCSharpGenericTypeConstraint : 1; + bool operator<(const ParenState &Other) const { if (Indent != Other.Indent) return Indent < Other.Indent; @@ -366,6 +369,8 @@ struct ParenState { return ContainsUnwrappedBuilder; if (NestedBlockInlined != Other.NestedBlockInlined) return NestedBlockInlined; + if (IsCSharpGenericTypeConstraint != Other.IsCSharpGenericTypeConstraint) + return IsCSharpGenericTypeConstraint; return false; } }; diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index 1b885b518f0d0..10a5f0e96f96f 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -108,6 +108,9 @@ namespace format { TYPE(CSharpNullCoalescing) \ TYPE(CSharpNullConditional) \ TYPE(CSharpNullConditionalLSquare) \ + TYPE(CSharpGenericTypeConstraint) \ + TYPE(CSharpGenericTypeConstraintColon) \ + TYPE(CSharpGenericTypeConstraintComma) \ TYPE(Unknown) enum TokenType { @@ -779,6 +782,7 @@ struct AdditionalKeywords { kw_unsafe = &IdentTable.get("unsafe"); kw_ushort = &IdentTable.get("ushort"); kw_when = &IdentTable.get("when"); + kw_where = &IdentTable.get("where"); // Keep this at the end of the constructor to make sure everything here // is @@ -796,6 +800,7 @@ struct AdditionalKeywords { kw_is, kw_lock, kw_null, kw_object, kw_out, kw_override, kw_params, kw_readonly, kw_ref, kw_string, kw_stackalloc, kw_sbyte, kw_sealed, kw_uint, kw_ulong, kw_unchecked, kw_unsafe, kw_ushort, kw_when, + kw_where, // Keywords from the JavaScript section. kw_as, kw_async, kw_await, kw_declare, kw_finally, kw_from, kw_function, kw_get, kw_import, kw_is, kw_let, kw_module, kw_readonly, @@ -900,6 +905,7 @@ struct AdditionalKeywords { IdentifierInfo *kw_unsafe; IdentifierInfo *kw_ushort; IdentifierInfo *kw_when; + IdentifierInfo *kw_where; /// Returns \c true if \p Tok is a true JavaScript identifier, returns /// \c false if it is a keyword or a pseudo keyword. 
diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index d546a9f7c6067..f2666a8bd1717 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -1047,6 +1047,11 @@ class AnnotatingParser { Keywords.kw___has_include_next)) { parseHasInclude(); } + if (Tok->is(Keywords.kw_where) && Tok->Next && + Tok->Next->isNot(tok::l_paren)) { + Tok->Type = TT_CSharpGenericTypeConstraint; + parseCSharpGenericTypeConstraint(); + } break; default: break; @@ -1054,6 +1059,30 @@ class AnnotatingParser { return true; } + void parseCSharpGenericTypeConstraint() { + while (CurrentToken) { + if (CurrentToken->is(tok::less)) { + // parseAngle is too greedy and will consume the whole line. + CurrentToken->Type = TT_TemplateOpener; + next(); + } else if (CurrentToken->is(tok::greater)) { + CurrentToken->Type = TT_TemplateCloser; + next(); + } else if (CurrentToken->is(tok::comma)) { + CurrentToken->Type = TT_CSharpGenericTypeConstraintComma; + next(); + } else if (CurrentToken->is(Keywords.kw_where)) { + CurrentToken->Type = TT_CSharpGenericTypeConstraint; + next(); + } else if (CurrentToken->is(tok::colon)) { + CurrentToken->Type = TT_CSharpGenericTypeConstraintColon; + next(); + } else { + next(); + } + } + } + void parseIncludeDirective() { if (CurrentToken && CurrentToken->is(tok::less)) { next(); @@ -2986,6 +3015,10 @@ bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, if (Right.is(TT_CSharpNullConditionalLSquare)) return false; + // No space between consecutive commas '[,,]'. + if (Left.is(tok::comma) && Right.is(tok::comma)) + return false; + // Possible space inside `?[ 0 ]`. if (Left.is(TT_CSharpNullConditionalLSquare)) return Style.SpacesInSquareBrackets; @@ -3299,6 +3332,8 @@ bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, if (Right.is(TT_CSharpNamedArgumentColon) || Left.is(TT_CSharpNamedArgumentColon)) return false; + if (Right.is(TT_CSharpGenericTypeConstraint)) + return true; } else if (Style.Language == FormatStyle::LK_JavaScript) { // FIXME: This might apply to other languages and token kinds. if (Right.is(tok::string_literal) && Left.is(tok::plus) && Left.Previous && @@ -3589,6 +3624,9 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, if (Left.isOneOf(TT_CSharpNamedArgumentColon, TT_AttributeColon) || Right.isOneOf(TT_CSharpNamedArgumentColon, TT_AttributeColon)) return false; + // Only break after commas for generic type constraints. 
+ if (Line.First->is(TT_CSharpGenericTypeConstraint)) + return Left.is(TT_CSharpGenericTypeConstraintComma); } else if (Style.Language == FormatStyle::LK_Java) { if (Left.isOneOf(Keywords.kw_throws, Keywords.kw_extends, Keywords.kw_implements)) diff --git a/clang/lib/Format/UnwrappedLineFormatter.cpp b/clang/lib/Format/UnwrappedLineFormatter.cpp index 84ccbec2150d1..a81d480c8e64f 100644 --- a/clang/lib/Format/UnwrappedLineFormatter.cpp +++ b/clang/lib/Format/UnwrappedLineFormatter.cpp @@ -64,6 +64,8 @@ class LevelIndentTracker { } if (static_cast(Indent) + Offset >= 0) Indent += Offset; + if (Line.First->is(TT_CSharpGenericTypeConstraint)) + Indent = Line.Level * Style.IndentWidth + Style.ContinuationIndentWidth; } /// Update the indent state given that \p Line indent should be diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 00447ebdf5a92..d8202bd614580 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -323,6 +323,24 @@ void UnwrappedLineParser::parseFile() { addUnwrappedLine(); } +void UnwrappedLineParser::parseCSharpGenericTypeConstraint() { + do { + switch (FormatTok->Tok.getKind()) { + case tok::l_brace: + return; + default: + if (FormatTok->is(Keywords.kw_where)) { + addUnwrappedLine(); + nextToken(); + parseCSharpGenericTypeConstraint(); + break; + } + nextToken(); + break; + } + } while (!eof()); +} + void UnwrappedLineParser::parseCSharpAttribute() { int UnpairedSquareBrackets = 1; do { @@ -1344,6 +1362,12 @@ void UnwrappedLineParser::parseStructuralElement() { parseTryCatch(); return; case tok::identifier: { + if (Style.isCSharp() && FormatTok->is(Keywords.kw_where) && + Line->MustBeDeclaration) { + addUnwrappedLine(); + parseCSharpGenericTypeConstraint(); + break; + } if (FormatTok->is(TT_MacroBlockEnd)) { addUnwrappedLine(); return; @@ -1436,6 +1460,11 @@ void UnwrappedLineParser::parseStructuralElement() { nextToken(); if (FormatTok->Tok.is(tok::l_brace)) { + // Block kind should probably be set to BK_BracedInit for any language. + // C# needs this change to ensure that array initialisers and object + // initialisers are indented the same way. + if (Style.isCSharp()) + FormatTok->BlockKind = BK_BracedInit; nextToken(); parseBracedList(); } else if (Style.Language == FormatStyle::LK_Proto && @@ -1628,7 +1657,7 @@ bool UnwrappedLineParser::tryToParseBracedList() { bool UnwrappedLineParser::parseBracedList(bool ContinueOnSemicolons, tok::TokenKind ClosingBraceKind) { bool HasError = false; - + // FIXME: Once we have an expression parser in the UnwrappedLineParser, // replace this by using parseAssigmentExpression() inside. do { diff --git a/clang/lib/Format/UnwrappedLineParser.h b/clang/lib/Format/UnwrappedLineParser.h index e184cf5354fd1..42b8b51a37cc0 100644 --- a/clang/lib/Format/UnwrappedLineParser.h +++ b/clang/lib/Format/UnwrappedLineParser.h @@ -126,6 +126,10 @@ class UnwrappedLineParser { void parseJavaScriptEs6ImportExport(); void parseStatementMacro(); void parseCSharpAttribute(); + // Parse a C# generic type constraint: `where T : IComparable`. 
+ // See: + // https://docs.microsoft.com/en-us/dotnet/csharp/language-reference/keywords/where-generic-type-constraint + void parseCSharpGenericTypeConstraint(); bool tryToParseLambda(); bool tryToParseLambdaIntroducer(); void tryToParseJSFunction(); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 1e41ebbf17e44..646ce7852da47 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -792,10 +792,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK, llvm::Triple T(TargetOpts.Triple); if (Opts.OptimizationLevel > 0 && Opts.hasReducedDebugInfo() && - llvm::is_contained(DebugEntryValueArchs, T.getArch())) { - Opts.EnableDebugEntryValues = Args.hasArg(OPT_femit_debug_entry_values); + llvm::is_contained(DebugEntryValueArchs, T.getArch())) Opts.EmitCallSiteInfo = true; - } Opts.DisableO0ImplyOptNone = Args.hasArg(OPT_disable_O0_optnone); Opts.DisableRedZone = Args.hasArg(OPT_disable_red_zone); @@ -927,6 +925,8 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK, Opts.NoZeroInitializedInBSS = Args.hasArg(OPT_mno_zero_initialized_in_bss); Opts.NumRegisterParameters = getLastArgIntValue(Args, OPT_mregparm, 0, Diags); Opts.NoExecStack = Args.hasArg(OPT_mno_exec_stack); + Opts.SmallDataLimit = + getLastArgIntValue(Args, OPT_msmall_data_limit, 0, Diags); Opts.FatalWarnings = Args.hasArg(OPT_massembler_fatal_warnings); Opts.NoWarn = Args.hasArg(OPT_massembler_no_warn); Opts.EnableSegmentedStacks = Args.hasArg(OPT_split_stacks); @@ -2925,6 +2925,8 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK, !Args.hasArg(OPT_fno_concept_satisfaction_caching); if (Args.hasArg(OPT_fconcepts_ts)) Diags.Report(diag::warn_fe_concepts_ts_flag); + Opts.RecoveryAST = + Args.hasFlag(OPT_frecovery_ast, OPT_fno_recovery_ast, Opts.CPlusPlus); Opts.HeinousExtensions = Args.hasArg(OPT_fheinous_gnu_extensions); Opts.AccessControl = !Args.hasArg(OPT_fno_access_control); Opts.ElideConstructors = !Args.hasArg(OPT_fno_elide_constructors); diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index bdff5273de611..b7cfe21da95d4 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -3252,10 +3252,8 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS, if (!TypeRep) { if (TryAnnotateTypeConstraint()) goto DoneWithDeclSpec; - if (isTypeConstraintAnnotation()) - continue; - if (NextToken().is(tok::annot_template_id)) - // Might have been annotated by TryAnnotateTypeConstraint. + if (Tok.isNot(tok::annot_cxxscope) || + NextToken().isNot(tok::identifier)) continue; // Eat the scope spec so the identifier is current. ConsumeAnnotationToken(); @@ -3410,9 +3408,6 @@ void Parser::ParseDeclarationSpecifiers(DeclSpec &DS, goto DoneWithDeclSpec; if (Tok.isNot(tok::identifier)) continue; - if (Tok.is(tok::annot_template_id)) - // Might have been annotated by TryAnnotateTypeConstraint. 
- continue; ParsedAttributesWithRange Attrs(AttrFactory); if (ParseImplicitInt(DS, nullptr, TemplateInfo, AS, DSContext, Attrs)) { if (!Attrs.empty()) { @@ -4422,7 +4417,8 @@ void Parser::ParseEnumSpecifier(SourceLocation StartLoc, DeclSpec &DS, ColonProtectionRAIIObject X(*this, AllowDeclaration); CXXScopeSpec Spec; - if (ParseOptionalCXXScopeSpecifier(Spec, nullptr, + if (ParseOptionalCXXScopeSpecifier(Spec, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/true)) return; @@ -5255,7 +5251,8 @@ bool Parser::isConstructorDeclarator(bool IsUnqualified, bool DeductionGuide) { // Parse the C++ scope specifier. CXXScopeSpec SS; - if (ParseOptionalCXXScopeSpecifier(SS, nullptr, + if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/true)) { TPA.Revert(); return false; @@ -5635,7 +5632,8 @@ void Parser::ParseDeclaratorInternal(Declarator &D, D.getContext() == DeclaratorContext::FileContext || D.getContext() == DeclaratorContext::MemberContext; CXXScopeSpec SS; - ParseOptionalCXXScopeSpecifier(SS, nullptr, EnteringContext); + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, EnteringContext); if (SS.isNotEmpty()) { if (Tok.isNot(tok::star)) { @@ -5858,8 +5856,9 @@ void Parser::ParseDirectDeclarator(Declarator &D) { bool EnteringContext = D.getContext() == DeclaratorContext::FileContext || D.getContext() == DeclaratorContext::MemberContext; - ParseOptionalCXXScopeSpecifier(D.getCXXScopeSpec(), nullptr, - EnteringContext); + ParseOptionalCXXScopeSpecifier( + D.getCXXScopeSpec(), /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, EnteringContext); } if (D.getCXXScopeSpec().isValid()) { @@ -5933,10 +5932,11 @@ void Parser::ParseDirectDeclarator(Declarator &D) { bool HadScope = D.getCXXScopeSpec().isValid(); if (ParseUnqualifiedId(D.getCXXScopeSpec(), + /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/true, /*AllowDestructorName=*/true, AllowConstructorName, - AllowDeductionGuide, nullptr, nullptr, - D.getName()) || + AllowDeductionGuide, nullptr, D.getName()) || // Once we're past the identifier, if the scope was bad, mark the // whole declarator bad. D.getCXXScopeSpec().isInvalid()) { diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index 09e5c7996fcd5..86a9a8208b2eb 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -290,7 +290,9 @@ Decl *Parser::ParseNamespaceAlias(SourceLocation NamespaceLoc, CXXScopeSpec SS; // Parse (optional) nested-name-specifier. - ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext=*/false, + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false, /*MayBePseudoDestructor=*/nullptr, /*IsTypename=*/false, /*LastII=*/nullptr, @@ -530,7 +532,9 @@ Decl *Parser::ParseUsingDirective(DeclaratorContext Context, CXXScopeSpec SS; // Parse (optional) nested-name-specifier. - ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext=*/false, + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false, /*MayBePseudoDestructor=*/nullptr, /*IsTypename=*/false, /*LastII=*/nullptr, @@ -597,7 +601,9 @@ bool Parser::ParseUsingDeclarator(DeclaratorContext Context, // Parse nested-name-specifier. 
IdentifierInfo *LastII = nullptr; - if (ParseOptionalCXXScopeSpecifier(D.SS, nullptr, /*EnteringContext=*/false, + if (ParseOptionalCXXScopeSpecifier(D.SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false, /*MayBePseudoDtor=*/nullptr, /*IsTypename=*/false, /*LastII=*/&LastII, @@ -632,12 +638,12 @@ bool Parser::ParseUsingDeclarator(DeclaratorContext Context, D.Name.setConstructorName(Type, IdLoc, IdLoc); } else { if (ParseUnqualifiedId( - D.SS, /*EnteringContext=*/false, + D.SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false, /*AllowDestructorName=*/true, - /*AllowConstructorName=*/!(Tok.is(tok::identifier) && - NextToken().is(tok::equal)), - /*AllowDeductionGuide=*/false, - nullptr, nullptr, D.Name)) + /*AllowConstructorName=*/ + !(Tok.is(tok::identifier) && NextToken().is(tok::equal)), + /*AllowDeductionGuide=*/false, nullptr, D.Name)) return true; } @@ -1115,7 +1121,9 @@ TypeResult Parser::ParseBaseTypeSpecifier(SourceLocation &BaseLoc, // Parse optional nested-name-specifier CXXScopeSpec SS; - if (ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext=*/false)) + if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false)) return true; BaseLoc = Tok.getLocation(); @@ -1547,7 +1555,9 @@ void Parser::ParseClassSpecifier(tok::TokenKind TagTokKind, CXXScopeSpec Spec; bool HasValidSpec = true; - if (ParseOptionalCXXScopeSpecifier(Spec, nullptr, EnteringContext)) { + if (ParseOptionalCXXScopeSpecifier(Spec, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + EnteringContext)) { DS.SetTypeSpecError(); HasValidSpec = false; } @@ -2501,7 +2511,8 @@ Parser::ParseCXXClassMemberDeclaration(AccessSpecifier AS, if (isAccessDecl) { // Collect the scope specifier token we annotated earlier. CXXScopeSpec SS; - ParseOptionalCXXScopeSpecifier(SS, nullptr, + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false); if (SS.isInvalid()) { @@ -2512,8 +2523,9 @@ Parser::ParseCXXClassMemberDeclaration(AccessSpecifier AS, // Try to parse an unqualified-id. 
SourceLocation TemplateKWLoc; UnqualifiedId Name; - if (ParseUnqualifiedId(SS, false, true, true, false, nullptr, - &TemplateKWLoc, Name)) { + if (ParseUnqualifiedId(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, false, true, true, + false, &TemplateKWLoc, Name)) { SkipUntil(tok::semi); return nullptr; } @@ -2658,7 +2670,7 @@ Parser::ParseCXXClassMemberDeclaration(AccessSpecifier AS, auto &Zero = NextToken(); SmallString<8> Buffer; - if (Zero.isNot(tok::numeric_constant) || Zero.getLength() != 1 || + if (Zero.isNot(tok::numeric_constant) || PP.getSpelling(Zero, Buffer) != "0") return false; @@ -3493,7 +3505,9 @@ void Parser::ParseConstructorInitializer(Decl *ConstructorDecl) { MemInitResult Parser::ParseMemInitializer(Decl *ConstructorDecl) { // parse '::'[opt] nested-name-specifier[opt] CXXScopeSpec SS; - if (ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext=*/false)) + if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false)) return true; // : identifier diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index c417649de41bd..bcd5679cb43f2 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -625,13 +625,31 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) { SourceRange(Actions.getExprRange(LHS.get()).getBegin(), Actions.getExprRange(RHS.get()).getEnd())); - LHS = Actions.ActOnBinOp(getCurScope(), OpToken.getLocation(), - OpToken.getKind(), LHS.get(), RHS.get()); - + ExprResult BinOp = + Actions.ActOnBinOp(getCurScope(), OpToken.getLocation(), + OpToken.getKind(), LHS.get(), RHS.get()); + if (BinOp.isInvalid()) + BinOp = Actions.CreateRecoveryExpr(LHS.get()->getBeginLoc(), + RHS.get()->getEndLoc(), + {LHS.get(), RHS.get()}); + + LHS = BinOp; } else { - LHS = Actions.ActOnConditionalOp(OpToken.getLocation(), ColonLoc, - LHS.get(), TernaryMiddle.get(), - RHS.get()); + ExprResult CondOp = Actions.ActOnConditionalOp( + OpToken.getLocation(), ColonLoc, LHS.get(), TernaryMiddle.get(), + RHS.get()); + if (CondOp.isInvalid()) { + std::vector<Expr *> Args; + // TernaryMiddle can be null for the GNU conditional expr extension. + if (TernaryMiddle.get()) + Args = {LHS.get(), TernaryMiddle.get(), RHS.get()}; + else + Args = {LHS.get(), RHS.get()}; + CondOp = Actions.CreateRecoveryExpr(LHS.get()->getBeginLoc(), + RHS.get()->getEndLoc(), Args); + } + + LHS = CondOp; } // In this case, ActOnBinOp or ActOnConditionalOp performed the // CorrectDelayedTyposInExpr check.
@@ -1305,9 +1323,14 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, UnconsumeToken(SavedTok); return ExprError(); } - if (!Res.isInvalid()) + if (!Res.isInvalid()) { + Expr *Arg = Res.get(); Res = Actions.ActOnUnaryOp(getCurScope(), SavedTok.getLocation(), - SavedKind, Res.get()); + SavedKind, Arg); + if (Res.isInvalid()) + Res = Actions.CreateRecoveryExpr(SavedTok.getLocation(), + Arg->getEndLoc(), Arg); + } return Res; } case tok::amp: { // unary-expression: '&' cast-expression @@ -1317,8 +1340,13 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, SourceLocation SavedLoc = ConsumeToken(); PreferredType.enterUnary(Actions, Tok.getLocation(), tok::amp, SavedLoc); Res = ParseCastExpression(AnyCastExpr, true); - if (!Res.isInvalid()) - Res = Actions.ActOnUnaryOp(getCurScope(), SavedLoc, SavedKind, Res.get()); + if (!Res.isInvalid()) { + Expr *Arg = Res.get(); + Res = Actions.ActOnUnaryOp(getCurScope(), SavedLoc, SavedKind, Arg); + if (Res.isInvalid()) + Res = Actions.CreateRecoveryExpr(Tok.getLocation(), Arg->getEndLoc(), + Arg); + } return Res; } @@ -1334,8 +1362,12 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, SourceLocation SavedLoc = ConsumeToken(); PreferredType.enterUnary(Actions, Tok.getLocation(), SavedKind, SavedLoc); Res = ParseCastExpression(AnyCastExpr); - if (!Res.isInvalid()) - Res = Actions.ActOnUnaryOp(getCurScope(), SavedLoc, SavedKind, Res.get()); + if (!Res.isInvalid()) { + Expr *Arg = Res.get(); + Res = Actions.ActOnUnaryOp(getCurScope(), SavedLoc, SavedKind, Arg); + if (Res.isInvalid()) + Res = Actions.CreateRecoveryExpr(SavedLoc, Arg->getEndLoc(), Arg); + } return Res; } @@ -1410,9 +1442,6 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, *NotPrimaryExpression = true; Res = ParseCXXTypeid(); break; - case tok::kw___unique_stable_name: - Res = ParseUniqueStableNameExpression(); - break; case tok::kw___uuidof: if (NotPrimaryExpression) *NotPrimaryExpression = true; @@ -1421,7 +1450,9 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, case tok::kw_this: Res = ParseCXXThis(); break; - + case tok::kw___builtin_unique_stable_name: + Res = ParseUniqueStableNameExpression(); + break; case tok::annot_typename: if (isStartOfObjCClassMessageMissingOpenBracket()) { ParsedType Type = getTypeAnnotation(Tok); @@ -1532,7 +1563,8 @@ ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, // type, translate it into a type and continue parsing as a // cast expression. 
CXXScopeSpec SS; - ParseOptionalCXXScopeSpecifier(SS, nullptr, + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false); AnnotateTemplateIdTokenAsType(SS); return ParseCastExpression(ParseKind, isAddressOfOperand, NotCastExpr, @@ -1943,12 +1975,18 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) { PT.consumeClose(); LHS = ExprError(); } else { - assert((ArgExprs.size() == 0 || - ArgExprs.size()-1 == CommaLocs.size())&& - "Unexpected number of commas!"); - LHS = Actions.ActOnCallExpr(getCurScope(), LHS.get(), Loc, - ArgExprs, Tok.getLocation(), + assert( + (ArgExprs.size() == 0 || ArgExprs.size() - 1 == CommaLocs.size()) && + "Unexpected number of commas!"); + Expr *Fn = LHS.get(); + SourceLocation RParLoc = Tok.getLocation(); + LHS = Actions.ActOnCallExpr(getCurScope(), Fn, Loc, ArgExprs, RParLoc, ExecConfig); + if (LHS.isInvalid()) { + ArgExprs.insert(ArgExprs.begin(), Fn); + LHS = + Actions.CreateRecoveryExpr(Fn->getBeginLoc(), RParLoc, ArgExprs); + } PT.consumeClose(); } @@ -1986,9 +2024,9 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) { if (LHS.isInvalid()) break; - ParseOptionalCXXScopeSpecifier(SS, ObjectType, - /*EnteringContext=*/false, - &MayBePseudoDestructor); + ParseOptionalCXXScopeSpecifier( + SS, ObjectType, LHS.get() && LHS.get()->containsErrors(), + /*EnteringContext=*/false, &MayBePseudoDestructor); if (SS.isNotEmpty()) ObjectType = nullptr; } @@ -2048,14 +2086,13 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) { IdentifierInfo *Id = Tok.getIdentifierInfo(); SourceLocation Loc = ConsumeToken(); Name.setIdentifier(Id, Loc); - } else if (ParseUnqualifiedId(SS, - /*EnteringContext=*/false, - /*AllowDestructorName=*/true, - /*AllowConstructorName=*/ - getLangOpts().MicrosoftExt && - SS.isNotEmpty(), - /*AllowDeductionGuide=*/false, - ObjectType, &TemplateKWLoc, Name)) { + } else if (ParseUnqualifiedId( + SS, ObjectType, LHS.get() && LHS.get()->containsErrors(), + /*EnteringContext=*/false, + /*AllowDestructorName=*/true, + /*AllowConstructorName=*/ + getLangOpts().MicrosoftExt && SS.isNotEmpty(), + /*AllowDeductionGuide=*/false, &TemplateKWLoc, Name)) { (void)Actions.CorrectDelayedTyposInExpr(LHS); LHS = ExprError(); } @@ -2072,8 +2109,12 @@ Parser::ParsePostfixExpressionSuffix(ExprResult LHS) { case tok::plusplus: // postfix-expression: postfix-expression '++' case tok::minusminus: // postfix-expression: postfix-expression '--' if (!LHS.isInvalid()) { + Expr *Arg = LHS.get(); LHS = Actions.ActOnPostfixUnaryOp(getCurScope(), Tok.getLocation(), - Tok.getKind(), LHS.get()); + Tok.getKind(), Arg); + if (LHS.isInvalid()) + LHS = Actions.CreateRecoveryExpr(Arg->getBeginLoc(), + Tok.getLocation(), Arg); } ConsumeToken(); break; @@ -2182,47 +2223,42 @@ Parser::ParseExprAfterUnaryExprOrTypeTrait(const Token &OpTok, return Operand; } + ExprResult Parser::ParseUniqueStableNameExpression() { - assert(Tok.is(tok::kw___unique_stable_name) && "Not unique stable name"); + assert(Tok.is(tok::kw___builtin_unique_stable_name) && + "Not __bulitin_unique_stable_name"); SourceLocation OpLoc = ConsumeToken(); BalancedDelimiterTracker T(*this, tok::l_paren); // typeid expressions are always parenthesized. if (T.expectAndConsume(diag::err_expected_lparen_after, - "__unique_stable_name")) + "__builtin_unique_stable_name")) return ExprError(); - ExprResult Result; - if (isTypeIdInParens()) { TypeResult Ty = ParseTypeName(); - - // Match the ')'. 
T.consumeClose(); if (Ty.isInvalid()) return ExprError(); - Result = Actions.ActOnUniqueStableNameExpr(OpLoc, T.getOpenLocation(), - T.getCloseLocation(), Ty.get()); - } else { - EnterExpressionEvaluationContext Unevaluated( - Actions, Sema::ExpressionEvaluationContext::Unevaluated); - Result = ParseExpression(); + return Actions.ActOnUniqueStableNameExpr(OpLoc, T.getOpenLocation(), + T.getCloseLocation(), Ty.get()); + } - // Match the ')'. - if (Result.isInvalid()) - SkipUntil(tok::r_paren, StopAtSemi); - else { - T.consumeClose(); + EnterExpressionEvaluationContext Unevaluated( + Actions, Sema::ExpressionEvaluationContext::Unevaluated); + ExprResult Result = ParseExpression(); - Result = Actions.ActOnUniqueStableNameExpr( - OpLoc, T.getOpenLocation(), T.getCloseLocation(), Result.get()); - } + if (Result.isInvalid()) { + SkipUntil(tok::r_paren, StopAtSemi); + return Result; } - return Result; + T.consumeClose(); + return Actions.ActOnUniqueStableNameExpr(OpLoc, T.getOpenLocation(), + T.getCloseLocation(), Result.get()); } /// Parse a sizeof or alignof expression. diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp index 10608644a8fe9..a0b97ea7514dc 100644 --- a/clang/lib/Parse/ParseExprCXX.cpp +++ b/clang/lib/Parse/ParseExprCXX.cpp @@ -124,6 +124,10 @@ void Parser::CheckForTemplateAndDigraph(Token &Next, ParsedType ObjectType, /// the "." or "->" of a member access expression, this parameter provides the /// type of the object whose members are being accessed. /// +/// \param ObjectHadErrors if this unqualified-id occurs within a member access +/// expression, indicates whether the original subexpressions had any errors. +/// When true, diagnostics for missing 'template' keyword will be supressed. +/// /// \param EnteringContext whether we will be entering into the context of /// the nested-name-specifier after parsing it. /// @@ -146,14 +150,10 @@ void Parser::CheckForTemplateAndDigraph(Token &Next, ParsedType ObjectType, /// /// /// \returns true if there was an error parsing a scope specifier -bool Parser::ParseOptionalCXXScopeSpecifier(CXXScopeSpec &SS, - ParsedType ObjectType, - bool EnteringContext, - bool *MayBePseudoDestructor, - bool IsTypename, - IdentifierInfo **LastII, - bool OnlyNamespace, - bool InUsingDeclaration) { +bool Parser::ParseOptionalCXXScopeSpecifier( + CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHadErrors, + bool EnteringContext, bool *MayBePseudoDestructor, bool IsTypename, + IdentifierInfo **LastII, bool OnlyNamespace, bool InUsingDeclaration) { assert(getLangOpts().CPlusPlus && "Call sites of this function should be guarded by checking for C++"); @@ -511,17 +511,21 @@ bool Parser::ParseOptionalCXXScopeSpecifier(CXXScopeSpec &SS, if (MemberOfUnknownSpecialization && (ObjectType || SS.isSet()) && (IsTypename || isTemplateArgumentList(1) == TPResult::True)) { - // We have something like t::getAs, where getAs is a - // member of an unknown specialization. However, this will only - // parse correctly as a template, so suggest the keyword 'template' - // before 'getAs' and treat this as a dependent template name. - unsigned DiagID = diag::err_missing_dependent_template_keyword; - if (getLangOpts().MicrosoftExt) - DiagID = diag::warn_missing_dependent_template_keyword; - - Diag(Tok.getLocation(), DiagID) - << II.getName() - << FixItHint::CreateInsertion(Tok.getLocation(), "template "); + // If we had errors before, ObjectType can be dependent even without any + // templates, do not report missing template keyword in that case. 
+ if (!ObjectHadErrors) { + // We have something like t::getAs, where getAs is a + // member of an unknown specialization. However, this will only + // parse correctly as a template, so suggest the keyword 'template' + // before 'getAs' and treat this as a dependent template name. + unsigned DiagID = diag::err_missing_dependent_template_keyword; + if (getLangOpts().MicrosoftExt) + DiagID = diag::warn_missing_dependent_template_keyword; + + Diag(Tok.getLocation(), DiagID) + << II.getName() + << FixItHint::CreateInsertion(Tok.getLocation(), "template "); + } if (TemplateNameKind TNK = Actions.ActOnDependentTemplateName( getCurScope(), SS, Tok.getLocation(), TemplateName, ObjectType, @@ -593,12 +597,12 @@ ExprResult Parser::tryParseCXXIdExpression(CXXScopeSpec &SS, default: SourceLocation TemplateKWLoc; UnqualifiedId Name; - if (ParseUnqualifiedId(SS, + if (ParseUnqualifiedId(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false, /*AllowDestructorName=*/false, /*AllowConstructorName=*/false, - /*AllowDeductionGuide=*/false, - /*ObjectType=*/nullptr, &TemplateKWLoc, Name)) + /*AllowDeductionGuide=*/false, &TemplateKWLoc, Name)) return ExprError(); // This is only the direct operand of an & operator if it is not @@ -666,7 +670,9 @@ ExprResult Parser::ParseCXXIdExpression(bool isAddressOfOperand) { // '::' unqualified-id // CXXScopeSpec SS; - ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext=*/false); + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false); Token Replacement; ExprResult Result = @@ -1769,10 +1775,10 @@ Parser::ParseCXXPseudoDestructor(Expr *Base, SourceLocation OpLoc, // If there is a '<', the second type name is a template-id. Parse // it as such. if (Tok.is(tok::less) && - ParseUnqualifiedIdTemplateId(SS, SourceLocation(), - Name, NameLoc, - false, ObjectType, SecondTypeName, - /*AssumeTemplateId=*/true)) + ParseUnqualifiedIdTemplateId( + SS, ObjectType, Base && Base->containsErrors(), SourceLocation(), + Name, NameLoc, false, SecondTypeName, + /*AssumeTemplateId=*/true)) return ExprError(); return Actions.ActOnPseudoDestructorExpr(getCurScope(), Base, OpLoc, OpKind, @@ -2259,6 +2265,12 @@ bool Parser::ParseCXXTypeSpecifierSeq(DeclSpec &DS) { /// \param SS the nested-name-specifier that precedes this template-id, if /// we're actually parsing a qualified-id. /// +/// \param ObjectType if this unqualified-id occurs within a member access +/// expression, the type of the base object whose member is being accessed. +/// +/// \param ObjectHadErrors this unqualified-id occurs within a member access +/// expression, indicates whether the original subexpressions had any errors. +/// /// \param Name for constructor and destructor names, this is the actual /// identifier that may be a template-name. /// @@ -2268,9 +2280,6 @@ bool Parser::ParseCXXTypeSpecifierSeq(DeclSpec &DS) { /// \param EnteringContext whether we're entering the scope of the /// nested-name-specifier. /// -/// \param ObjectType if this unqualified-id occurs within a member access -/// expression, the type of the base object whose member is being accessed. -/// /// \param Id as input, describes the template-name or operator-function-id /// that precedes the '<'. If template arguments were parsed successfully, /// will be updated with the template-id. @@ -2279,14 +2288,10 @@ bool Parser::ParseCXXTypeSpecifierSeq(DeclSpec &DS) { /// refers to a template without performing name lookup to verify. 
/// /// \returns true if a parse error occurred, false otherwise. -bool Parser::ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS, - SourceLocation TemplateKWLoc, - IdentifierInfo *Name, - SourceLocation NameLoc, - bool EnteringContext, - ParsedType ObjectType, - UnqualifiedId &Id, - bool AssumeTemplateId) { +bool Parser::ParseUnqualifiedIdTemplateId( + CXXScopeSpec &SS, ParsedType ObjectType, bool ObjectHadErrors, + SourceLocation TemplateKWLoc, IdentifierInfo *Name, SourceLocation NameLoc, + bool EnteringContext, UnqualifiedId &Id, bool AssumeTemplateId) { assert(Tok.is(tok::less) && "Expected '<' to finish parsing a template-id"); TemplateTy Template; @@ -2318,23 +2323,27 @@ bool Parser::ParseUnqualifiedIdTemplateId(CXXScopeSpec &SS, if (TNK == TNK_Non_template && MemberOfUnknownSpecialization && ObjectType && isTemplateArgumentList(0) == TPResult::True) { - // We have something like t->getAs<T>(), where getAs is a - // member of an unknown specialization. However, this will only - // parse correctly as a template, so suggest the keyword 'template' - // before 'getAs' and treat this as a dependent template name. - std::string Name; - if (Id.getKind() == UnqualifiedIdKind::IK_Identifier) - Name = std::string(Id.Identifier->getName()); - else { - Name = "operator "; - if (Id.getKind() == UnqualifiedIdKind::IK_OperatorFunctionId) - Name += getOperatorSpelling(Id.OperatorFunctionId.Operator); - else - Name += Id.Identifier->getName(); + // If we had errors before, ObjectType can be dependent even without any + // templates, do not report missing template keyword in that case. + if (!ObjectHadErrors) { + // We have something like t->getAs<T>(), where getAs is a + // member of an unknown specialization. However, this will only + // parse correctly as a template, so suggest the keyword 'template' + // before 'getAs' and treat this as a dependent template name. + std::string Name; + if (Id.getKind() == UnqualifiedIdKind::IK_Identifier) + Name = std::string(Id.Identifier->getName()); + else { + Name = "operator "; + if (Id.getKind() == UnqualifiedIdKind::IK_OperatorFunctionId) + Name += getOperatorSpelling(Id.OperatorFunctionId.Operator); + else + Name += Id.Identifier->getName(); + } + Diag(Id.StartLocation, diag::err_missing_dependent_template_keyword) + << Name + << FixItHint::CreateInsertion(Id.StartLocation, "template "); } - Diag(Id.StartLocation, diag::err_missing_dependent_template_keyword) - << Name - << FixItHint::CreateInsertion(Id.StartLocation, "template "); TNK = Actions.ActOnDependentTemplateName( getCurScope(), SS, TemplateKWLoc, Id, ObjectType, EnteringContext, Template, /*AllowInjectedClassName*/ true); @@ -2691,6 +2700,13 @@ bool Parser::ParseUnqualifiedIdOperator(CXXScopeSpec &SS, bool EnteringContext, /// \param SS The nested-name-specifier that preceded this unqualified-id. If /// non-empty, then we are parsing the unqualified-id of a qualified-id. /// +/// \param ObjectType if this unqualified-id occurs within a member access +/// expression, the type of the base object whose member is being accessed. +/// +/// \param ObjectHadErrors if this unqualified-id occurs within a member access +/// expression, indicates whether the original subexpressions had any errors. +/// When true, diagnostics for missing 'template' keyword will be suppressed. +/// /// \param EnteringContext whether we are entering the scope of the /// nested-name-specifier.
/// @@ -2700,17 +2716,14 @@ bool Parser::ParseUnqualifiedIdOperator(CXXScopeSpec &SS, bool EnteringContext, /// /// \param AllowDeductionGuide whether we allow parsing a deduction guide name. /// -/// \param ObjectType if this unqualified-id occurs within a member access -/// expression, the type of the base object whose member is being accessed. -/// /// \param Result on a successful parse, contains the parsed unqualified-id. /// /// \returns true if parsing fails, false otherwise. -bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext, +bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, ParsedType ObjectType, + bool ObjectHadErrors, bool EnteringContext, bool AllowDestructorName, bool AllowConstructorName, bool AllowDeductionGuide, - ParsedType ObjectType, SourceLocation *TemplateKWLoc, UnqualifiedId &Result) { if (TemplateKWLoc) @@ -2769,8 +2782,9 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext, TemplateTy Template; if (Tok.is(tok::less)) return ParseUnqualifiedIdTemplateId( - SS, TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), Id, IdLoc, - EnteringContext, ObjectType, Result, TemplateSpecified); + SS, ObjectType, ObjectHadErrors, + TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), Id, IdLoc, + EnteringContext, Result, TemplateSpecified); else if (TemplateSpecified && Actions.ActOnDependentTemplateName( getCurScope(), SS, *TemplateKWLoc, Result, ObjectType, @@ -2847,9 +2861,9 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext, Result.getKind() == UnqualifiedIdKind::IK_LiteralOperatorId) && Tok.is(tok::less)) return ParseUnqualifiedIdTemplateId( - SS, TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), nullptr, - SourceLocation(), EnteringContext, ObjectType, Result, - TemplateSpecified); + SS, ObjectType, ObjectHadErrors, + TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), nullptr, + SourceLocation(), EnteringContext, Result, TemplateSpecified); else if (TemplateSpecified && Actions.ActOnDependentTemplateName( getCurScope(), SS, *TemplateKWLoc, Result, ObjectType, @@ -2870,6 +2884,22 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext, // Parse the '~'. SourceLocation TildeLoc = ConsumeToken(); + if (TemplateSpecified) { + // C++ [temp.names]p3: + // A name prefixed by the keyword template shall be a template-id [...] + // + // A template-id cannot begin with a '~' token. This would never work + // anyway: x.~A() would specify that the destructor is a template, + // not that 'A' is a template. + // + // FIXME: Suggest replacing the attempted destructor name with a correct + // destructor name and recover. (This is not trivial if this would become + // a pseudo-destructor name). + Diag(*TemplateKWLoc, diag::err_unexpected_template_in_destructor_name) + << Tok.getLocation(); + return true; + } + if (SS.isEmpty() && Tok.is(tok::kw_decltype)) { DeclSpec DS(AttrFactory); SourceLocation EndLoc = ParseDecltypeSpecifier(DS); @@ -2889,7 +2919,7 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext, // If the user wrote ~T::T, correct it to T::~T. DeclaratorScopeObj DeclScopeObj(*this, SS); - if (!TemplateSpecified && NextToken().is(tok::coloncolon)) { + if (NextToken().is(tok::coloncolon)) { // Don't let ParseOptionalCXXScopeSpecifier() "correct" // `int A; struct { ~A::A(); };` to `int A; struct { ~A:A(); };`, // it will confuse this recovery logic. 
@@ -2899,7 +2929,8 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext, AnnotateScopeToken(SS, /*NewAnnotation*/true); SS.clear(); } - if (ParseOptionalCXXScopeSpecifier(SS, ObjectType, EnteringContext)) + if (ParseOptionalCXXScopeSpecifier(SS, ObjectType, ObjectHadErrors, + EnteringContext)) return true; if (SS.isNotEmpty()) ObjectType = nullptr; @@ -2926,8 +2957,9 @@ bool Parser::ParseUnqualifiedId(CXXScopeSpec &SS, bool EnteringContext, if (Tok.is(tok::less)) { Result.setDestructorName(TildeLoc, nullptr, ClassNameLoc); return ParseUnqualifiedIdTemplateId( - SS, TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), ClassName, - ClassNameLoc, EnteringContext, ObjectType, Result, TemplateSpecified); + SS, ObjectType, ObjectHadErrors, + TemplateKWLoc ? *TemplateKWLoc : SourceLocation(), ClassName, + ClassNameLoc, EnteringContext, Result, TemplateSpecified); } // Note that this is a destructor name. diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 84e7b7c9995c5..7ae9885abe2bd 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -13,6 +13,7 @@ #include "clang/AST/ASTContext.h" #include "clang/AST/StmtOpenMP.h" #include "clang/Basic/OpenMPKinds.h" +#include "clang/Basic/TokenKinds.h" #include "clang/Parse/ParseDiagnostic.h" #include "clang/Parse/Parser.h" #include "clang/Parse/RAIIObjectsForParser.h" @@ -754,7 +755,7 @@ static bool parseDeclareSimdClauses( "Unexpected linear modifier."); if (P.getActions().CheckOpenMPLinearModifier( static_cast(Data.ExtraModifier), - Data.DepLinMapLastLoc)) + Data.ExtraModifierLoc)) Data.ExtraModifier = OMPC_LINEAR_val; LinModifiers.append(Linears.size() - LinModifiers.size(), Data.ExtraModifier); @@ -1365,7 +1366,7 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, } // Parse inner context selectors. 
- OMPTraitInfo TI; + OMPTraitInfo &TI = Actions.getASTContext().getNewOMPTraitInfo(); parseOMPContextSelectors(Loc, TI); // Parse ')' @@ -1843,6 +1844,7 @@ Parser::DeclGroupPtrTy Parser::ParseOpenMPDeclarativeDirectiveWithExtDecl( case OMPD_taskgroup: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_for: case OMPD_for_simd: case OMPD_sections: @@ -2066,6 +2068,7 @@ Parser::ParseOpenMPDeclarativeOrExecutableDirective(ParsedStmtContext StmtCtx) { } case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_taskyield: case OMPD_barrier: case OMPD_taskwait: @@ -2287,12 +2290,14 @@ bool Parser::ParseOpenMPSimpleVarList( NoIdentIsFound = false; if (AllowScopeSpecifier && getLangOpts().CPlusPlus && - ParseOptionalCXXScopeSpecifier(SS, nullptr, false)) { + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, false)) { IsCorrect = false; SkipUntil(tok::comma, tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch); - } else if (ParseUnqualifiedId(SS, false, false, false, false, nullptr, - nullptr, Name)) { + } else if (ParseUnqualifiedId(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, false, false, + false, false, nullptr, Name)) { IsCorrect = false; SkipUntil(tok::comma, tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch); @@ -2340,7 +2345,8 @@ bool Parser::ParseOpenMPSimpleVarList( /// from-clause | is_device_ptr-clause | task_reduction-clause | /// in_reduction-clause | allocator-clause | allocate-clause | /// acq_rel-clause | acquire-clause | release-clause | relaxed-clause | -/// depobj-clause | destroy-clause | detach-clause +/// depobj-clause | destroy-clause | detach-clause | inclusive-clause | +/// exclusive-clause /// OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, OpenMPClauseKind CKind, bool FirstClause) { @@ -2509,6 +2515,8 @@ OMPClause *Parser::ParseOpenMPClause(OpenMPDirectiveKind DKind, case OMPC_is_device_ptr: case OMPC_allocate: case OMPC_nontemporal: + case OMPC_inclusive: + case OMPC_exclusive: Clause = ParseOpenMPVarListClause(DKind, CKind, WrongDirective); break; case OMPC_device_type: @@ -2874,11 +2882,12 @@ static bool ParseReductionId(Parser &P, CXXScopeSpec &ReductionIdScopeSpec, return false; } } - return P.ParseUnqualifiedId(ReductionIdScopeSpec, /*EnteringContext*/ false, - /*AllowDestructorName*/ false, - /*AllowConstructorName*/ false, - /*AllowDeductionGuide*/ false, - nullptr, nullptr, ReductionId); + return P.ParseUnqualifiedId( + ReductionIdScopeSpec, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext*/ false, + /*AllowDestructorName*/ false, + /*AllowConstructorName*/ false, + /*AllowDeductionGuide*/ false, nullptr, ReductionId); } /// Checks if the token is a valid map-type-modifier. @@ -2906,6 +2915,7 @@ bool Parser::parseMapperModifier(OpenMPVarListDataTy &Data) { if (getLangOpts().CPlusPlus) ParseOptionalCXXScopeSpecifier(Data.ReductionOrMapperIdScopeSpec, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false); if (Tok.isNot(tok::identifier) && Tok.isNot(tok::kw_default)) { Diag(Tok.getLocation(), diag::err_omp_mapper_illegal_identifier); @@ -3007,10 +3017,22 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, // Handle reduction-identifier for reduction clause. 
if (Kind == OMPC_reduction || Kind == OMPC_task_reduction || Kind == OMPC_in_reduction) { + Data.ExtraModifier = OMPC_REDUCTION_unknown; + if (Kind == OMPC_reduction && getLangOpts().OpenMP >= 50 && + (Tok.is(tok::identifier) || Tok.is(tok::kw_default)) && + NextToken().is(tok::comma)) { + // Parse optional reduction modifier. + Data.ExtraModifier = getOpenMPSimpleClauseType(Kind, PP.getSpelling(Tok)); + Data.ExtraModifierLoc = Tok.getLocation(); + ConsumeToken(); + assert(Tok.is(tok::comma) && "Expected comma."); + (void)ConsumeToken(); + } ColonProtectionRAIIObject ColonRAII(*this); if (getLangOpts().CPlusPlus) ParseOptionalCXXScopeSpecifier(Data.ReductionOrMapperIdScopeSpec, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false); InvalidReductionId = ParseReductionId( *this, Data.ReductionOrMapperIdScopeSpec, UnqualifiedReductionId); @@ -3030,7 +3052,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, ColonProtectionRAIIObject ColonRAII(*this); Data.ExtraModifier = getOpenMPSimpleClauseType( Kind, Tok.is(tok::identifier) ? PP.getSpelling(Tok) : ""); - Data.DepLinMapLastLoc = Tok.getLocation(); + Data.ExtraModifierLoc = Tok.getLocation(); if (Data.ExtraModifier == OMPC_DEPEND_unknown) { SkipUntil(tok::colon, tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch); @@ -3055,7 +3077,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, Data.ExtraModifier = OMPC_LINEAR_val; if (Tok.is(tok::identifier) && PP.LookAhead(0).is(tok::l_paren)) { Data.ExtraModifier = getOpenMPSimpleClauseType(Kind, PP.getSpelling(Tok)); - Data.DepLinMapLastLoc = ConsumeToken(); + Data.ExtraModifierLoc = ConsumeToken(); LinearT.consumeOpen(); NeedRParenForLinear = true; } @@ -3068,13 +3090,8 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, !isOpenMPTaskLoopDirective(DKind)) && Tok.is(tok::identifier) && PP.LookAhead(0).is(tok::colon)) { Data.ExtraModifier = getOpenMPSimpleClauseType(Kind, PP.getSpelling(Tok)); - Data.DepLinMapLastLoc = Tok.getLocation(); - if (Data.ExtraModifier == OMPC_LASTPRIVATE_unknown) { - SkipUntil(tok::colon, tok::r_paren, tok::annot_pragma_openmp_end, - StopBeforeMatch); - } else { - ConsumeToken(); - } + Data.ExtraModifierLoc = Tok.getLocation(); + ConsumeToken(); assert(Tok.is(tok::colon) && "Expected colon."); Data.ColonLoc = ConsumeToken(); } @@ -3086,7 +3103,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, // map-type-modifier. The map-type can also be delete which has the same // spelling of the C++ delete keyword. Data.ExtraModifier = OMPC_MAP_unknown; - Data.DepLinMapLastLoc = Tok.getLocation(); + Data.ExtraModifierLoc = Tok.getLocation(); // Check for presence of a colon in the map clause. TentativeParsingAction TPA(*this); @@ -3230,8 +3247,8 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, } /// Parsing of OpenMP clause 'private', 'firstprivate', 'lastprivate', -/// 'shared', 'copyin', 'copyprivate', 'flush', 'reduction', 'task_reduction' or -/// 'in_reduction'. +/// 'shared', 'copyin', 'copyprivate', 'flush', 'reduction', 'task_reduction', +/// 'in_reduction', 'nontemporal', 'exclusive' or 'inclusive'. 
/// /// private-clause: /// 'private' '(' list ')' @@ -3246,7 +3263,7 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, /// aligned-clause: /// 'aligned' '(' list [ ':' alignment ] ')' /// reduction-clause: -/// 'reduction' '(' reduction-identifier ':' list ')' +/// 'reduction' '(' [ modifier ',' ] reduction-identifier ':' list ')' /// task_reduction-clause: /// 'task_reduction' '(' reduction-identifier ':' list ')' /// in_reduction-clause: @@ -3271,6 +3288,12 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, /// 'is_device_ptr' '(' list ')' /// allocate-clause: /// 'allocate' '(' [ allocator ':' ] list ')' +/// nontemporal-clause: +/// 'nontemporal' '(' list ')' +/// inclusive-clause: +/// 'inclusive' '(' list ')' +/// exclusive-clause: +/// 'exclusive' '(' list ')' /// /// For 'linear' clause linear-list may have the following forms: /// list @@ -3294,6 +3317,6 @@ OMPClause *Parser::ParseOpenMPVarListClause(OpenMPDirectiveKind DKind, Kind, Vars, Data.TailExpr, Locs, Data.ColonLoc, Data.ReductionOrMapperIdScopeSpec, Data.ReductionOrMapperId, Data.ExtraModifier, Data.MapTypeModifiers, Data.MapTypeModifiersLoc, - Data.IsMapTypeImplicit, Data.DepLinMapLastLoc); + Data.IsMapTypeImplicit, Data.ExtraModifierLoc); } diff --git a/clang/lib/Parse/ParseStmtAsm.cpp b/clang/lib/Parse/ParseStmtAsm.cpp index 2e369448ab6a6..262def2b38a1a 100644 --- a/clang/lib/Parse/ParseStmtAsm.cpp +++ b/clang/lib/Parse/ParseStmtAsm.cpp @@ -220,9 +220,10 @@ ExprResult Parser::ParseMSAsmIdentifier(llvm::SmallVectorImpl &LineToks, // Parse an optional scope-specifier if we're in C++. CXXScopeSpec SS; - if (getLangOpts().CPlusPlus) { - ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext=*/false); - } + if (getLangOpts().CPlusPlus) + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false); // Require an identifier here. SourceLocation TemplateKWLoc; @@ -233,12 +234,13 @@ ExprResult Parser::ParseMSAsmIdentifier(llvm::SmallVectorImpl &LineToks, Result = ParseCXXThis(); Invalid = false; } else { - Invalid = ParseUnqualifiedId(SS, - /*EnteringContext=*/false, - /*AllowDestructorName=*/false, - /*AllowConstructorName=*/false, - /*AllowDeductionGuide=*/false, - /*ObjectType=*/nullptr, &TemplateKWLoc, Id); + Invalid = + ParseUnqualifiedId(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false, + /*AllowDestructorName=*/false, + /*AllowConstructorName=*/false, + /*AllowDeductionGuide=*/false, &TemplateKWLoc, Id); // Perform the lookup. 
Result = Actions.LookupInlineAsmIdentifier(SS, TemplateKWLoc, Id, IsUnevaluatedContext); diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp index 0406820f74a33..802fe35d4f62a 100644 --- a/clang/lib/Parse/ParseTemplate.cpp +++ b/clang/lib/Parse/ParseTemplate.cpp @@ -363,9 +363,11 @@ Parser::ParseConceptDefinition(const ParsedTemplateInfo &TemplateInfo, DiagnoseAndSkipCXX11Attributes(); CXXScopeSpec SS; - if (ParseOptionalCXXScopeSpecifier(SS, ParsedType(), - /*EnteringContext=*/false, /*MayBePseudoDestructor=*/nullptr, - /*IsTypename=*/false, /*LastII=*/nullptr, /*OnlyNamespace=*/true) || + if (ParseOptionalCXXScopeSpecifier( + SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false, + /*MayBePseudoDestructor=*/nullptr, + /*IsTypename=*/false, /*LastII=*/nullptr, /*OnlyNamespace=*/true) || SS.isInvalid()) { SkipUntil(tok::semi); return nullptr; @@ -376,12 +378,12 @@ Parser::ParseConceptDefinition(const ParsedTemplateInfo &TemplateInfo, diag::err_concept_definition_not_identifier); UnqualifiedId Result; - if (ParseUnqualifiedId(SS, /*EnteringContext=*/false, + if (ParseUnqualifiedId(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false, /*AllowDestructorName=*/false, /*AllowConstructorName=*/false, /*AllowDeductionGuide=*/false, - /*ObjectType=*/ParsedType(), /*TemplateKWLoc=*/nullptr, - Result)) { + /*TemplateKWLoc=*/nullptr, Result)) { SkipUntil(tok::semi); return nullptr; } @@ -682,19 +684,19 @@ bool Parser::TryAnnotateTypeConstraint() { return false; CXXScopeSpec SS; bool WasScopeAnnotation = Tok.is(tok::annot_cxxscope); - if (ParseOptionalCXXScopeSpecifier( - SS, ParsedType(), - /*EnteringContext=*/false, - /*MayBePseudoDestructor=*/nullptr, - // If this is not a type-constraint, then - // this scope-spec is part of the typename - // of a non-type template parameter - /*IsTypename=*/true, /*LastII=*/nullptr, - // We won't find concepts in - // non-namespaces anyway, so might as well - // parse this correctly for possible type - // names. - /*OnlyNamespace=*/false)) + if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext=*/false, + /*MayBePseudoDestructor=*/nullptr, + // If this is not a type-constraint, then + // this scope-spec is part of the typename + // of a non-type template parameter + /*IsTypename=*/true, /*LastII=*/nullptr, + // We won't find concepts in + // non-namespaces anyway, so might as well + // parse this correctly for possible type + // names. + /*OnlyNamespace=*/false)) return true; if (Tok.is(tok::identifier)) { @@ -754,7 +756,8 @@ NamedDecl *Parser::ParseTypeParameter(unsigned Depth, unsigned Position) { TemplateIdAnnotation *TypeConstraint = nullptr; bool TypenameKeyword = false; SourceLocation KeyLoc; - ParseOptionalCXXScopeSpecifier(TypeConstraintSS, nullptr, + ParseOptionalCXXScopeSpecifier(TypeConstraintSS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext*/ false); if (Tok.is(tok::annot_template_id)) { // Consume the 'type-constraint'. @@ -1438,7 +1441,8 @@ ParsedTemplateArgument Parser::ParseTemplateTemplateArgument() { // followed by a token that terminates a template argument, such as ',', // '>', or (in some cases) '>>'. 
CXXScopeSpec SS; // nested-name-specifier, if present - ParseOptionalCXXScopeSpecifier(SS, nullptr, + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false); ParsedTemplateArgument Result; diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp index ce463beeddb2e..bca7d021da0a2 100644 --- a/clang/lib/Parse/ParseTentative.cpp +++ b/clang/lib/Parse/ParseTentative.cpp @@ -1119,7 +1119,7 @@ Parser::isExpressionOrTypeSpecifierSimple(tok::TokenKind Kind) { case tok::kw_L__FUNCSIG__: case tok::kw___PRETTY_FUNCTION__: case tok::kw___uuidof: - case tok::kw___unique_stable_name: + case tok::kw___builtin_unique_stable_name: #define TYPE_TRAIT(N,Spelling,K) \ case tok::kw_##Spelling: #include "clang/Basic/TokenKinds.def" diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index 27cb8a2a5e762..0a63ac2d5e1bc 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -1605,7 +1605,9 @@ Parser::TryAnnotateName(CorrectionCandidateCallback *CCC) { CXXScopeSpec SS; if (getLangOpts().CPlusPlus && - ParseOptionalCXXScopeSpecifier(SS, nullptr, EnteringContext)) + ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + EnteringContext)) return ANK_Error; if (Tok.isNot(tok::identifier) || SS.isInvalid()) { @@ -1842,6 +1844,7 @@ bool Parser::TryAnnotateTypeOrScopeToken() { SourceLocation TypenameLoc = ConsumeToken(); CXXScopeSpec SS; if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false, nullptr, /*IsTypename*/ true)) return true; @@ -1914,7 +1917,9 @@ bool Parser::TryAnnotateTypeOrScopeToken() { CXXScopeSpec SS; if (getLangOpts().CPlusPlus) - if (ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext*/false)) + if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + /*EnteringContext*/ false)) return true; return TryAnnotateTypeOrScopeTokenAfterScopeSpec(SS, !WasScopeAnnotation); @@ -2043,7 +2048,9 @@ bool Parser::TryAnnotateCXXScopeToken(bool EnteringContext) { assert(MightBeCXXScopeToken() && "Cannot be a type or scope token!"); CXXScopeSpec SS; - if (ParseOptionalCXXScopeSpecifier(SS, nullptr, EnteringContext)) + if (ParseOptionalCXXScopeSpecifier(SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, + EnteringContext)) return true; if (SS.isEmpty()) return false; @@ -2152,7 +2159,8 @@ bool Parser::ParseMicrosoftIfExistsCondition(IfExistsCondition& Result) { // Parse nested-name-specifier. if (getLangOpts().CPlusPlus) - ParseOptionalCXXScopeSpecifier(Result.SS, nullptr, + ParseOptionalCXXScopeSpecifier(Result.SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext=*/false); // Check nested-name specifier. @@ -2163,10 +2171,12 @@ bool Parser::ParseMicrosoftIfExistsCondition(IfExistsCondition& Result) { // Parse the unqualified-id. SourceLocation TemplateKWLoc; // FIXME: parsed, but unused. 
- if (ParseUnqualifiedId( - Result.SS, /*EnteringContext*/false, /*AllowDestructorName*/true, - /*AllowConstructorName*/true, /*AllowDeductionGuide*/false, nullptr, - &TemplateKWLoc, Result.Name)) { + if (ParseUnqualifiedId(Result.SS, /*ObjectType=*/nullptr, + /*ObjectHadErrors=*/false, /*EnteringContext*/ false, + /*AllowDestructorName*/ true, + /*AllowConstructorName*/ true, + /*AllowDeductionGuide*/ false, &TemplateKWLoc, + Result.Name)) { T.skipToEnd(); return true; } diff --git a/clang/lib/Sema/MultiplexExternalSemaSource.cpp b/clang/lib/Sema/MultiplexExternalSemaSource.cpp index 2b0cd6b8c4fc3..80333e63127e4 100644 --- a/clang/lib/Sema/MultiplexExternalSemaSource.cpp +++ b/clang/lib/Sema/MultiplexExternalSemaSource.cpp @@ -275,6 +275,12 @@ void MultiplexExternalSemaSource::ReadExtVectorDecls( Sources[i]->ReadExtVectorDecls(Decls); } +void MultiplexExternalSemaSource::ReadDeclsToCheckForDeferredDiags( + llvm::SmallVector &Decls) { + for(size_t i = 0; i < Sources.size(); ++i) + Sources[i]->ReadDeclsToCheckForDeferredDiags(Decls); +} + void MultiplexExternalSemaSource::ReadUnusedLocalTypedefNameCandidates( llvm::SmallSetVector &Decls) { for(size_t i = 0; i < Sources.size(); ++i) diff --git a/clang/lib/Sema/ParsedAttr.cpp b/clang/lib/Sema/ParsedAttr.cpp index fa7b59de3e31a..6d96ea96cd371 100644 --- a/clang/lib/Sema/ParsedAttr.cpp +++ b/clang/lib/Sema/ParsedAttr.cpp @@ -140,7 +140,8 @@ const ParsedAttrInfo &ParsedAttrInfo::get(const AttributeCommonInfo &A) { return *Ptr; // If we failed to find a match then return a default ParsedAttrInfo. - static ParsedAttrInfo DefaultParsedAttrInfo; + static ParsedAttrInfo DefaultParsedAttrInfo( + AttributeCommonInfo::UnknownAttribute); return DefaultParsedAttrInfo; } diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index ccfe79e41bb2f..ff11e97c5783b 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "UsedDeclVisitor.h" #include "clang/AST/ASTContext.h" #include "clang/AST/ASTDiagnostic.h" #include "clang/AST/DeclCXX.h" @@ -974,12 +975,9 @@ void Sema::ActOnEndOfTranslationUnitFragment(TUFragmentKind Kind) { if (SyclIntHeader != nullptr) SyclIntHeader->emit(getLangOpts().SYCLIntHeader); MarkDevice(); - finalizeSYCLDelayedAnalysis(); } - // Finalize analysis of OpenMP-specific constructs. - if (LangOpts.OpenMP) - finalizeOpenMPDelayedAnalysis(); + emitDeferredDiags(); assert(LateParsedInstantiations.empty() && "end of TU template instantiation should not create more " @@ -1474,27 +1472,114 @@ static void emitCallStackNotes(Sema &S, FunctionDecl *FD) { // Emit any deferred diagnostics for FD and erase them from the map in which // they're stored. 
-static void emitDeferredDiags(Sema &S, FunctionDecl *FD, bool ShowCallStack) { - auto It = S.DeviceDeferredDiags.find(FD); - if (It == S.DeviceDeferredDiags.end()) +void Sema::emitDeferredDiags(FunctionDecl *FD, bool ShowCallStack) { + auto It = DeviceDeferredDiags.find(FD); + if (It == DeviceDeferredDiags.end()) return; bool HasWarningOrError = false; + bool FirstDiag = true; for (PartialDiagnosticAt &PDAt : It->second) { const SourceLocation &Loc = PDAt.first; const PartialDiagnostic &PD = PDAt.second; - HasWarningOrError |= S.getDiagnostics().getDiagnosticLevel( + HasWarningOrError |= getDiagnostics().getDiagnosticLevel( PD.getDiagID(), Loc) >= DiagnosticsEngine::Warning; - DiagnosticBuilder Builder(S.Diags.Report(Loc, PD.getDiagID())); - Builder.setForceEmit(); - PD.Emit(Builder); + { + DiagnosticBuilder Builder(Diags.Report(Loc, PD.getDiagID())); + Builder.setForceEmit(); + PD.Emit(Builder); + } + + // Emit the note on the first diagnostic in case too many diagnostics cause + // the note not emitted. + if (FirstDiag && HasWarningOrError && ShowCallStack) { + emitCallStackNotes(*this, FD); + FirstDiag = false; + } } - S.DeviceDeferredDiags.erase(It); - // FIXME: Should this be called after every warning/error emitted in the loop - // above, instead of just once per function? That would be consistent with - // how we handle immediate errors, but it also seems like a bit much. - if (HasWarningOrError && ShowCallStack) - emitCallStackNotes(S, FD); +} + +namespace { +/// Helper class that emits deferred diagnostic messages if an entity directly +/// or indirectly using the function that causes the deferred diagnostic +/// messages is known to be emitted. +class DeferredDiagnosticsEmitter + : public UsedDeclVisitor { +public: + typedef UsedDeclVisitor Inherited; + llvm::SmallSet, 4> Visited; + llvm::SmallVector, 4> UseStack; + bool ShouldEmit; + unsigned InOMPDeviceContext; + + DeferredDiagnosticsEmitter(Sema &S) + : Inherited(S), ShouldEmit(false), InOMPDeviceContext(0) {} + + void VisitOMPTargetDirective(OMPTargetDirective *Node) { + ++InOMPDeviceContext; + Inherited::VisitOMPTargetDirective(Node); + --InOMPDeviceContext; + } + + void visitUsedDecl(SourceLocation Loc, Decl *D) { + if (auto *FD = dyn_cast(D)) { + FunctionDecl *Caller = UseStack.empty() ? nullptr : UseStack.back(); + auto IsKnownEmitted = S.getEmissionStatus(FD, /*Final=*/true) == + Sema::FunctionEmissionStatus::Emitted; + if (!Caller) + ShouldEmit = IsKnownEmitted; + if ((!ShouldEmit && !S.getLangOpts().OpenMP && !Caller) || + S.shouldIgnoreInHostDeviceCheck(FD) || Visited.count(D)) + return; + // Finalize analysis of OpenMP-specific constructs. + if (Caller && S.LangOpts.OpenMP && UseStack.size() == 1) + S.finalizeOpenMPDelayedAnalysis(Caller, FD, Loc); + // Finalize analysis of SYCL-specific constructs. 
+ if (Caller && S.LangOpts.SYCLIsDevice) + S.finalizeSYCLDelayedAnalysis(Caller, FD, Loc); + if (Caller) + S.DeviceKnownEmittedFns[FD] = {Caller, Loc}; + if (ShouldEmit || InOMPDeviceContext) + S.emitDeferredDiags(FD, Caller); + Visited.insert(D); + UseStack.push_back(FD); + if (auto *S = FD->getBody()) { + this->Visit(S); + } + UseStack.pop_back(); + Visited.erase(D); + } else if (auto *VD = dyn_cast(D)) { + if (auto *Init = VD->getInit()) { + if (S.LangOpts.SYCLIsDevice) + return; + auto DevTy = OMPDeclareTargetDeclAttr::getDeviceType(VD); + bool IsDev = DevTy && (*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost || + *DevTy == OMPDeclareTargetDeclAttr::DT_Any); + if (IsDev) + ++InOMPDeviceContext; + this->Visit(Init); + if (IsDev) + --InOMPDeviceContext; + } + } else + Inherited::visitUsedDecl(Loc, D); + } +}; +} // namespace + +void Sema::emitDeferredDiags() { + if (ExternalSource) + ExternalSource->ReadDeclsToCheckForDeferredDiags( + DeclsToCheckForDeferredDiags); + + if ((DeviceDeferredDiags.empty() && !LangOpts.OpenMP && + !LangOpts.SYCLIsDevice) || + DeclsToCheckForDeferredDiags.empty()) + return; + + DeferredDiagnosticsEmitter DDE(*this); + for (auto D : DeclsToCheckForDeferredDiags) + DDE.visitUsedDecl(SourceLocation(), D); } // In CUDA, there are some constructs which may appear in semantically-valid @@ -1567,71 +1652,6 @@ Sema::DeviceDiagBuilder::~DeviceDiagBuilder() { } } -// Indicate that this function (and thus everything it transtively calls) will -// be codegen'ed, and emit any deferred diagnostics on this function and its -// (transitive) callees. -void Sema::markKnownEmitted( - Sema &S, FunctionDecl *OrigCaller, FunctionDecl *OrigCallee, - SourceLocation OrigLoc, - const llvm::function_ref IsKnownEmitted) { - // Nothing to do if we already know that FD is emitted. - if (IsKnownEmitted(S, OrigCallee)) { - assert(!S.DeviceCallGraph.count(OrigCallee)); - return; - } - - // We've just discovered that OrigCallee is known-emitted. Walk our call - // graph to see what else we can now discover also must be emitted. - - struct CallInfo { - FunctionDecl *Caller; - FunctionDecl *Callee; - SourceLocation Loc; - }; - llvm::SmallVector Worklist = {{OrigCaller, OrigCallee, OrigLoc}}; - llvm::SmallSet, 4> Seen; - Seen.insert(OrigCallee); - while (!Worklist.empty()) { - CallInfo C = Worklist.pop_back_val(); - assert(!IsKnownEmitted(S, C.Callee) && - "Worklist should not contain known-emitted functions."); - S.DeviceKnownEmittedFns[C.Callee] = {C.Caller, C.Loc}; - emitDeferredDiags(S, C.Callee, C.Caller); - - // If this is a template instantiation, explore its callgraph as well: - // Non-dependent calls are part of the template's callgraph, while dependent - // calls are part of to the instantiation's call graph. - if (auto *Templ = C.Callee->getPrimaryTemplate()) { - FunctionDecl *TemplFD = Templ->getAsFunction(); - if (!Seen.count(TemplFD) && !S.DeviceKnownEmittedFns.count(TemplFD)) { - Seen.insert(TemplFD); - Worklist.push_back( - {/* Caller = */ C.Caller, /* Callee = */ TemplFD, C.Loc}); - } - } - - // Add all functions called by Callee to our worklist. 
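// Editorial example, not part of the diff: the user-visible behavior that both the
// removed markKnownEmitted walk and the new DeferredDiagnosticsEmitter implement.
// Sketch assumes CUDA; the same deferral applies to OpenMP and SYCL device code.
__host__ __device__ void hd() { throw 0; }  // not an error while nothing emitted for the device uses hd()
__global__ void kernel() { hd(); }          // hd() becomes known-emitted for the device, so the deferred
                                            // exception diagnostic is now issued with a call-stack note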
- auto CGIt = S.DeviceCallGraph.find(C.Callee); - if (CGIt == S.DeviceCallGraph.end()) - continue; - - for (std::pair, SourceLocation> FDLoc : - CGIt->second) { - FunctionDecl *NewCallee = FDLoc.first; - SourceLocation CallLoc = FDLoc.second; - if (Seen.count(NewCallee) || IsKnownEmitted(S, NewCallee)) - continue; - Seen.insert(NewCallee); - Worklist.push_back( - {/* Caller = */ C.Callee, /* Callee = */ NewCallee, CallLoc}); - } - - // C.Callee is now known-emitted, so we no longer need to maintain its list - // of callees in DeviceCallGraph. - S.DeviceCallGraph.erase(CGIt); - } -} - Sema::DeviceDiagBuilder Sema::targetDiag(SourceLocation Loc, unsigned DiagID) { if (LangOpts.OpenMP) return LangOpts.OpenMPIsDevice ? diagIfOpenMPDeviceCode(Loc, DiagID) diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp index 5bea9c8750b45..faab250e58ffd 100644 --- a/clang/lib/Sema/SemaCUDA.cpp +++ b/clang/lib/Sema/SemaCUDA.cpp @@ -675,25 +675,6 @@ bool Sema::CheckCUDACall(SourceLocation Loc, FunctionDecl *Callee) { // Otherwise, mark the call in our call graph so we can traverse it later. bool CallerKnownEmitted = getEmissionStatus(Caller) == FunctionEmissionStatus::Emitted; - if (CallerKnownEmitted) { - // Host-side references to a __global__ function refer to the stub, so the - // function itself is never emitted and therefore should not be marked. - if (!shouldIgnoreInHostDeviceCheck(Callee)) - markKnownEmitted( - *this, Caller, Callee, Loc, [](Sema &S, FunctionDecl *FD) { - return S.getEmissionStatus(FD) == FunctionEmissionStatus::Emitted; - }); - } else { - // If we have - // host fn calls kernel fn calls host+device, - // the HD function does not get instantiated on the host. We model this by - // omitting at the call to the kernel from the callgraph. This ensures - // that, when compiling for host, only HD functions actually called from the - // host get marked as known-emitted. - if (!shouldIgnoreInHostDeviceCheck(Callee)) - DeviceCallGraph[Caller].insert({Callee, Loc}); - } - DeviceDiagBuilder::Kind DiagKind = [this, Caller, Callee, CallerKnownEmitted] { switch (IdentifyCUDAPreference(Caller, Callee)) { diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index 2d0a2298329ed..73f9a86c12e3e 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -2652,6 +2652,13 @@ void CastOperation::CheckCStyleCast() { return; } + // Allow casting a sizeless built-in type to itself. + if (DestType->isSizelessBuiltinType() && + Self.Context.hasSameUnqualifiedType(DestType, SrcType)) { + Kind = CK_NoOp; + return; + } + if (!DestType->isScalarType() && !DestType->isVectorType()) { const RecordType *DestRecordTy = DestType->getAs(); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 6c497f5673840..853e14b509c13 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -1649,11 +1649,16 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, case Builtin::BI__builtin_nontemporal_store: return SemaBuiltinNontemporalOverloaded(TheCallResult); case Builtin::BI__builtin_memcpy_inline: { - // __builtin_memcpy_inline size argument is a constant by definition. - if (TheCall->getArg(2)->EvaluateKnownConstInt(Context).isNullValue()) + clang::Expr *SizeOp = TheCall->getArg(2); + // We warn about copying to or from `nullptr` pointers when `size` is + // greater than 0. When `size` is value dependent we cannot evaluate its + // value so we bail out. 
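// Editorial example, not part of the diff: calls showing how the revised
// __builtin_memcpy_inline check behaves. Null arguments are only checked when the
// constant size is non-zero, and the check is skipped while the size is value dependent.
template <unsigned N> void copy_n(char *dst, const char *src) {
  __builtin_memcpy_inline(dst, src, N);      // value dependent here: check skipped at definition time
}
void copy_examples(char *dst, const char *src) {
  __builtin_memcpy_inline(dst, src, 0);      // size == 0: no non-null requirement
  __builtin_memcpy_inline(dst, nullptr, 8);  // size > 0: null source argument is diagnosed
}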
+ if (SizeOp->isValueDependent()) break; - CheckNonNullArgument(*this, TheCall->getArg(0), TheCall->getExprLoc()); - CheckNonNullArgument(*this, TheCall->getArg(1), TheCall->getExprLoc()); + if (!SizeOp->EvaluateKnownConstInt(Context).isNullValue()) { + CheckNonNullArgument(*this, TheCall->getArg(0), TheCall->getExprLoc()); + CheckNonNullArgument(*this, TheCall->getArg(1), TheCall->getExprLoc()); + } break; } #define BUILTIN(ID, TYPE, ATTRS) diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index b4d3258371274..04b231109c9d4 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -21,6 +21,7 @@ #include "clang/AST/DeclObjC.h" #include "clang/AST/DeclTemplate.h" #include "clang/AST/EvaluatedExprVisitor.h" +#include "clang/AST/Expr.h" #include "clang/AST/ExprCXX.h" #include "clang/AST/NonTrivialTypeVisitor.h" #include "clang/AST/StmtCXX.h" @@ -8778,6 +8779,9 @@ Sema::ActOnFunctionDeclarator(Scope *S, Declarator &D, DeclContext *DC, QualType R = TInfo->getType(); assert(R->isFunctionType()); + if (R.getCanonicalType()->castAs()->getCmseNSCallAttr()) + Diag(D.getIdentifierLoc(), diag::err_function_decl_cmse_ns_call); + SmallVector TemplateParamLists; for (TemplateParameterList *TPL : TemplateParamListsRef) TemplateParamLists.push_back(TPL); @@ -11513,6 +11517,7 @@ QualType Sema::deduceVarTypeFromInitializer(VarDecl *VDecl, bool Sema::DeduceVariableDeclarationType(VarDecl *VDecl, bool DirectInit, Expr *Init) { + assert(!Init || !Init->containsErrors()); QualType DeducedType = deduceVarTypeFromInitializer( VDecl, VDecl->getDeclName(), VDecl->getType(), VDecl->getTypeSourceInfo(), VDecl->getSourceRange(), DirectInit, Init); @@ -11546,6 +11551,9 @@ bool Sema::DeduceVariableDeclarationType(VarDecl *VDecl, bool DirectInit, void Sema::checkNonTrivialCUnionInInitializer(const Expr *Init, SourceLocation Loc) { + if (auto *EWC = dyn_cast(Init)) + Init = EWC->getSubExpr(); + if (auto *CE = dyn_cast(Init)) Init = CE->getSubExpr(); @@ -11847,7 +11855,7 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { // be deduced based on the chosen correction if the original init contains a // TypoExpr. 
ExprResult Res = CorrectDelayedTyposInExpr(Init, VDecl); - if (!Res.isUsable()) { + if (!Res.isUsable() || Res.get()->containsErrors()) { RealDecl->setInvalidDecl(); return; } @@ -12257,6 +12265,8 @@ void Sema::AddInitializerToDecl(Decl *RealDecl, Expr *Init, bool DirectInit) { VDecl->setInitStyle(VarDecl::ListInit); } + if (LangOpts.OpenMP && VDecl->hasGlobalStorage()) + DeclsToCheckForDeferredDiags.push_back(VDecl); CheckCompleteVariableDeclaration(VDecl); } @@ -14373,6 +14383,13 @@ Decl *Sema::ActOnFinishFunctionBody(Decl *dcl, Stmt *Body, DiscardCleanupsInEvaluationContext(); } + if (LangOpts.OpenMP || LangOpts.CUDA || LangOpts.SYCLIsDevice) { + auto ES = getEmissionStatus(FD); + if (ES == Sema::FunctionEmissionStatus::Emitted || + ES == Sema::FunctionEmissionStatus::Unknown) + DeclsToCheckForDeferredDiags.push_back(FD); + } + return dcl; } @@ -18036,7 +18053,8 @@ Decl *Sema::getObjCDeclContext() const { return (dyn_cast_or_null(CurContext)); } -Sema::FunctionEmissionStatus Sema::getEmissionStatus(FunctionDecl *FD) { +Sema::FunctionEmissionStatus Sema::getEmissionStatus(FunctionDecl *FD, + bool Final) { // Due to SYCL functions are template we check if they have appropriate // attribute prior to checking if it is a template if (LangOpts.SYCLIsDevice && @@ -18054,8 +18072,10 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(FunctionDecl *FD) { if (DevTy.hasValue()) { if (*DevTy == OMPDeclareTargetDeclAttr::DT_Host) OMPES = FunctionEmissionStatus::OMPDiscarded; - else if (DeviceKnownEmittedFns.count(FD) > 0) + else if (*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost || + *DevTy == OMPDeclareTargetDeclAttr::DT_Any) { OMPES = FunctionEmissionStatus::Emitted; + } } } else if (LangOpts.OpenMP) { // In OpenMP 4.5 all the functions are host functions. @@ -18071,10 +18091,11 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(FunctionDecl *FD) { if (DevTy.hasValue()) { if (*DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) { OMPES = FunctionEmissionStatus::OMPDiscarded; - } else if (DeviceKnownEmittedFns.count(FD) > 0) { + } else if (*DevTy == OMPDeclareTargetDeclAttr::DT_Host || + *DevTy == OMPDeclareTargetDeclAttr::DT_Any) OMPES = FunctionEmissionStatus::Emitted; - } - } + } else if (Final) + OMPES = FunctionEmissionStatus::Emitted; } } if (OMPES == FunctionEmissionStatus::OMPDiscarded || @@ -18126,9 +18147,7 @@ Sema::FunctionEmissionStatus Sema::getEmissionStatus(FunctionDecl *FD) { // Otherwise, the function is known-emitted if it's in our set of // known-emitted functions. - return (DeviceKnownEmittedFns.count(FD) > 0) - ? 
FunctionEmissionStatus::Emitted - : FunctionEmissionStatus::Unknown; + return FunctionEmissionStatus::Unknown; } bool Sema::shouldIgnoreInHostDeviceCheck(FunctionDecl *Callee) { diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index e433bbc992af8..deb3a66630244 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -1992,6 +1992,20 @@ static void handleCommonAttr(Sema &S, Decl *D, const ParsedAttr &AL) { D->addAttr(CA); } +static void handleCmseNSEntryAttr(Sema &S, Decl *D, const ParsedAttr &AL) { + if (S.LangOpts.CPlusPlus && !D->getDeclContext()->isExternCContext()) { + S.Diag(AL.getLoc(), diag::err_attribute_not_clinkage) << AL; + return; + } + + if (cast(D)->getStorageClass() == SC_Static) { + S.Diag(AL.getLoc(), diag::warn_attribute_cmse_entry_static); + return; + } + + D->addAttr(::new (S.Context) CmseNSEntryAttr(S.Context, AL)); +} + static void handleNakedAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (checkAttrMutualExclusion(S, D, AL)) return; @@ -5552,6 +5566,17 @@ static bool ArmCdeAliasValid(unsigned BuiltinID, StringRef AliasName) { return ArmBuiltinAliasValid(BuiltinID, AliasName, Map, IntrinNames); } +static bool ArmSveAliasValid(unsigned BuiltinID, StringRef AliasName) { + switch (BuiltinID) { + default: + return false; +#define GET_SVE_BUILTINS +#define BUILTIN(name, types, attr) case SVE::BI##name: +#include "clang/Basic/arm_sve_builtins.inc" + return true; + } +} + static void handleArmBuiltinAliasAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (!AL.isArgIdent(0)) { S.Diag(AL.getLoc(), diag::err_attribute_argument_n_type) @@ -5563,8 +5588,10 @@ static void handleArmBuiltinAliasAttr(Sema &S, Decl *D, const ParsedAttr &AL) { unsigned BuiltinID = Ident->getBuiltinID(); StringRef AliasName = cast(D)->getIdentifier()->getName(); - if (!ArmMveAliasValid(BuiltinID, AliasName) && - !ArmCdeAliasValid(BuiltinID, AliasName)) { + bool IsAArch64 = S.Context.getTargetInfo().getTriple().isAArch64(); + if ((IsAArch64 && !ArmSveAliasValid(BuiltinID, AliasName)) || + (!IsAArch64 && !ArmMveAliasValid(BuiltinID, AliasName) && + !ArmCdeAliasValid(BuiltinID, AliasName))) { S.Diag(AL.getLoc(), diag::err_attribute_arm_builtin_alias); return; } @@ -7317,6 +7344,8 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, switch (AL.getKind()) { default: + if (AL.getInfo().handleDeclAttribute(S, D, AL) != ParsedAttrInfo::NotHandled) + break; if (!AL.isStmtAttr()) { // Type attributes are handled elsewhere; silently move on. 
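// Editorial example, not part of the diff: declarations exercising the new
// handleCmseNSEntryAttr above (Armv8-M Security Extensions; assumes a target compiled
// with -mcmse). A companion check for mismatched __attribute__((cmse_nonsecure_call))
// function pointer conversions is added further down in SemaExpr.cpp.
extern "C" {
__attribute__((cmse_nonsecure_entry)) int ns_entry(int x);       // OK: C language linkage
static __attribute__((cmse_nonsecure_entry)) int hidden(int x);  // warned: entry function is static
}
__attribute__((cmse_nonsecure_entry)) int cxx_entry(int x);      // error in C++: requires C linkage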
assert(AL.isTypeAttr() && "Non-type attribute not handled"); @@ -7339,15 +7368,9 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, handleSimpleAttributeWithExclusions(S, D, AL); break; - case ParsedAttr::AT_NoMips16: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_MicroMips: handleSimpleAttributeWithExclusions(S, D, AL); break; - case ParsedAttr::AT_NoMicroMips: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_MipsLongCall: handleSimpleAttributeWithExclusions( S, D, AL); @@ -7383,9 +7406,6 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_WebAssemblyImportName: handleWebAssemblyImportNameAttr(S, D, AL); break; - case ParsedAttr::AT_IBAction: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_IBOutlet: handleIBOutlet(S, D, AL); break; @@ -7410,9 +7430,6 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_AlwaysInline: handleAlwaysInlineAttr(S, D, AL); break; - case ParsedAttr::AT_Artificial: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_AnalyzerNoReturn: handleAnalyzerNoReturnAttr(S, D, AL); break; @@ -7444,9 +7461,6 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_Constructor: handleConstructorAttr(S, D, AL); break; - case ParsedAttr::AT_CXX11NoReturn: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_Deprecated: handleDeprecatedAttr(S, D, AL); break; @@ -7474,15 +7488,9 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_OptimizeNone: handleOptimizeNoneAttr(S, D, AL); break; - case ParsedAttr::AT_FlagEnum: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_EnumExtensibility: handleEnumExtensibilityAttr(S, D, AL); break; - case ParsedAttr::AT_Flatten: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_SYCLKernel: handleSYCLKernelAttr(S, D, AL); break; @@ -7524,27 +7532,9 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_Restrict: handleRestrictAttr(S, D, AL); break; - case ParsedAttr::AT_LifetimeBound: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_MayAlias: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_Mode: handleModeAttr(S, D, AL); break; - case ParsedAttr::AT_NoAlias: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_NoCommon: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_NoSplitStack: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_NoUniqueAddress: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_NonNull: if (auto *PVD = dyn_cast(D)) handleNonNullAttrParameter(S, PVD, AL); @@ -7563,9 +7553,6 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_AllocAlign: handleAllocAlignAttr(S, D, AL); break; - case ParsedAttr::AT_Overloadable: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_Ownership: handleOwnershipAttr(S, D, AL); break; @@ -7621,9 +7608,6 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_ObjCRuntimeName: handleObjCRuntimeName(S, D, AL); break; - case ParsedAttr::AT_ObjCRuntimeVisible: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_ObjCBoxable: handleObjCBoxable(S, D, AL); break; @@ -7641,12 +7625,6 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, S.AddXConsumedAttr(D, AL, parsedAttrToRetainOwnershipKind(AL), /*IsTemplateInstantiation=*/false); break; - 
case ParsedAttr::AT_NSConsumesSelf: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_OSConsumesThis: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_OSReturnsRetainedOnZero: handleSimpleAttributeOrDiagnose( S, D, AL, isValidOSObjectOutParameter(D), @@ -7692,9 +7670,6 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_VecTypeHint: handleVecTypeHint(S, D, AL); break; - case ParsedAttr::AT_ConstInit: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_InitPriority: handleInitPriorityAttr(S, D, AL); break; @@ -7725,12 +7700,6 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_Unavailable: handleAttrWithMessage(S, D, AL); break; - case ParsedAttr::AT_ArcWeakrefUnavailable: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_ObjCRootClass: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_ObjCDirect: handleObjCDirectAttr(S, D, AL); break; @@ -7738,27 +7707,12 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, handleObjCDirectMembersAttr(S, D, AL); handleSimpleAttribute(S, D, AL); break; - case ParsedAttr::AT_ObjCNonLazyClass: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_ObjCSubclassingRestricted: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_ObjCClassStub: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_ObjCExplicitProtocolImpl: handleObjCSuppresProtocolAttr(S, D, AL); break; - case ParsedAttr::AT_ObjCRequiresPropertyDefs: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_Unused: handleUnusedAttr(S, D, AL); break; - case ParsedAttr::AT_ReturnsTwice: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_NotTailCalled: handleSimpleAttributeWithExclusions( S, D, AL); @@ -7767,24 +7721,15 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, handleSimpleAttributeWithExclusions(S, D, AL); break; - case ParsedAttr::AT_Used: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_Visibility: handleVisibilityAttr(S, D, AL, false); break; case ParsedAttr::AT_TypeVisibility: handleVisibilityAttr(S, D, AL, true); break; - case ParsedAttr::AT_WarnUnused: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_WarnUnusedResult: handleWarnUnusedResult(S, D, AL); break; - case ParsedAttr::AT_Weak: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_WeakRef: handleWeakRefAttr(S, D, AL); break; @@ -7794,9 +7739,6 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_TransparentUnion: handleTransparentUnionAttr(S, D, AL); break; - case ParsedAttr::AT_ObjCException: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_ObjCMethodFamily: handleObjCMethodFamilyAttr(S, D, AL); break; @@ -7812,36 +7754,14 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_Sentinel: handleSentinelAttr(S, D, AL); break; - case ParsedAttr::AT_Const: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_Pure: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_Cleanup: handleCleanupAttr(S, D, AL); break; case ParsedAttr::AT_NoDebug: handleNoDebugAttr(S, D, AL); break; - case ParsedAttr::AT_NoDuplicate: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_Convergent: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_NoInline: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_NoInstrumentFunction: // Interacts 
with -pg. - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_NoStackProtector: - // Interacts with -fstack-protector options. - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_CFICanonicalJumpTable: - handleSimpleAttribute(S, D, AL); + case ParsedAttr::AT_CmseNSEntry: + handleCmseNSEntryAttr(S, D, AL); break; case ParsedAttr::AT_StdCall: case ParsedAttr::AT_CDecl: @@ -7867,9 +7787,6 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_Pointer: handleLifetimeCategoryAttr(S, D, AL); break; - case ParsedAttr::AT_OpenCLKernel: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_OpenCLAccess: handleOpenCLAccessAttr(S, D, AL); break; @@ -7888,38 +7805,17 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_InternalLinkage: handleInternalLinkageAttr(S, D, AL); break; - case ParsedAttr::AT_ExcludeFromExplicitInstantiation: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_LTOVisibilityPublic: - handleSimpleAttribute(S, D, AL); - break; // Microsoft attributes: - case ParsedAttr::AT_EmptyBases: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_LayoutVersion: handleLayoutVersion(S, D, AL); break; - case ParsedAttr::AT_TrivialABI: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_MSNoVTable: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_MSStruct: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_Uuid: handleUuidAttr(S, D, AL); break; case ParsedAttr::AT_MSInheritance: handleMSInheritanceAttr(S, D, AL); break; - case ParsedAttr::AT_SelectAny: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_Thread: handleDeclspecThreadAttr(S, D, AL); break; @@ -7938,24 +7834,15 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_AssertSharedLock: handleAssertSharedLockAttr(S, D, AL); break; - case ParsedAttr::AT_GuardedVar: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_PtGuardedVar: handlePtGuardedVarAttr(S, D, AL); break; - case ParsedAttr::AT_ScopedLockable: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_NoSanitize: handleNoSanitizeAttr(S, D, AL); break; case ParsedAttr::AT_NoSanitizeSpecific: handleNoSanitizeSpecificAttr(S, D, AL); break; - case ParsedAttr::AT_NoThreadSafetyAnalysis: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_GuardedBy: handleGuardedByAttr(S, D, AL); break; @@ -8007,12 +7894,6 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, case ParsedAttr::AT_Consumable: handleConsumableAttr(S, D, AL); break; - case ParsedAttr::AT_ConsumableAutoCast: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_ConsumableSetOnRead: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_CallableWhen: handleCallableWhenAttr(S, D, AL); break; @@ -8080,19 +7961,7 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, handleSYCLIntelPipeIOAttr(S, D, AL); break; - case ParsedAttr::AT_AnyX86NoCallerSavedRegisters: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_RenderScriptKernel: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_SYCLIntelKernelArgsRestrict: - handleSimpleAttribute(S, D, AL); - break; // XRay attributes. 
- case ParsedAttr::AT_XRayInstrument: - handleSimpleAttribute(S, D, AL); - break; case ParsedAttr::AT_XRayLogArgs: handleXRayLogArgsAttr(S, D, AL); break; @@ -8101,11 +7970,6 @@ static void ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, handlePatchableFunctionEntryAttr(S, D, AL); break; - // Move semantics attribute. - case ParsedAttr::AT_Reinitializes: - handleSimpleAttribute(S, D, AL); - break; - case ParsedAttr::AT_AlwaysDestroy: case ParsedAttr::AT_NoDestroy: handleDestroyAttr(S, D, AL); diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 9f19dae5dbe93..23d099a9f2e88 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -6322,7 +6322,6 @@ static bool canPassInRegisters(Sema &S, CXXRecordDecl *D, /// /// \param DiagID the primary error to report. /// \param MD the overriding method. -/// \param OEK which overrides to include as notes. static bool ReportOverrides(Sema &S, unsigned DiagID, const CXXMethodDecl *MD, llvm::function_ref Report) { @@ -14765,12 +14764,14 @@ Sema::BuildCXXConstructExpr(SourceLocation ConstructLoc, QualType DeclInitType, !checkSYCLDeviceFunction(ConstructLoc, Constructor)) return ExprError(); - return CXXConstructExpr::Create( - Context, DeclInitType, ConstructLoc, Constructor, Elidable, - ExprArgs, HadMultipleCandidates, IsListInitialization, - IsStdInitListInitialization, RequiresZeroInit, - static_cast(ConstructKind), - ParenRange); + return CheckForImmediateInvocation( + CXXConstructExpr::Create( + Context, DeclInitType, ConstructLoc, Constructor, Elidable, ExprArgs, + HadMultipleCandidates, IsListInitialization, + IsStdInitListInitialization, RequiresZeroInit, + static_cast(ConstructKind), + ParenRange), + Constructor); } ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) { diff --git a/clang/lib/Sema/SemaDeclObjC.cpp b/clang/lib/Sema/SemaDeclObjC.cpp index 369b04ed8e25a..934e1a23141cd 100644 --- a/clang/lib/Sema/SemaDeclObjC.cpp +++ b/clang/lib/Sema/SemaDeclObjC.cpp @@ -4581,6 +4581,62 @@ static void checkObjCMethodX86VectorTypes(Sema &SemaRef, << (Triple.isMacOSX() ? "macOS 10.11" : "iOS 9"); } +static void mergeObjCDirectMembers(Sema &S, Decl *CD, ObjCMethodDecl *Method) { + if (!Method->isDirectMethod() && !Method->hasAttr() && + CD->hasAttr()) { + Method->addAttr( + ObjCDirectAttr::CreateImplicit(S.Context, Method->getLocation())); + } +} + +static void checkObjCDirectMethodClashes(Sema &S, ObjCInterfaceDecl *IDecl, + ObjCMethodDecl *Method, + ObjCImplDecl *ImpDecl = nullptr) { + auto Sel = Method->getSelector(); + bool isInstance = Method->isInstanceMethod(); + bool diagnosed = false; + + auto diagClash = [&](const ObjCMethodDecl *IMD) { + if (diagnosed || IMD->isImplicit()) + return; + if (Method->isDirectMethod() || IMD->isDirectMethod()) { + S.Diag(Method->getLocation(), diag::err_objc_direct_duplicate_decl) + << Method->isDirectMethod() << /* method */ 0 << IMD->isDirectMethod() + << Method->getDeclName(); + S.Diag(IMD->getLocation(), diag::note_previous_declaration); + diagnosed = true; + } + }; + + // Look for any other declaration of this method anywhere we can see in this + // compilation unit. 
+ // + // We do not use IDecl->lookupMethod() because we have specific needs: + // + // - we absolutely do not need to walk protocols, because + // diag::err_objc_direct_on_protocol has already been emitted + // during parsing if there's a conflict, + // + // - when we do not find a match in a given @interface container, + // we need to attempt looking it up in the @implementation block if the + // translation unit sees it to find more clashes. + + if (auto *IMD = IDecl->getMethod(Sel, isInstance)) + diagClash(IMD); + else if (auto *Impl = IDecl->getImplementation()) + if (Impl != ImpDecl) + if (auto *IMD = IDecl->getImplementation()->getMethod(Sel, isInstance)) + diagClash(IMD); + + for (const auto *Cat : IDecl->visible_categories()) + if (auto *IMD = Cat->getMethod(Sel, isInstance)) + diagClash(IMD); + else if (auto CatImpl = Cat->getImplementation()) + if (CatImpl != ImpDecl) + if (auto *IMD = Cat->getMethod(Sel, isInstance)) + diagClash(IMD); +} + Decl *Sema::ActOnMethodDeclaration( Scope *S, SourceLocation MethodLoc, SourceLocation EndLoc, tok::TokenKind MethodType, ObjCDeclSpec &ReturnQT, ParsedType ReturnType, @@ -4809,9 +4865,9 @@ Decl *Sema::ActOnMethodDeclaration( Diag(ObjCMethod->getLocation(), diag::warn_dealloc_in_category) << ObjCMethod->getDeclName(); } - } else if (ImpDecl->hasAttr()) { - ObjCMethod->addAttr( - ObjCDirectAttr::CreateImplicit(Context, ObjCMethod->getLocation())); + } else { + mergeObjCDirectMembers(*this, ClassDecl, ObjCMethod); + checkObjCDirectMethodClashes(*this, IDecl, ObjCMethod, ImpDecl); } // Warn if a method declared in a protocol to which a category or @@ -4832,39 +4888,16 @@ Decl *Sema::ActOnMethodDeclaration( } } else { if (!isa(ClassDecl)) { - if (!ObjCMethod->isDirectMethod() && - ClassDecl->hasAttr()) { - ObjCMethod->addAttr( - ObjCDirectAttr::CreateImplicit(Context, ObjCMethod->getLocation())); - } + mergeObjCDirectMembers(*this, ClassDecl, ObjCMethod); - // There can be a single declaration in any @interface container - // for a given direct method, look for clashes as we add them. - // - // For valid code, we should always know the primary interface - // declaration by now, however for invalid code we'll keep parsing - // but we won't find the primary interface and IDecl will be nil. ObjCInterfaceDecl *IDecl = dyn_cast(ClassDecl); if (!IDecl) IDecl = cast(ClassDecl)->getClassInterface(); - + // For valid code, we should always know the primary interface + // declaration by now, however for invalid code we'll keep parsing + // but we won't find the primary interface and IDecl will be nil. if (IDecl) - if (auto *IMD = IDecl->lookupMethod(ObjCMethod->getSelector(), - ObjCMethod->isInstanceMethod(), - /*shallowCategoryLookup=*/false, - /*followSuper=*/false)) { - if (isa(IMD->getDeclContext())) { - // Do not emit a diagnostic for the Protocol case: - // diag::err_objc_direct_on_protocol has already been emitted - // during parsing for these with a nicer diagnostic. 
- } else if (ObjCMethod->isDirectMethod() || IMD->isDirectMethod()) { - Diag(ObjCMethod->getLocation(), - diag::err_objc_direct_duplicate_decl) - << ObjCMethod->isDirectMethod() << /* method */ 0 - << IMD->isDirectMethod() << ObjCMethod->getDeclName(); - Diag(IMD->getLocation(), diag::note_previous_declaration); - } - } + checkObjCDirectMethodClashes(*this, IDecl, ObjCMethod); } cast(ClassDecl)->addDecl(ObjCMethod); diff --git a/clang/lib/Sema/SemaExceptionSpec.cpp b/clang/lib/Sema/SemaExceptionSpec.cpp index 5c9844e1cd281..53c62a1a40177 100644 --- a/clang/lib/Sema/SemaExceptionSpec.cpp +++ b/clang/lib/Sema/SemaExceptionSpec.cpp @@ -1340,6 +1340,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Expr::CXXUnresolvedConstructExprClass: case Expr::DependentScopeDeclRefExprClass: case Expr::CXXFoldExprClass: + case Expr::RecoveryExprClass: return CT_Dependent; case Expr::AsTypeExprClass: @@ -1439,6 +1440,7 @@ CanThrowResult Sema::canThrow(const Stmt *S) { case Stmt::OMPDistributeSimdDirectiveClass: case Stmt::OMPFlushDirectiveClass: case Stmt::OMPDepobjDirectiveClass: + case Stmt::OMPScanDirectiveClass: case Stmt::OMPForDirectiveClass: case Stmt::OMPForSimdDirectiveClass: case Stmt::OMPMasterDirectiveClass: diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 27343daa22c85..bd7db4a9aca18 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -691,6 +691,9 @@ ExprResult Sema::DefaultLvalueConversion(Expr *E) { if (E->getType().getObjCLifetime() == Qualifiers::OCL_Weak) Cleanup.setExprNeedsCleanups(true); + if (E->getType().isDestructedType() == QualType::DK_nontrivial_c_struct) + Cleanup.setExprNeedsCleanups(true); + // C++ [conv.lval]p3: // If T is cv std::nullptr_t, the result is a null pointer constant. CastKind CK = T->isNullPtrType() ? 
CK_NullToPointer : CK_LValueToRValue; @@ -3388,23 +3391,33 @@ ExprResult Sema::BuildPredefinedExpr(SourceLocation Loc, return PredefinedExpr::Create(Context, Loc, ResTy, IK, SL); } +static std::pair +GetUniqueStableNameInfo(ASTContext &Context, QualType OpType, + SourceLocation OpLoc, PredefinedExpr::IdentKind K) { + std::pair Result{{}, nullptr}; + + if (OpType->isDependentType()) { + Result.first = Context.DependentTy; + return Result; + } + + std::string Str = PredefinedExpr::ComputeName(Context, K, OpType); + llvm::APInt Length(32, Str.length() + 1); + Result.first = + Context.adjustStringLiteralBaseType(Context.CharTy.withConst()); + Result.first = Context.getConstantArrayType( + Result.first, Length, nullptr, ArrayType::Normal, /*IndexTypeQuals*/ 0); + Result.second = StringLiteral::Create(Context, Str, StringLiteral::Ascii, + /*Pascal*/ false, Result.first, OpLoc); + return Result; +} + ExprResult Sema::BuildUniqueStableName(SourceLocation OpLoc, TypeSourceInfo *Operand) { QualType ResultTy; - StringLiteral *SL = nullptr; - if (Operand->getType()->isDependentType()) { - ResultTy = Context.DependentTy; - } else { - std::string Str = PredefinedExpr::ComputeName( - Context, PredefinedExpr::UniqueStableNameType, Operand->getType()); - llvm::APInt Length(32, Str.length() + 1); - ResultTy = Context.adjustStringLiteralBaseType(Context.CharTy.withConst()); - ResultTy = Context.getConstantArrayType(ResultTy, Length, nullptr, - ArrayType::Normal, - /*IndexTypeQuals*/ 0); - SL = StringLiteral::Create(Context, Str, StringLiteral::Ascii, - /*Pascal*/ false, ResultTy, OpLoc); - } + StringLiteral *SL; + std::tie(ResultTy, SL) = GetUniqueStableNameInfo( + Context, Operand->getType(), OpLoc, PredefinedExpr::UniqueStableNameType); return PredefinedExpr::Create(Context, OpLoc, ResultTy, PredefinedExpr::UniqueStableNameType, SL, @@ -3414,20 +3427,9 @@ ExprResult Sema::BuildUniqueStableName(SourceLocation OpLoc, ExprResult Sema::BuildUniqueStableName(SourceLocation OpLoc, Expr *E) { QualType ResultTy; - StringLiteral *SL = nullptr; - if (E->getType()->isDependentType()) { - ResultTy = Context.DependentTy; - } else { - std::string Str = PredefinedExpr::ComputeName(Context, - PredefinedExpr::UniqueStableNameExpr, E->getType()); - llvm::APInt Length(32, Str.length() + 1); - ResultTy = Context.adjustStringLiteralBaseType(Context.CharTy.withConst()); - ResultTy = Context.getConstantArrayType(ResultTy, Length, nullptr, - ArrayType::Normal, - /*IndexTypeQuals*/ 0); - SL = StringLiteral::Create(Context, Str, StringLiteral::Ascii, - /*Pascal*/ false, ResultTy, OpLoc); - } + StringLiteral *SL; + std::tie(ResultTy, SL) = GetUniqueStableNameInfo( + Context, E->getType(), OpLoc, PredefinedExpr::UniqueStableNameExpr); return PredefinedExpr::Create(Context, OpLoc, ResultTy, PredefinedExpr::UniqueStableNameExpr, SL, E); @@ -3439,8 +3441,10 @@ ExprResult Sema::ActOnUniqueStableNameExpr(SourceLocation OpLoc, TypeSourceInfo *TInfo = nullptr; QualType T = GetTypeFromParser(Ty, &TInfo); - if (T.isNull()) return ExprError(); - if (!TInfo) TInfo = Context.getTrivialTypeSourceInfo(T, OpLoc); + if (T.isNull()) + return ExprError(); + if (!TInfo) + TInfo = Context.getTrivialTypeSourceInfo(T, OpLoc); return BuildUniqueStableName(OpLoc, TInfo); } @@ -7664,6 +7668,11 @@ QualType Sema::CheckConditionalOperands(ExprResult &Cond, ExprResult &LHS, /*IsIntFirstExpr=*/false)) return LHSTy; + // Allow ?: operations in which both operands have the same + // built-in sizeless type. 
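// Editorial example, not part of the diff: what the sizeless-type hunks permit (the
// self-cast added to CheckCStyleCast earlier in this patch and the ?: case handled
// below). Assumes an AArch64 SVE target, where svint32_t from arm_sve.h is a sizeless
// built-in type; both hunks are on the C code paths, but the syntax is shared.
#include <arm_sve.h>
svint32_t pick(int cond, svint32_t a, svint32_t b) {
  svint32_t x = (svint32_t)a; // casting a sizeless built-in type to itself: now a no-op
  return cond ? x : b;        // ?: with identical sizeless operand types: now accepted
}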
+ if (LHSTy->isSizelessBuiltinType() && LHSTy == RHSTy) + return LHSTy; + // Emit a better diagnostic if one of the expressions is a null pointer // constant and the other is not a pointer type. In this case, the user most // likely forgot to take the address of the other expression. @@ -8107,6 +8116,24 @@ ExprResult Sema::ActOnConditionalOp(SourceLocation QuestionLoc, ColonLoc, result, VK, OK); } +// Check if we have a conversion between incompatible cmse function pointer +// types, that is, a conversion between a function pointer with the +// cmse_nonsecure_call attribute and one without. +static bool IsInvalidCmseNSCallConversion(Sema &S, QualType FromType, + QualType ToType) { + if (const auto *ToFn = + dyn_cast(S.Context.getCanonicalType(ToType))) { + if (const auto *FromFn = + dyn_cast(S.Context.getCanonicalType(FromType))) { + FunctionType::ExtInfo ToEInfo = ToFn->getExtInfo(); + FunctionType::ExtInfo FromEInfo = FromFn->getExtInfo(); + + return ToEInfo.getCmseNSCall() != FromEInfo.getCmseNSCall(); + } + } + return false; +} + // checkPointerTypesForAssignment - This is a very tricky routine (despite // being closely modeled after the C99 spec:-). The odd characteristic of this // routine is it effectively ignores the qualifiers on the top level pointee. @@ -8245,6 +8272,8 @@ checkPointerTypesForAssignment(Sema &S, QualType LHSType, QualType RHSType) { if (!S.getLangOpts().CPlusPlus && S.IsFunctionConversion(ltrans, rtrans, ltrans)) return Sema::IncompatibleFunctionPointer; + if (IsInvalidCmseNSCallConversion(S, ltrans, rtrans)) + return Sema::IncompatibleFunctionPointer; return ConvTy; } @@ -15247,6 +15276,12 @@ Sema::VerifyIntegerConstantExpression(Expr *E, llvm::APSInt *Result, return ExprError(); } + ExprResult RValueExpr = DefaultLvalueConversion(E); + if (RValueExpr.isInvalid()) + return ExprError(); + + E = RValueExpr.get(); + // Circumvent ICE checking in C++11 to avoid evaluating the expression twice // in the non-ICE case. if (!getLangOpts().CPlusPlus11 && E->isIntegerConstantExpr(Context)) { @@ -15484,6 +15519,8 @@ static void EvaluateAndDiagnoseImmediateInvocation( SemaRef.getASTContext(), true); if (!Result || !Notes.empty()) { Expr *InnerExpr = CE->getSubExpr()->IgnoreImplicit(); + if (auto *FunctionalCast = dyn_cast(InnerExpr)) + InnerExpr = FunctionalCast->getSubExpr(); FunctionDecl *FD = nullptr; if (auto *Call = dyn_cast(InnerExpr)) FD = cast(Call->getCalleeDecl()); @@ -15554,8 +15591,24 @@ static void RemoveNestedImmediateInvocation( } bool AlwaysRebuild() { return false; } bool ReplacingOriginal() { return true; } + bool AllowSkippingCXXConstructExpr() { + bool Res = AllowSkippingFirstCXXConstructExpr; + AllowSkippingFirstCXXConstructExpr = true; + return Res; + } + bool AllowSkippingFirstCXXConstructExpr = true; } Transformer(SemaRef, Rec.ReferenceToConsteval, Rec.ImmediateInvocationCandidates, It); + + /// A CXXConstructExpr with a single argument can be skipped by + /// TreeTransform in some situations because it could be implicit. This can + /// only happen for the top-level CXXConstructExpr because it is used nowhere + /// in the expression being transformed and therefore will not be rebuilt. + /// Setting AllowSkippingFirstCXXConstructExpr to false prevents the first + /// CXXConstructExpr from being skipped.
+ if (isa(It->getPointer()->IgnoreImplicit())) + Transformer.AllowSkippingFirstCXXConstructExpr = false; + ExprResult Res = Transformer.TransformExpr(It->getPointer()->getSubExpr()); assert(Res.isUsable()); Res = SemaRef.MaybeCreateExprWithCleanups(Res); @@ -16073,13 +16126,8 @@ void Sema::MarkFunctionReferenced(SourceLocation Loc, FunctionDecl *Func, Func->markUsed(Context); } - if (LangOpts.OpenMP) { + if (LangOpts.OpenMP) markOpenMPDeclareVariantFuncsReferenced(Loc, Func, MightBeOdrUse); - if (LangOpts.OpenMPIsDevice) - checkOpenMPDeviceFunction(Loc, Func); - else - checkOpenMPHostFunction(Loc, Func); - } } /// Directly mark a variable odr-used. Given a choice, prefer to use @@ -17508,7 +17556,10 @@ class EvaluatedExprMarker : public UsedDeclVisitor { S.MarkDeclRefReferenced(E); } - void VisitMemberExpr(MemberExpr *E) { S.MarkMemberReferenced(E); } + void VisitMemberExpr(MemberExpr *E) { + S.MarkMemberReferenced(E); + Visit(E->getBase()); + } }; } // namespace @@ -18472,3 +18523,17 @@ bool Sema::IsDependentFunctionNameExpr(Expr *E) { assert(E->isTypeDependent()); return isa(E); } + +ExprResult Sema::CreateRecoveryExpr(SourceLocation Begin, SourceLocation End, + ArrayRef SubExprs) { + // RecoveryExpr is type-dependent to suppress bogus diagnostics and this trick + // does not work in C. + // FIXME: use containsErrors() to suppress unwanted diags in C. + if (!Context.getLangOpts().RecoveryAST) + return ExprError(); + + if (isSFINAEContext()) + return ExprError(); + + return RecoveryExpr::Create(Context, Begin, End, SubExprs); +} diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index afa46fb04822d..6299008876ca4 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -6860,6 +6860,9 @@ ExprResult Sema::MaybeBindToTemporary(Expr *E) { VK_RValue); } + if (E->getType().isDestructedType() == QualType::DK_nontrivial_c_struct) + Cleanup.setExprNeedsCleanups(true); + if (!getLangOpts().CPlusPlus) return E; diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 3a80513d87551..18214514d9faa 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -6434,12 +6434,14 @@ PerformConstructorInitialization(Sema &S, } S.MarkFunctionReferenced(Loc, Constructor); - CurInit = CXXTemporaryObjectExpr::Create( - S.Context, Constructor, - Entity.getType().getNonLValueExprType(S.Context), TSInfo, - ConstructorArgs, ParenOrBraceRange, HadMultipleCandidates, - IsListInitialization, IsStdInitListInitialization, - ConstructorInitRequiresZeroInit); + CurInit = S.CheckForImmediateInvocation( + CXXTemporaryObjectExpr::Create( + S.Context, Constructor, + Entity.getType().getNonLValueExprType(S.Context), TSInfo, + ConstructorArgs, ParenOrBraceRange, HadMultipleCandidates, + IsListInitialization, IsStdInitListInitialization, + ConstructorInitRequiresZeroInit), + Constructor); } else { CXXConstructExpr::ConstructionKind ConstructKind = CXXConstructExpr::CK_Complete; diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 7d0821829daa5..11cc43a16db1a 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -154,6 +154,7 @@ class DSAStackTy { bool CancelRegion = false; bool LoopStart = false; bool BodyComplete = false; + SourceLocation PrevScanLocation; SourceLocation InnerTeamsRegionLoc; /// Reference to the taskgroup task_reduction reference expression. Expr *TaskgroupReductionRef = nullptr; @@ -781,6 +782,22 @@ class DSAStackTy { return Top ? 
Top->CancelRegion : false; } + /// Mark that parent region already has scan directive. + void setParentHasScanDirective(SourceLocation Loc) { + if (SharingMapTy *Parent = getSecondOnStackOrNull()) + Parent->PrevScanLocation = Loc; + } + /// Return true if current region has inner cancel construct. + bool doesParentHasScanDirective() const { + const SharingMapTy *Top = getSecondOnStackOrNull(); + return Top ? Top->PrevScanLocation.isValid() : false; + } + /// Return true if current region has inner cancel construct. + SourceLocation getParentScanDirectiveLoc() const { + const SharingMapTy *Top = getSecondOnStackOrNull(); + return Top ? Top->PrevScanLocation : SourceLocation(); + } + /// Set collapse value for the region. void setAssociatedLoops(unsigned Val) { getTopOfStack().AssociatedLoops = Val; @@ -1768,92 +1785,6 @@ Sema::DeviceDiagBuilder Sema::diagIfOpenMPHostCode(SourceLocation Loc, return DeviceDiagBuilder(Kind, Loc, DiagID, getCurFunctionDecl(), *this); } -void Sema::checkOpenMPDeviceFunction(SourceLocation Loc, FunctionDecl *Callee, - bool CheckForDelayedContext) { - assert(LangOpts.OpenMP && LangOpts.OpenMPIsDevice && - "Expected OpenMP device compilation."); - assert(Callee && "Callee may not be null."); - Callee = Callee->getMostRecentDecl(); - FunctionDecl *Caller = getCurFunctionDecl(); - - // host only function are not available on the device. - if (Caller) { - FunctionEmissionStatus CallerS = getEmissionStatus(Caller); - FunctionEmissionStatus CalleeS = getEmissionStatus(Callee); - assert(CallerS != FunctionEmissionStatus::CUDADiscarded && - CalleeS != FunctionEmissionStatus::CUDADiscarded && - "CUDADiscarded unexpected in OpenMP device function check"); - if ((CallerS == FunctionEmissionStatus::Emitted || - (!isOpenMPDeviceDelayedContext(*this) && - CallerS == FunctionEmissionStatus::Unknown)) && - CalleeS == FunctionEmissionStatus::OMPDiscarded) { - StringRef HostDevTy = getOpenMPSimpleClauseTypeName( - OMPC_device_type, OMPC_DEVICE_TYPE_host); - Diag(Loc, diag::err_omp_wrong_device_function_call) << HostDevTy << 0; - Diag(Callee->getAttr()->getLocation(), - diag::note_omp_marked_device_type_here) - << HostDevTy; - return; - } - } - // If the caller is known-emitted, mark the callee as known-emitted. - // Otherwise, mark the call in our call graph so we can traverse it later. - if ((CheckForDelayedContext && !isOpenMPDeviceDelayedContext(*this)) || - (!Caller && !CheckForDelayedContext) || - (Caller && getEmissionStatus(Caller) == FunctionEmissionStatus::Emitted)) - markKnownEmitted(*this, Caller, Callee, Loc, - [CheckForDelayedContext](Sema &S, FunctionDecl *FD) { - return CheckForDelayedContext && - S.getEmissionStatus(FD) == - FunctionEmissionStatus::Emitted; - }); - else if (Caller) - DeviceCallGraph[Caller].insert({Callee, Loc}); -} - -void Sema::checkOpenMPHostFunction(SourceLocation Loc, FunctionDecl *Callee, - bool CheckCaller) { - assert(LangOpts.OpenMP && !LangOpts.OpenMPIsDevice && - "Expected OpenMP host compilation."); - assert(Callee && "Callee may not be null."); - Callee = Callee->getMostRecentDecl(); - FunctionDecl *Caller = getCurFunctionDecl(); - - // device only function are not available on the host. 
- if (Caller) { - FunctionEmissionStatus CallerS = getEmissionStatus(Caller); - FunctionEmissionStatus CalleeS = getEmissionStatus(Callee); - assert( - (LangOpts.CUDA || (CallerS != FunctionEmissionStatus::CUDADiscarded && - CalleeS != FunctionEmissionStatus::CUDADiscarded)) && - "CUDADiscarded unexpected in OpenMP host function check"); - if (CallerS == FunctionEmissionStatus::Emitted && - CalleeS == FunctionEmissionStatus::OMPDiscarded) { - StringRef NoHostDevTy = getOpenMPSimpleClauseTypeName( - OMPC_device_type, OMPC_DEVICE_TYPE_nohost); - Diag(Loc, diag::err_omp_wrong_device_function_call) << NoHostDevTy << 1; - Diag(Callee->getAttr()->getLocation(), - diag::note_omp_marked_device_type_here) - << NoHostDevTy; - return; - } - } - // If the caller is known-emitted, mark the callee as known-emitted. - // Otherwise, mark the call in our call graph so we can traverse it later. - if (!shouldIgnoreInHostDeviceCheck(Callee)) { - if ((!CheckCaller && !Caller) || - (Caller && - getEmissionStatus(Caller) == FunctionEmissionStatus::Emitted)) - markKnownEmitted( - *this, Caller, Callee, Loc, [CheckCaller](Sema &S, FunctionDecl *FD) { - return CheckCaller && - S.getEmissionStatus(FD) == FunctionEmissionStatus::Emitted; - }); - else if (Caller) - DeviceCallGraph[Caller].insert({Callee, Loc}); - } -} - void Sema::checkOpenMPDeviceExpr(const Expr *E) { assert(getLangOpts().OpenMP && getLangOpts().OpenMPIsDevice && "OpenMP device compilation mode is expected."); @@ -2313,52 +2244,43 @@ bool Sema::isOpenMPGlobalCapturedDecl(ValueDecl *D, unsigned Level, void Sema::DestroyDataSharingAttributesStack() { delete DSAStack; } -void Sema::finalizeOpenMPDelayedAnalysis() { +void Sema::finalizeOpenMPDelayedAnalysis(const FunctionDecl *Caller, + const FunctionDecl *Callee, + SourceLocation Loc) { assert(LangOpts.OpenMP && "Expected OpenMP compilation mode."); - // Diagnose implicit declare target functions and their callees. - for (const auto &CallerCallees : DeviceCallGraph) { - Optional DevTy = - OMPDeclareTargetDeclAttr::getDeviceType( - CallerCallees.getFirst()->getMostRecentDecl()); - // Ignore host functions during device analyzis. - if (LangOpts.OpenMPIsDevice && DevTy && - *DevTy == OMPDeclareTargetDeclAttr::DT_Host) - continue; - // Ignore nohost functions during host analyzis. - if (!LangOpts.OpenMPIsDevice && DevTy && - *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) - continue; - for (const std::pair, SourceLocation> - &Callee : CallerCallees.getSecond()) { - const FunctionDecl *FD = Callee.first->getMostRecentDecl(); - Optional DevTy = - OMPDeclareTargetDeclAttr::getDeviceType(FD); - if (LangOpts.OpenMPIsDevice && DevTy && - *DevTy == OMPDeclareTargetDeclAttr::DT_Host) { - // Diagnose host function called during device codegen. - StringRef HostDevTy = getOpenMPSimpleClauseTypeName( - OMPC_device_type, OMPC_DEVICE_TYPE_host); - Diag(Callee.second, diag::err_omp_wrong_device_function_call) - << HostDevTy << 0; - Diag(FD->getAttr()->getLocation(), - diag::note_omp_marked_device_type_here) - << HostDevTy; - continue; - } + Optional DevTy = + OMPDeclareTargetDeclAttr::getDeviceType(Caller->getMostRecentDecl()); + // Ignore host functions during device analyzis. + if (LangOpts.OpenMPIsDevice && DevTy && + *DevTy == OMPDeclareTargetDeclAttr::DT_Host) + return; + // Ignore nohost functions during host analyzis. 
+ if (!LangOpts.OpenMPIsDevice && DevTy && + *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) + return; + const FunctionDecl *FD = Callee->getMostRecentDecl(); + DevTy = OMPDeclareTargetDeclAttr::getDeviceType(FD); + if (LangOpts.OpenMPIsDevice && DevTy && + *DevTy == OMPDeclareTargetDeclAttr::DT_Host) { + // Diagnose host function called during device codegen. + StringRef HostDevTy = + getOpenMPSimpleClauseTypeName(OMPC_device_type, OMPC_DEVICE_TYPE_host); + Diag(Loc, diag::err_omp_wrong_device_function_call) << HostDevTy << 0; + Diag(FD->getAttr()->getLocation(), + diag::note_omp_marked_device_type_here) + << HostDevTy; + return; + } if (!LangOpts.OpenMPIsDevice && DevTy && *DevTy == OMPDeclareTargetDeclAttr::DT_NoHost) { // Diagnose nohost function called during host codegen. StringRef NoHostDevTy = getOpenMPSimpleClauseTypeName( OMPC_device_type, OMPC_DEVICE_TYPE_nohost); - Diag(Callee.second, diag::err_omp_wrong_device_function_call) - << NoHostDevTy << 1; + Diag(Loc, diag::err_omp_wrong_device_function_call) << NoHostDevTy << 1; Diag(FD->getAttr()->getLocation(), diag::note_omp_marked_device_type_here) << NoHostDevTy; - continue; } - } - } } void Sema::StartOpenMPDSABlock(OpenMPDirectiveKind DKind, @@ -3858,6 +3780,7 @@ void Sema::ActOnOpenMPRegionStart(OpenMPDirectiveKind DKind, Scope *CurScope) { case OMPD_cancel: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -4202,12 +4125,14 @@ static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack, ShouldBeInParallelRegion, ShouldBeInOrderedRegion, ShouldBeInTargetRegion, - ShouldBeInTeamsRegion + ShouldBeInTeamsRegion, + ShouldBeInLoopSimdRegion, } Recommend = NoRecommend; if (isOpenMPSimdDirective(ParentRegion) && ((SemaRef.LangOpts.OpenMP <= 45 && CurrentRegion != OMPD_ordered) || (SemaRef.LangOpts.OpenMP >= 50 && CurrentRegion != OMPD_ordered && - CurrentRegion != OMPD_simd && CurrentRegion != OMPD_atomic))) { + CurrentRegion != OMPD_simd && CurrentRegion != OMPD_atomic && + CurrentRegion != OMPD_scan))) { // OpenMP [2.16, Nesting of Regions] // OpenMP constructs may not be nested inside a simd region. // OpenMP [2.8.1,simd Construct, Restrictions] @@ -4366,6 +4291,16 @@ static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack, ParentRegion != OMPD_target); OrphanSeen = ParentRegion == OMPD_unknown; Recommend = ShouldBeInTargetRegion; + } else if (CurrentRegion == OMPD_scan) { + // OpenMP [2.16, Nesting of Regions] + // A scan directive may only be closely nested inside a for, simd, or + // for simd region, and it requires OpenMP 5.0 or later. 
+ NestingProhibited = + SemaRef.LangOpts.OpenMP < 50 || + (ParentRegion != OMPD_simd && ParentRegion != OMPD_for && + ParentRegion != OMPD_for_simd); + OrphanSeen = ParentRegion == OMPD_unknown; + Recommend = ShouldBeInLoopSimdRegion; } if (!NestingProhibited && !isOpenMPTargetExecutionDirective(CurrentRegion) && @@ -4874,6 +4809,11 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( "No associated statement allowed for 'omp depobj' directive"); Res = ActOnOpenMPDepobjDirective(ClausesWithImplicit, StartLoc, EndLoc); break; + case OMPD_scan: + assert(AStmt == nullptr && + "No associated statement allowed for 'omp scan' directive"); + Res = ActOnOpenMPScanDirective(ClausesWithImplicit, StartLoc, EndLoc); + break; case OMPD_ordered: Res = ActOnOpenMPOrderedDirective(ClausesWithImplicit, AStmt, StartLoc, EndLoc); @@ -5159,6 +5099,8 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( case OMPC_nontemporal: case OMPC_order: case OMPC_destroy: + case OMPC_inclusive: + case OMPC_exclusive: continue; case OMPC_allocator: case OMPC_flush: @@ -5761,7 +5703,7 @@ void Sema::ActOnOpenMPDeclareVariantDirective(FunctionDecl *FD, OMPTraitInfo &TI, SourceRange SR) { auto *NewAttr = - OMPDeclareVariantAttr::CreateImplicit(Context, VariantRef, TI, SR); + OMPDeclareVariantAttr::CreateImplicit(Context, VariantRef, &TI, SR); FD->addAttr(NewAttr); } @@ -8773,6 +8715,27 @@ StmtResult Sema::ActOnOpenMPDepobjDirective(ArrayRef Clauses, return OMPDepobjDirective::Create(Context, StartLoc, EndLoc, Clauses); } +StmtResult Sema::ActOnOpenMPScanDirective(ArrayRef Clauses, + SourceLocation StartLoc, + SourceLocation EndLoc) { + // Check that exactly one clause is specified. + if (Clauses.size() != 1) { + Diag(Clauses.empty() ? EndLoc : Clauses[1]->getBeginLoc(), + diag::err_omp_scan_single_clause_expected); + return StmtError(); + } + // Check that only one instance of scan directives is used in the same outer + // region. 
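(Editorial sketch, not part of the patch: the single-clause check above and the "one scan per region" check below reject misuse of the new directive. The snippet assumes -fopenmp -fopenmp-version=50 and relies on the inscan reduction modifier added elsewhere in this patch; the variable names are invented.)
    #pragma omp simd reduction(inscan, + : sum)
    for (int i = 0; i < n; ++i) {
      sum += a[i];
    #pragma omp scan inclusive(sum) // OK: exactly one clause
      b[i] = sum;
      // A second '#pragma omp scan' in this loop body, or a scan directive
      // with zero or two clauses, is now diagnosed.
    }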
+ if (DSAStack->doesParentHasScanDirective()) { + Diag(StartLoc, diag::err_omp_several_scan_directives_in_region); + Diag(DSAStack->getParentScanDirectiveLoc(), + diag::note_omp_previous_scan_directive); + return StmtError(); + } + DSAStack->setParentHasScanDirective(StartLoc); + return OMPScanDirective::Create(Context, StartLoc, EndLoc, Clauses); +} + StmtResult Sema::ActOnOpenMPOrderedDirective(ArrayRef Clauses, Stmt *AStmt, SourceLocation StartLoc, @@ -11121,6 +11084,8 @@ OMPClause *Sema::ActOnOpenMPSingleExprClause(OpenMPClauseKind Kind, Expr *Expr, case OMPC_nontemporal: case OMPC_order: case OMPC_destroy: + case OMPC_inclusive: + case OMPC_exclusive: llvm_unreachable("Clause is not allowed."); } return Res; @@ -11251,6 +11216,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_cancellation_point: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11322,6 +11288,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_cancellation_point: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11398,6 +11365,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_cancellation_point: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11471,6 +11439,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_cancellation_point: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11545,6 +11514,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_cancellation_point: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11618,6 +11588,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_cancellation_point: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11690,6 +11661,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_cancellation_point: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11765,6 +11737,7 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPD_cancellation_point: case OMPD_flush: case OMPD_depobj: + case OMPD_scan: case OMPD_declare_reduction: case OMPD_declare_mapper: case OMPD_declare_simd: @@ -11848,6 +11821,8 @@ static OpenMPDirectiveKind getOpenMPCaptureRegionForClause( case OMPC_order: case OMPC_destroy: case OMPC_detach: + case OMPC_inclusive: + case OMPC_exclusive: llvm_unreachable("Unexpected OpenMP clause."); } return CaptureRegion; @@ -12285,6 +12260,8 @@ OMPClause *Sema::ActOnOpenMPSimpleClause( case OMPC_nontemporal: case OMPC_destroy: case OMPC_detach: + case OMPC_inclusive: + case OMPC_exclusive: llvm_unreachable("Clause is not allowed."); } return Res; @@ -12508,6 +12485,8 @@ OMPClause *Sema::ActOnOpenMPSingleExprWithArgClause( case OMPC_order: case OMPC_destroy: case OMPC_detach: + case OMPC_inclusive: + case OMPC_exclusive: llvm_unreachable("Clause is not allowed."); } return Res; @@ -12738,6 +12717,8 @@ OMPClause *Sema::ActOnOpenMPClause(OpenMPClauseKind Kind, case OMPC_nontemporal: case OMPC_order: case OMPC_detach: + 
case OMPC_inclusive: + case OMPC_exclusive: llvm_unreachable("Clause is not allowed."); } return Res; @@ -12851,7 +12832,7 @@ OMPClause *Sema::ActOnOpenMPVarListClause( DeclarationNameInfo &ReductionOrMapperId, int ExtraModifier, ArrayRef MapTypeModifiers, ArrayRef MapTypeModifiersLoc, bool IsMapTypeImplicit, - SourceLocation DepLinMapLastLoc) { + SourceLocation ExtraModifierLoc) { SourceLocation StartLoc = Locs.StartLoc; SourceLocation LParenLoc = Locs.LParenLoc; SourceLocation EndLoc = Locs.EndLoc; @@ -12868,15 +12849,18 @@ OMPClause *Sema::ActOnOpenMPVarListClause( "Unexpected lastprivate modifier."); Res = ActOnOpenMPLastprivateClause( VarList, static_cast(ExtraModifier), - DepLinMapLastLoc, ColonLoc, StartLoc, LParenLoc, EndLoc); + ExtraModifierLoc, ColonLoc, StartLoc, LParenLoc, EndLoc); break; case OMPC_shared: Res = ActOnOpenMPSharedClause(VarList, StartLoc, LParenLoc, EndLoc); break; case OMPC_reduction: - Res = ActOnOpenMPReductionClause(VarList, StartLoc, LParenLoc, ColonLoc, - EndLoc, ReductionOrMapperIdScopeSpec, - ReductionOrMapperId); + assert(0 <= ExtraModifier && ExtraModifier <= OMPC_REDUCTION_unknown && + "Unexpected lastprivate modifier."); + Res = ActOnOpenMPReductionClause( + VarList, static_cast(ExtraModifier), + StartLoc, LParenLoc, ExtraModifierLoc, ColonLoc, EndLoc, + ReductionOrMapperIdScopeSpec, ReductionOrMapperId); break; case OMPC_task_reduction: Res = ActOnOpenMPTaskReductionClause(VarList, StartLoc, LParenLoc, ColonLoc, @@ -12893,7 +12877,7 @@ OMPClause *Sema::ActOnOpenMPVarListClause( "Unexpected linear modifier."); Res = ActOnOpenMPLinearClause( VarList, TailExpr, StartLoc, LParenLoc, - static_cast(ExtraModifier), DepLinMapLastLoc, + static_cast(ExtraModifier), ExtraModifierLoc, ColonLoc, EndLoc); break; case OMPC_aligned: @@ -12913,7 +12897,7 @@ OMPClause *Sema::ActOnOpenMPVarListClause( assert(0 <= ExtraModifier && ExtraModifier <= OMPC_DEPEND_unknown && "Unexpected depend modifier."); Res = ActOnOpenMPDependClause( - static_cast(ExtraModifier), DepLinMapLastLoc, + static_cast(ExtraModifier), ExtraModifierLoc, ColonLoc, VarList, StartLoc, LParenLoc, EndLoc); break; case OMPC_map: @@ -12922,7 +12906,7 @@ OMPClause *Sema::ActOnOpenMPVarListClause( Res = ActOnOpenMPMapClause( MapTypeModifiers, MapTypeModifiersLoc, ReductionOrMapperIdScopeSpec, ReductionOrMapperId, static_cast(ExtraModifier), - IsMapTypeImplicit, DepLinMapLastLoc, ColonLoc, VarList, Locs); + IsMapTypeImplicit, ExtraModifierLoc, ColonLoc, VarList, Locs); break; case OMPC_to: Res = ActOnOpenMPToClause(VarList, ReductionOrMapperIdScopeSpec, @@ -12945,6 +12929,12 @@ OMPClause *Sema::ActOnOpenMPVarListClause( case OMPC_nontemporal: Res = ActOnOpenMPNontemporalClause(VarList, StartLoc, LParenLoc, EndLoc); break; + case OMPC_inclusive: + Res = ActOnOpenMPInclusiveClause(VarList, StartLoc, LParenLoc, EndLoc); + break; + case OMPC_exclusive: + Res = ActOnOpenMPExclusiveClause(VarList, StartLoc, LParenLoc, EndLoc); + break; case OMPC_if: case OMPC_depobj: case OMPC_final: @@ -14716,10 +14706,19 @@ static bool actOnOMPReductionKindClause( } OMPClause *Sema::ActOnOpenMPReductionClause( - ArrayRef VarList, SourceLocation StartLoc, SourceLocation LParenLoc, - SourceLocation ColonLoc, SourceLocation EndLoc, + ArrayRef VarList, OpenMPReductionClauseModifier Modifier, + SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation ModifierLoc, SourceLocation ColonLoc, SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec, const DeclarationNameInfo &ReductionId, ArrayRef UnresolvedReductions) { + 
if (ModifierLoc.isValid() && Modifier == OMPC_REDUCTION_unknown) { + Diag(LParenLoc, diag::err_omp_unexpected_clause_value) + << getListOfPossibleValues(OMPC_reduction, /*First=*/0, + /*Last=*/OMPC_REDUCTION_unknown) + << getOpenMPClauseName(OMPC_reduction); + return nullptr; + } + ReductionData RD(VarList.size()); if (actOnOMPReductionKindClause(*this, DSAStack, OMPC_reduction, VarList, StartLoc, LParenLoc, ColonLoc, EndLoc, @@ -14728,8 +14727,8 @@ OMPClause *Sema::ActOnOpenMPReductionClause( return nullptr; return OMPReductionClause::Create( - Context, StartLoc, LParenLoc, ColonLoc, EndLoc, RD.Vars, - ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId, + Context, StartLoc, LParenLoc, ModifierLoc, ColonLoc, EndLoc, Modifier, + RD.Vars, ReductionIdScopeSpec.getWithLocInContext(Context), ReductionId, RD.Privates, RD.LHSs, RD.RHSs, RD.ReductionOps, buildPreInits(Context, RD.ExprCaptures), buildPostUpdate(*this, RD.ExprPostUpdates)); @@ -17674,15 +17673,6 @@ void Sema::checkDeclIsAllowedInOpenMPTarget(Expr *E, Decl *D, Diag(FD->getLocation(), diag::note_defined_here) << FD; return; } - // Mark the function as must be emitted for the device. - Optional DevTy = - OMPDeclareTargetDeclAttr::getDeviceType(FD); - if (LangOpts.OpenMPIsDevice && Res.hasValue() && IdLoc.isValid() && - *DevTy != OMPDeclareTargetDeclAttr::DT_Host) - checkOpenMPDeviceFunction(IdLoc, FD, /*CheckForDelayedContext=*/false); - if (!LangOpts.OpenMPIsDevice && Res.hasValue() && IdLoc.isValid() && - *DevTy != OMPDeclareTargetDeclAttr::DT_NoHost) - checkOpenMPHostFunction(IdLoc, FD, /*CheckCaller=*/false); } if (auto *VD = dyn_cast(D)) { // Problem if any with var declared with incomplete type will be reported @@ -18011,3 +18001,59 @@ OMPClause *Sema::ActOnOpenMPNontemporalClause(ArrayRef VarList, return OMPNontemporalClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars); } + +OMPClause *Sema::ActOnOpenMPInclusiveClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + SmallVector Vars; + for (Expr *RefExpr : VarList) { + assert(RefExpr && "NULL expr in OpenMP nontemporal clause."); + SourceLocation ELoc; + SourceRange ERange; + Expr *SimpleRefExpr = RefExpr; + auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange, + /*AllowArraySection=*/true); + if (Res.second) + // It will be analyzed later. + Vars.push_back(RefExpr); + ValueDecl *D = Res.first; + if (!D) + continue; + + Vars.push_back(RefExpr); + } + + if (Vars.empty()) + return nullptr; + + return OMPInclusiveClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars); +} + +OMPClause *Sema::ActOnOpenMPExclusiveClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + SmallVector Vars; + for (Expr *RefExpr : VarList) { + assert(RefExpr && "NULL expr in OpenMP nontemporal clause."); + SourceLocation ELoc; + SourceRange ERange; + Expr *SimpleRefExpr = RefExpr; + auto Res = getPrivateItem(*this, SimpleRefExpr, ELoc, ERange, + /*AllowArraySection=*/true); + if (Res.second) + // It will be analyzed later. 
+ Vars.push_back(RefExpr); + ValueDecl *D = Res.first; + if (!D) + continue; + + Vars.push_back(RefExpr); + } + + if (Vars.empty()) + return nullptr; + + return OMPExclusiveClause::Create(Context, StartLoc, LParenLoc, EndLoc, Vars); +} diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index d80aac9d703ac..b048c5a8d8cfe 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -9420,6 +9420,49 @@ static bool isBetterMultiversionCandidate(const OverloadCandidate &Cand1, llvm_unreachable("No way to get here unless both had cpu_dispatch"); } +/// Compute the type of the implicit object parameter for the given function, +/// if any. Returns None if there is no implicit object parameter, and a null +/// QualType if there is a 'matches anything' implicit object parameter. +static Optional getImplicitObjectParamType(ASTContext &Context, + const FunctionDecl *F) { + if (!isa(F) || isa(F)) + return llvm::None; + + auto *M = cast(F); + // Static member functions' object parameters match all types. + if (M->isStatic()) + return QualType(); + + QualType T = M->getThisObjectType(); + if (M->getRefQualifier() == RQ_RValue) + return Context.getRValueReferenceType(T); + return Context.getLValueReferenceType(T); +} + +static bool haveSameParameterTypes(ASTContext &Context, const FunctionDecl *F1, + const FunctionDecl *F2, unsigned NumParams) { + if (declaresSameEntity(F1, F2)) + return true; + + auto NextParam = [&](const FunctionDecl *F, unsigned &I, bool First) { + if (First) { + if (Optional T = getImplicitObjectParamType(Context, F)) + return *T; + } + assert(I < F->getNumParams()); + return F->getParamDecl(I++)->getType(); + }; + + unsigned I1 = 0, I2 = 0; + for (unsigned I = 0; I != NumParams; ++I) { + QualType T1 = NextParam(F1, I1, I == 0); + QualType T2 = NextParam(F2, I2, I == 0); + if (!T1.isNull() && !T2.isNull() && !Context.hasSameUnqualifiedType(T1, T2)) + return false; + } + return true; +} + /// isBetterOverloadCandidate - Determines whether the first overload /// candidate is a better candidate than the second (C++ 13.3.3p1). bool clang::isBetterOverloadCandidate( @@ -9487,18 +9530,20 @@ bool clang::isBetterOverloadCandidate( break; case ImplicitConversionSequence::Worse: - if (Cand1.Function && Cand1.Function == Cand2.Function && - Cand2.isReversed()) { + if (Cand1.Function && Cand2.Function && + Cand1.isReversed() != Cand2.isReversed() && + haveSameParameterTypes(S.Context, Cand1.Function, Cand2.Function, + NumArgs)) { // Work around large-scale breakage caused by considering reversed // forms of operator== in C++20: // - // When comparing a function against its reversed form, if we have a - // better conversion for one argument and a worse conversion for the - // other, we prefer the non-reversed form. + // When comparing a function against a reversed function with the same + // parameter types, if we have a better conversion for one argument and + // a worse conversion for the other, the implicit conversion sequences + // are treated as being equally good. // - // This prevents a conversion function from being considered ambiguous - // with its own reversed form in various where it's only incidentally - // heterogeneous. + // This prevents a comparison function from being considered ambiguous + // with a reversed form that is written in the same way. // // We diagnose this as an extension from CreateOverloadedBinOp. 
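(Editorial sketch, not part of the patch: the situation described in the comment above looks roughly like this; the names are invented.)
    template <typename T> struct Base {
      bool operator==(const T &) const; // found as written and in reversed order
    };
    struct Derived : Base<Derived> {};
    bool test(Derived a, Derived b) { return a == b; } // C++20
    // As written, 'a' needs a Derived-to-Base conversion while 'b' binds
    // exactly; for the reversed candidate it is the other way around. Both
    // candidates have the same parameter types, so the conversion sequences are
    // treated as equally good and the comparison is accepted, with the formal
    // ambiguity reported as an extension from CreateOverloadedBinOp.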
HasWorseConversion = true; @@ -9516,10 +9561,8 @@ bool clang::isBetterOverloadCandidate( // -- for some argument j, ICSj(F1) is a better conversion sequence than // ICSj(F2), or, if not that, - if (HasBetterConversion) + if (HasBetterConversion && !HasWorseConversion) return true; - if (HasWorseConversion) - return false; // -- the context is an initialization by user-defined conversion // (see 8.5, 13.3.1.5) and the standard conversion sequence @@ -12720,7 +12763,7 @@ bool Sema::buildOverloadedCallSet(Scope *S, Expr *Fn, // base classes. CallExpr *CE = CallExpr::Create(Context, Fn, Args, Context.DependentTy, VK_RValue, RParenLoc); - CE->addDependence(ExprDependence::TypeValueInstantiation); + CE->markDependentForPostponedNameLookup(); *Result = CE; return true; } @@ -13256,36 +13299,56 @@ ExprResult Sema::CreateOverloadedBinOp(SourceLocation OpLoc, // resolution for an operator@, its return type shall be cv bool if (Best->RewriteKind && ChosenOp == OO_EqualEqual && !FnDecl->getReturnType()->isBooleanType()) { - Diag(OpLoc, diag::err_ovl_rewrite_equalequal_not_bool) + bool IsExtension = + FnDecl->getReturnType()->isIntegralOrUnscopedEnumerationType(); + Diag(OpLoc, IsExtension ? diag::ext_ovl_rewrite_equalequal_not_bool + : diag::err_ovl_rewrite_equalequal_not_bool) << FnDecl->getReturnType() << BinaryOperator::getOpcodeStr(Opc) << Args[0]->getSourceRange() << Args[1]->getSourceRange(); Diag(FnDecl->getLocation(), diag::note_declared_at); - return ExprError(); + if (!IsExtension) + return ExprError(); } if (AllowRewrittenCandidates && !IsReversed && - CandidateSet.getRewriteInfo().shouldAddReversed(ChosenOp)) { - // We could have reversed this operator, but didn't. Check if the + CandidateSet.getRewriteInfo().isReversible()) { + // We could have reversed this operator, but didn't. Check if some // reversed form was a viable candidate, and if so, if it had a // better conversion for either parameter. If so, this call is // formally ambiguous, and allowing it is an extension. 
+ llvm::SmallVector AmbiguousWith; for (OverloadCandidate &Cand : CandidateSet) { - if (Cand.Viable && Cand.Function == FnDecl && - Cand.isReversed()) { + if (Cand.Viable && Cand.Function && Cand.isReversed() && + haveSameParameterTypes(Context, Cand.Function, FnDecl, 2)) { for (unsigned ArgIdx = 0; ArgIdx < 2; ++ArgIdx) { if (CompareImplicitConversionSequences( *this, OpLoc, Cand.Conversions[ArgIdx], Best->Conversions[ArgIdx]) == ImplicitConversionSequence::Better) { - Diag(OpLoc, diag::ext_ovl_ambiguous_oper_binary_reversed) - << BinaryOperator::getOpcodeStr(Opc) - << Args[0]->getType() << Args[1]->getType() - << Args[0]->getSourceRange() << Args[1]->getSourceRange(); - Diag(FnDecl->getLocation(), - diag::note_ovl_ambiguous_oper_binary_reversed_candidate); + AmbiguousWith.push_back(Cand.Function); + break; } } - break; + } + } + + if (!AmbiguousWith.empty()) { + bool AmbiguousWithSelf = + AmbiguousWith.size() == 1 && + declaresSameEntity(AmbiguousWith.front(), FnDecl); + Diag(OpLoc, diag::ext_ovl_ambiguous_oper_binary_reversed) + << BinaryOperator::getOpcodeStr(Opc) + << Args[0]->getType() << Args[1]->getType() << AmbiguousWithSelf + << Args[0]->getSourceRange() << Args[1]->getSourceRange(); + if (AmbiguousWithSelf) { + Diag(FnDecl->getLocation(), + diag::note_ovl_ambiguous_oper_binary_reversed_self); + } else { + Diag(FnDecl->getLocation(), + diag::note_ovl_ambiguous_oper_binary_selected_candidate); + for (auto *F : AmbiguousWith) + Diag(F->getLocation(), + diag::note_ovl_ambiguous_oper_binary_reversed_candidate); } } } diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 3724984e71b66..71fc377b17c2f 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -1413,18 +1413,6 @@ bool Sema::checkSYCLDeviceFunction(SourceLocation Loc, FunctionDecl *Callee) { if (!Caller) return true; - bool CallerKnownEmitted = - getEmissionStatus(Caller) == FunctionEmissionStatus::Emitted; - - // If the caller is known-emitted, mark the callee as known-emitted. - // Otherwise, mark the call in our call graph so we can traverse it later. - if (CallerKnownEmitted) - markKnownEmitted(*this, Caller, Callee, Loc, [](Sema &S, FunctionDecl *FD) { - return S.getEmissionStatus(FD) == Sema::FunctionEmissionStatus::Emitted; - }); - else - DeviceCallGraph[Caller].insert({Callee, Loc}); - DeviceDiagBuilder::Kind DiagKind = DeviceDiagBuilder::K_Nop; // TODO Set DiagKind to K_Immediate/K_Deferred to emit diagnostics for Callee @@ -1439,9 +1427,9 @@ bool Sema::checkSYCLDeviceFunction(SourceLocation Loc, FunctionDecl *Callee) { DiagKind != DeviceDiagBuilder::K_ImmediateWithCallStack; } -static void emitCallToUndefinedFnDiag(Sema &SemaRef, const FunctionDecl *Callee, - const FunctionDecl *Caller, - const SourceLocation &Loc) { +void Sema::finalizeSYCLDelayedAnalysis(const FunctionDecl *Caller, + const FunctionDecl *Callee, + SourceLocation Loc) { // Somehow an unspecialized template appears to be in callgraph or list of // device functions. We don't want to emit diagnostic here. 
if (Callee->getTemplatedKind() == FunctionDecl::TK_FunctionTemplate) @@ -1451,9 +1439,7 @@ static void emitCallToUndefinedFnDiag(Sema &SemaRef, const FunctionDecl *Callee, for (const Decl *Redecl : Callee->redecls()) { if (const FunctionDecl *FD = dyn_cast_or_null(Redecl)) { - if ((FD->hasAttr() && - !FD->getAttr()->isImplicit()) || - FD->hasAttr()) { + if (FD->hasAttr() || FD->hasAttr()) { RedeclHasAttr = true; break; } @@ -1464,26 +1450,10 @@ static void emitCallToUndefinedFnDiag(Sema &SemaRef, const FunctionDecl *Callee, bool NotDefinedNoAttr = !Callee->isDefined() && !RedeclHasAttr; if (NotDefinedNoAttr && !Callee->getBuiltinID()) { - SemaRef.Diag(Loc, diag::err_sycl_restrict) + Diag(Loc, diag::err_sycl_restrict) << Sema::KernelCallUndefinedFunction; - SemaRef.Diag(Callee->getLocation(), diag::note_previous_decl) << Callee; - SemaRef.Diag(Caller->getLocation(), diag::note_called_by) << Caller; - } -} - -void Sema::finalizeSYCLDelayedAnalysis() { - assert(getLangOpts().SYCLIsDevice && - "Should only be called during SYCL compilation"); - - llvm::DenseSet Checked; - - for (const auto &EmittedWithLoc : DeviceKnownEmittedFns) { - const FunctionDecl *Caller = EmittedWithLoc.getSecond().FD; - const SourceLocation &Loc = EmittedWithLoc.getSecond().Loc; - const FunctionDecl *Callee = EmittedWithLoc.getFirst(); - - if (Checked.insert(Callee).second) - emitCallToUndefinedFnDiag(*this, Callee, Caller, Loc); + Diag(Callee->getLocation(), diag::note_previous_decl) << Callee; + Diag(Caller->getLocation(), diag::note_called_by) << Caller; } } diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp index 13d172268e069..72b8b4765aa64 100644 --- a/clang/lib/Sema/SemaStmt.cpp +++ b/clang/lib/Sema/SemaStmt.cpp @@ -730,11 +730,11 @@ StmtResult Sema::ActOnStartOfSwitchStmt(SourceLocation SwitchLoc, if (CondExpr && !CondExpr->isTypeDependent()) { // We have already converted the expression to an integral or enumeration - // type, when we parsed the switch condition. If we don't have an - // appropriate type now, enter the switch scope but remember that it's - // invalid. - assert(CondExpr->getType()->isIntegralOrEnumerationType() && - "invalid condition type"); + // type, when we parsed the switch condition. There are cases where we don't + // have an appropriate type, e.g. a typo-expr Cond was corrected to an + // inappropriate-type expr, we just return an error. + if (!CondExpr->getType()->isIntegralOrEnumerationType()) + return StmtError(); if (CondExpr->isKnownToHaveBooleanValue()) { // switch(bool_expr) {...} is often a programmer error, e.g. // switch(n && mask) { ... } // Doh - should be "n & mask". 
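(Editorial sketch, not part of the patch: one plausible trigger for the SemaStmt change above, mirroring the "typo-expr" case named in the new comment; the names are invented. With delayed typo correction the condition may only receive its final, non-integral type after the switch scope has been entered, which previously could trip the "invalid condition type" assertion and now yields a regular error.)
    double val;
    void f() {
      switch (vall) { // typo-corrected to 'val', which has type double
      default: break;
      }
    }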
diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 686980914fcff..f55b72bfaa945 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1402,34 +1402,46 @@ TemplateName TemplateInstantiator::TransformTemplateName( AllowInjectedClassName); } -ExprResult -TemplateInstantiator::TransformPredefinedExpr(PredefinedExpr *E) { - if (!E->isTypeDependent()) - return E; +static ExprResult TransformUniqueStableName(TemplateInstantiator &TI, + PredefinedExpr *E) { + if (E->getIdentKind() == PredefinedExpr::UniqueStableNameType) { + TypeSourceInfo *Info = + TI.getDerived().TransformType(E->getTypeSourceInfo()); + + if (!Info) + return ExprError(); + + if (!TI.getDerived().AlwaysRebuild() && Info == E->getTypeSourceInfo()) + return E; + + return TI.getSema().BuildUniqueStableName(E->getLocation(), Info); + } if (E->getIdentKind() == PredefinedExpr::UniqueStableNameExpr) { EnterExpressionEvaluationContext Unevaluated( - SemaRef, Sema::ExpressionEvaluationContext::Unevaluated); + TI.getSema(), Sema::ExpressionEvaluationContext::Unevaluated); + ExprResult SubExpr = TI.getDerived().TransformExpr(E->getExpr()); - ExprResult SubExpr = getDerived().TransformExpr(E->getExpr()); if (SubExpr.isInvalid()) return ExprError(); - if (!getDerived().AlwaysRebuild() && SubExpr.get() == E->getExpr()) + if (!TI.getDerived().AlwaysRebuild() && SubExpr.get() == E->getExpr()) return E; - return getSema().BuildUniqueStableName(E->getLocation(), SubExpr.get()); + return TI.getSema().BuildUniqueStableName(E->getLocation(), SubExpr.get()); } - if (E->getIdentKind() == PredefinedExpr::UniqueStableNameType) { - TypeSourceInfo *Info = getDerived().TransformType(E->getTypeSourceInfo()); - if (!Info) - return ExprError(); + llvm_unreachable("Only valid for UniqueStableNameType/Expr"); +} - if (!getDerived().AlwaysRebuild() && Info == E->getTypeSourceInfo()) - return E; - return getSema().BuildUniqueStableName(E->getLocation(), Info); - } +ExprResult +TemplateInstantiator::TransformPredefinedExpr(PredefinedExpr *E) { + if (!E->isTypeDependent()) + return E; + + if (E->getIdentKind() == PredefinedExpr::UniqueStableNameType || + E->getIdentKind() == PredefinedExpr::UniqueStableNameExpr) + return TransformUniqueStableName(*this, E); return getSema().BuildPredefinedExpr(E->getLocation(), E->getIdentKind()); } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 333d6d7576f40..d2ae78227feb4 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -399,7 +399,8 @@ static void instantiateOMPDeclareVariantAttr( // Copy the template version of the OMPTraitInfo and run substitute on all // score and condition expressiosn. - OMPTraitInfo TI = Attr.getTraitInfos(); + OMPTraitInfo &TI = S.getASTContext().getNewOMPTraitInfo(); + TI = *Attr.getTraitInfos(); // Try to substitute template parameters in score and condition expressions. 
auto SubstScoreOrConditionExpr = [&S, Subst](Expr *&E, bool) { diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 0ac822a0ee227..c0d039751d17c 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -129,6 +129,7 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr, case ParsedAttr::AT_NSReturnsRetained: \ case ParsedAttr::AT_NoReturn: \ case ParsedAttr::AT_Regparm: \ + case ParsedAttr::AT_CmseNSCall: \ case ParsedAttr::AT_AnyX86NoCallerSavedRegisters: \ case ParsedAttr::AT_AnyX86NoCfCheck: \ CALLING_CONV_ATTRS_CASELIST @@ -6617,6 +6618,7 @@ namespace { Desugar, Attributed, Parens, + Array, Pointer, BlockPointer, Reference, @@ -6637,6 +6639,10 @@ namespace { } else if (isa(Ty)) { T = cast(Ty)->getInnerType(); Stack.push_back(Parens); + } else if (isa(Ty) || isa(Ty) || + isa(Ty)) { + T = cast(Ty)->getElementType(); + Stack.push_back(Array); } else if (isa(Ty)) { T = cast(Ty)->getPointeeType(); Stack.push_back(Pointer); @@ -6714,6 +6720,27 @@ namespace { case MacroQualified: return wrap(C, cast(Old)->getUnderlyingType(), I); + case Array: { + if (const auto *CAT = dyn_cast(Old)) { + QualType New = wrap(C, CAT->getElementType(), I); + return C.getConstantArrayType(New, CAT->getSize(), CAT->getSizeExpr(), + CAT->getSizeModifier(), + CAT->getIndexTypeCVRQualifiers()); + } + + if (const auto *VAT = dyn_cast(Old)) { + QualType New = wrap(C, VAT->getElementType(), I); + return C.getVariableArrayType( + New, VAT->getSizeExpr(), VAT->getSizeModifier(), + VAT->getIndexTypeCVRQualifiers(), VAT->getBracketsRange()); + } + + const auto *IAT = cast(Old); + QualType New = wrap(C, IAT->getElementType(), I); + return C.getIncompleteArrayType(New, IAT->getSizeModifier(), + IAT->getIndexTypeCVRQualifiers()); + } + case Pointer: { QualType New = wrap(C, cast(Old)->getPointeeType(), I); return C.getPointerType(New); @@ -7182,6 +7209,25 @@ static bool handleFunctionTypeAttr(TypeProcessingState &state, ParsedAttr &attr, return true; } + if (attr.getKind() == ParsedAttr::AT_CmseNSCall) { + // Delay if this is not a function type. + if (!unwrapped.isFunctionType()) + return false; + + // Ignore if we don't have CMSE enabled. + if (!S.getLangOpts().Cmse) { + S.Diag(attr.getLoc(), diag::warn_attribute_ignored) << attr; + attr.setInvalid(); + return true; + } + + // Otherwise we can process right away. + FunctionType::ExtInfo EI = + unwrapped.get()->getExtInfo().withCmseNSCall(true); + type = unwrapped.wrap(S, S.Context.adjustFunctionType(unwrapped.get(), EI)); + return true; + } + // ns_returns_retained is not always a type attribute, but if we got // here, we're treating it as one right now. if (attr.getKind() == ParsedAttr::AT_NSReturnsRetained) { diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 91d0a1cfa74b6..552723d06d942 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -28,6 +28,7 @@ #include "clang/AST/StmtCXX.h" #include "clang/AST/StmtObjC.h" #include "clang/AST/StmtOpenMP.h" +#include "clang/Basic/OpenMPKinds.h" #include "clang/Sema/Designator.h" #include "clang/Sema/Lookup.h" #include "clang/Sema/Ownership.h" @@ -157,6 +158,13 @@ class TreeTransform { /// existing lambdas. bool ReplacingOriginal() { return false; } + /// Wether CXXConstructExpr can be skipped when they are implicit. + /// They will be reconstructed when used if needed. 
+ /// This is useful when the user that causes rebuilding of the + /// CXXConstructExpr is outside of the expression at which the TreeTransform + /// started. + bool AllowSkippingCXXConstructExpr() { return true; } + /// Returns the location of the entity being transformed, if that /// information was not available elsewhere in the AST. /// @@ -1714,17 +1722,16 @@ class TreeTransform { /// /// By default, performs semantic analysis to build the new statement. /// Subclasses may override this routine to provide different behavior. - OMPClause *RebuildOMPReductionClause(ArrayRef VarList, - SourceLocation StartLoc, - SourceLocation LParenLoc, - SourceLocation ColonLoc, - SourceLocation EndLoc, - CXXScopeSpec &ReductionIdScopeSpec, - const DeclarationNameInfo &ReductionId, - ArrayRef UnresolvedReductions) { + OMPClause *RebuildOMPReductionClause( + ArrayRef VarList, OpenMPReductionClauseModifier Modifier, + SourceLocation StartLoc, SourceLocation LParenLoc, + SourceLocation ModifierLoc, SourceLocation ColonLoc, + SourceLocation EndLoc, CXXScopeSpec &ReductionIdScopeSpec, + const DeclarationNameInfo &ReductionId, + ArrayRef UnresolvedReductions) { return getSema().ActOnOpenMPReductionClause( - VarList, StartLoc, LParenLoc, ColonLoc, EndLoc, ReductionIdScopeSpec, - ReductionId, UnresolvedReductions); + VarList, Modifier, StartLoc, LParenLoc, ModifierLoc, ColonLoc, EndLoc, + ReductionIdScopeSpec, ReductionId, UnresolvedReductions); } /// Build a new OpenMP 'task_reduction' clause. @@ -2050,6 +2057,30 @@ class TreeTransform { EndLoc); } + /// Build a new OpenMP 'inclusive' clause. + /// + /// By default, performs semantic analysis to build the new OpenMP clause. + /// Subclasses may override this routine to provide different behavior. + OMPClause *RebuildOMPInclusiveClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + return getSema().ActOnOpenMPInclusiveClause(VarList, StartLoc, LParenLoc, + EndLoc); + } + + /// Build a new OpenMP 'exclusive' clause. + /// + /// By default, performs semantic analysis to build the new OpenMP clause. + /// Subclasses may override this routine to provide different behavior. + OMPClause *RebuildOMPExclusiveClause(ArrayRef VarList, + SourceLocation StartLoc, + SourceLocation LParenLoc, + SourceLocation EndLoc) { + return getSema().ActOnOpenMPExclusiveClause(VarList, StartLoc, LParenLoc, + EndLoc); + } + /// Build a new OpenMP 'order' clause. /// /// By default, performs semantic analysis to build the new OpenMP clause. 
@@ -3477,6 +3508,11 @@ class TreeTransform { Sema::AtomicArgumentOrder::AST); } + ExprResult RebuildRecoveryExpr(SourceLocation BeginLoc, SourceLocation EndLoc, + ArrayRef SubExprs) { + return getSema().CreateRecoveryExpr(BeginLoc, EndLoc, SubExprs); + } + private: TypeLoc TransformTypeInObjectScope(TypeLoc TL, QualType ObjectType, @@ -4061,50 +4097,8 @@ template void TreeTransform::InventTemplateArgumentLoc( const TemplateArgument &Arg, TemplateArgumentLoc &Output) { - SourceLocation Loc = getDerived().getBaseLocation(); - switch (Arg.getKind()) { - case TemplateArgument::Null: - llvm_unreachable("null template argument in TreeTransform"); - break; - - case TemplateArgument::Type: - Output = TemplateArgumentLoc(Arg, - SemaRef.Context.getTrivialTypeSourceInfo(Arg.getAsType(), Loc)); - - break; - - case TemplateArgument::Template: - case TemplateArgument::TemplateExpansion: { - NestedNameSpecifierLocBuilder Builder; - TemplateName Template = Arg.getAsTemplateOrTemplatePattern(); - if (DependentTemplateName *DTN = Template.getAsDependentTemplateName()) - Builder.MakeTrivial(SemaRef.Context, DTN->getQualifier(), Loc); - else if (QualifiedTemplateName *QTN = Template.getAsQualifiedTemplateName()) - Builder.MakeTrivial(SemaRef.Context, QTN->getQualifier(), Loc); - - if (Arg.getKind() == TemplateArgument::Template) - Output = TemplateArgumentLoc(Arg, - Builder.getWithLocInContext(SemaRef.Context), - Loc); - else - Output = TemplateArgumentLoc(Arg, - Builder.getWithLocInContext(SemaRef.Context), - Loc, Loc); - - break; - } - - case TemplateArgument::Expression: - Output = TemplateArgumentLoc(Arg, Arg.getAsExpr()); - break; - - case TemplateArgument::Declaration: - case TemplateArgument::Integral: - case TemplateArgument::Pack: - case TemplateArgument::NullPtr: - Output = TemplateArgumentLoc(Arg, TemplateArgumentLocInfo()); - break; - } + Output = getSema().getTrivialTemplateArgumentLoc( + Arg, QualType(), getDerived().getBaseLocation()); } template @@ -4114,12 +4108,45 @@ bool TreeTransform::TransformTemplateArgument( const TemplateArgument &Arg = Input.getArgument(); switch (Arg.getKind()) { case TemplateArgument::Null: - case TemplateArgument::Integral: case TemplateArgument::Pack: - case TemplateArgument::Declaration: - case TemplateArgument::NullPtr: llvm_unreachable("Unexpected TemplateArgument"); + case TemplateArgument::Integral: + case TemplateArgument::NullPtr: + case TemplateArgument::Declaration: { + // Transform a resolved template argument straight to a resolved template + // argument. We get here when substituting into an already-substituted + // template type argument during concept satisfaction checking. + QualType T = Arg.getNonTypeTemplateArgumentType(); + QualType NewT = getDerived().TransformType(T); + if (NewT.isNull()) + return true; + + ValueDecl *D = Arg.getKind() == TemplateArgument::Declaration + ? Arg.getAsDecl() + : nullptr; + ValueDecl *NewD = D ? 
cast_or_null(getDerived().TransformDecl( + getDerived().getBaseLocation(), D)) + : nullptr; + if (D && !NewD) + return true; + + if (NewT == T && D == NewD) + Output = Input; + else if (Arg.getKind() == TemplateArgument::Integral) + Output = TemplateArgumentLoc( + TemplateArgument(getSema().Context, Arg.getAsIntegral(), NewT), + TemplateArgumentLocInfo()); + else if (Arg.getKind() == TemplateArgument::NullPtr) + Output = TemplateArgumentLoc(TemplateArgument(NewT, /*IsNullPtr=*/true), + TemplateArgumentLocInfo()); + else + Output = TemplateArgumentLoc(TemplateArgument(NewD, NewT), + TemplateArgumentLocInfo()); + + return false; + } + case TemplateArgument::Type: { TypeSourceInfo *DI = Input.getTypeSourceInfo(); if (!DI) @@ -8312,6 +8339,17 @@ TreeTransform::TransformOMPDepobjDirective(OMPDepobjDirective *D) { return Res; } +template +StmtResult +TreeTransform::TransformOMPScanDirective(OMPScanDirective *D) { + DeclarationNameInfo DirName; + getDerived().getSema().StartOpenMPDSABlock(OMPD_scan, DirName, nullptr, + D->getBeginLoc()); + StmtResult Res = getDerived().TransformOMPExecutableDirective(D); + getDerived().getSema().EndOpenMPDSABlock(Res.get()); + return Res; +} + template StmtResult TreeTransform::TransformOMPOrderedDirective(OMPOrderedDirective *D) { @@ -9057,8 +9095,9 @@ TreeTransform::TransformOMPReductionClause(OMPReductionClause *C) { UnresolvedReductions.push_back(nullptr); } return getDerived().RebuildOMPReductionClause( - Vars, C->getBeginLoc(), C->getLParenLoc(), C->getColonLoc(), - C->getEndLoc(), ReductionIdScopeSpec, NameInfo, UnresolvedReductions); + Vars, C->getModifier(), C->getBeginLoc(), C->getLParenLoc(), + C->getModifierLoc(), C->getColonLoc(), C->getEndLoc(), + ReductionIdScopeSpec, NameInfo, UnresolvedReductions); } template @@ -9519,6 +9558,36 @@ TreeTransform::TransformOMPNontemporalClause(OMPNontemporalClause *C) { Vars, C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc()); } +template +OMPClause * +TreeTransform::TransformOMPInclusiveClause(OMPInclusiveClause *C) { + llvm::SmallVector Vars; + Vars.reserve(C->varlist_size()); + for (auto *VE : C->varlists()) { + ExprResult EVar = getDerived().TransformExpr(cast(VE)); + if (EVar.isInvalid()) + return nullptr; + Vars.push_back(EVar.get()); + } + return getDerived().RebuildOMPInclusiveClause( + Vars, C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc()); +} + +template +OMPClause * +TreeTransform::TransformOMPExclusiveClause(OMPExclusiveClause *C) { + llvm::SmallVector Vars; + Vars.reserve(C->varlist_size()); + for (auto *VE : C->varlists()) { + ExprResult EVar = getDerived().TransformExpr(cast(VE)); + if (EVar.isInvalid()) + return nullptr; + Vars.push_back(EVar.get()); + } + return getDerived().RebuildOMPExclusiveClause( + Vars, C->getBeginLoc(), C->getLParenLoc(), C->getEndLoc()); +} + template OMPClause * TreeTransform::TransformOMPOrderClause(OMPOrderClause *C) { @@ -9810,6 +9879,24 @@ TreeTransform::TransformTypoExpr(TypoExpr *E) { return E; } +template +ExprResult TreeTransform::TransformRecoveryExpr(RecoveryExpr *E) { + llvm::SmallVector Children; + bool Changed = false; + for (Expr *C : E->subExpressions()) { + ExprResult NewC = getDerived().TransformExpr(C); + if (NewC.isInvalid()) + return ExprError(); + Children.push_back(NewC.get()); + + Changed |= NewC.get() != C; + } + if (!getDerived().AlwaysRebuild() && !Changed) + return E; + return getDerived().RebuildRecoveryExpr(E->getBeginLoc(), E->getEndLoc(), + Children); +} + template ExprResult TreeTransform::TransformPseudoObjectExpr(PseudoObjectExpr 
*E) { @@ -11674,10 +11761,11 @@ TreeTransform::TransformCXXConstructExpr(CXXConstructExpr *E) { // CXXConstructExprs other than for list-initialization and // CXXTemporaryObjectExpr are always implicit, so when we have // a 1-argument construction we just transform that argument. - if ((E->getNumArgs() == 1 || - (E->getNumArgs() > 1 && getDerived().DropCallArgument(E->getArg(1)))) && - (!getDerived().DropCallArgument(E->getArg(0))) && - !E->isListInitialization()) + if (getDerived().AllowSkippingCXXConstructExpr() && + ((E->getNumArgs() == 1 || + (E->getNumArgs() > 1 && getDerived().DropCallArgument(E->getArg(1)))) && + (!getDerived().DropCallArgument(E->getArg(0))) && + !E->isListInitialization())) return getDerived().TransformExpr(E->getArg(0)); TemporaryBase Rebase(*this, /*FIXME*/ E->getBeginLoc(), DeclarationName()); diff --git a/clang/lib/Sema/UsedDeclVisitor.h b/clang/lib/Sema/UsedDeclVisitor.h index be46f0d4affcc..d207e07f451ad 100644 --- a/clang/lib/Sema/UsedDeclVisitor.h +++ b/clang/lib/Sema/UsedDeclVisitor.h @@ -84,6 +84,18 @@ class UsedDeclVisitor : public EvaluatedExprVisitor { void VisitCXXDefaultArgExpr(CXXDefaultArgExpr *E) { asImpl().Visit(E->getExpr()); } + + void visitUsedDecl(SourceLocation Loc, Decl *D) { + if (auto *CD = dyn_cast(D)) { + if (auto *S = CD->getBody()) { + asImpl().Visit(S); + } + } else if (auto *CD = dyn_cast(D)) { + if (auto *S = CD->getBody()) { + asImpl().Visit(S); + } + } + } }; } // end namespace clang diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index dd0fa9f70dafd..c2d28f4362418 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -3773,6 +3773,11 @@ ASTReader::ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities) { } break; } + + case DECLS_TO_CHECK_FOR_DEFERRED_DIAGS: + for (unsigned I = 0, N = Record.size(); I != N; ++I) + DeclsToCheckForDeferredDiags.push_back(getGlobalDeclID(F, Record[I])); + break; } } } @@ -8180,6 +8185,19 @@ void ASTReader::ReadUnusedLocalTypedefNameCandidates( UnusedLocalTypedefNameCandidates.clear(); } +void ASTReader::ReadDeclsToCheckForDeferredDiags( + llvm::SmallVector &Decls) { + for (unsigned I = 0, N = DeclsToCheckForDeferredDiags.size(); I != N; + ++I) { + auto *D = dyn_cast_or_null( + GetDecl(DeclsToCheckForDeferredDiags[I])); + if (D) + Decls.push_back(D); + } + DeclsToCheckForDeferredDiags.clear(); +} + + void ASTReader::ReadReferencedSelectors( SmallVectorImpl> &Sels) { if (ReferencedSelectorsData.empty()) @@ -11824,6 +11842,12 @@ OMPClause *OMPClauseReader::readClause() { case OMPC_nontemporal: C = OMPNontemporalClause::CreateEmpty(Context, Record.readInt()); break; + case OMPC_inclusive: + C = OMPInclusiveClause::CreateEmpty(Context, Record.readInt()); + break; + case OMPC_exclusive: + C = OMPExclusiveClause::CreateEmpty(Context, Record.readInt()); + break; case OMPC_order: C = new (Context) OMPOrderClause(); break; @@ -12069,7 +12093,9 @@ void OMPClauseReader::VisitOMPSharedClause(OMPSharedClause *C) { void OMPClauseReader::VisitOMPReductionClause(OMPReductionClause *C) { VisitOMPClauseWithPostUpdate(C); C->setLParenLoc(Record.readSourceLocation()); + C->setModifierLoc(Record.readSourceLocation()); C->setColonLoc(Record.readSourceLocation()); + C->setModifier(Record.readEnum()); NestedNameSpecifierLoc NNSL = Record.readNestedNameSpecifierLoc(); DeclarationNameInfo DNI = Record.readDeclarationNameInfo(); C->setQualifierLoc(NNSL); @@ -12634,14 +12660,34 @@ void 
OMPClauseReader::VisitOMPNontemporalClause(OMPNontemporalClause *C) { C->setPrivateRefs(Vars); } +void OMPClauseReader::VisitOMPInclusiveClause(OMPInclusiveClause *C) { + C->setLParenLoc(Record.readSourceLocation()); + unsigned NumVars = C->varlist_size(); + SmallVector Vars; + Vars.reserve(NumVars); + for (unsigned i = 0; i != NumVars; ++i) + Vars.push_back(Record.readSubExpr()); + C->setVarRefs(Vars); +} + +void OMPClauseReader::VisitOMPExclusiveClause(OMPExclusiveClause *C) { + C->setLParenLoc(Record.readSourceLocation()); + unsigned NumVars = C->varlist_size(); + SmallVector Vars; + Vars.reserve(NumVars); + for (unsigned i = 0; i != NumVars; ++i) + Vars.push_back(Record.readSubExpr()); + C->setVarRefs(Vars); +} + void OMPClauseReader::VisitOMPOrderClause(OMPOrderClause *C) { C->setKind(Record.readEnum()); C->setLParenLoc(Record.readSourceLocation()); C->setKindKwLoc(Record.readSourceLocation()); } -OMPTraitInfo ASTRecordReader::readOMPTraitInfo() { - OMPTraitInfo TI; +OMPTraitInfo *ASTRecordReader::readOMPTraitInfo() { + OMPTraitInfo &TI = getContext().getNewOMPTraitInfo(); TI.Sets.resize(readUInt32()); for (auto &Set : TI.Sets) { Set.Kind = readEnum(); @@ -12656,5 +12702,5 @@ OMPTraitInfo ASTRecordReader::readOMPTraitInfo() { Property.Kind = readEnum(); } } - return TI; + return &TI; } diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index bb2ecb53c7cee..3bd7b825cdc82 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -2744,7 +2744,7 @@ class AttrReader { return Reader.readVersionTuple(); } - OMPTraitInfo readOMPTraitInfo() { return Reader.readOMPTraitInfo(); } + OMPTraitInfo *readOMPTraitInfo() { return Reader.readOMPTraitInfo(); } template T *GetLocalDeclAs(uint32_t LocalID) { return Reader.GetLocalDeclAs(LocalID); diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index d74b0d514eda0..5d9033e379777 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -106,7 +106,8 @@ namespace clang { /// The number of record fields required for the Expr class /// itself. - static const unsigned NumExprFields = NumStmtFields + 7; + static const unsigned NumExprFields = + NumStmtFields + ExprDependenceBits + 3; /// Read and initialize a ExplicitTemplateArgumentList structure. 
void ReadTemplateKWAndArgsInfo(ASTTemplateKWAndArgsInfo &Args, @@ -517,6 +518,7 @@ void ASTStmtReader::VisitExpr(Expr *E) { bool ValueDependent = Record.readInt(); bool InstantiationDependent = Record.readInt(); bool ContainsUnexpandedTemplateParameters = Record.readInt(); + bool ContainsErrors = Record.readInt(); auto Deps = ExprDependence::None; if (TypeDependent) Deps |= ExprDependence::Type; @@ -526,6 +528,8 @@ void ASTStmtReader::VisitExpr(Expr *E) { Deps |= ExprDependence::Instantiation; if (ContainsUnexpandedTemplateParameters) Deps |= ExprDependence::UnexpandedPack; + if (ContainsErrors) + Deps |= ExprDependence::Error; E->setDependence(Deps); E->setValueKind(static_cast(Record.readInt())); @@ -2076,6 +2080,19 @@ void ASTStmtReader::VisitTypoExpr(TypoExpr *E) { llvm_unreachable("Cannot read TypoExpr nodes"); } +void ASTStmtReader::VisitRecoveryExpr(RecoveryExpr *E) { + VisitExpr(E); + unsigned NumArgs = Record.readInt(); + E->BeginLoc = readSourceLocation(); + E->EndLoc = readSourceLocation(); + assert( + (NumArgs == std::distance(E->children().begin(), E->children().end())) && + "Wrong NumArgs!"); + (void)NumArgs; + for (Stmt *&Child : E->children()) + Child = Record.readSubStmt(); +} + //===----------------------------------------------------------------------===// // Microsoft Expressions and Statements //===----------------------------------------------------------------------===// @@ -2373,6 +2390,13 @@ void ASTStmtReader::VisitOMPDepobjDirective(OMPDepobjDirective *D) { VisitOMPExecutableDirective(D); } +void ASTStmtReader::VisitOMPScanDirective(OMPScanDirective *D) { + VisitStmt(D); + // The NumClauses field was read in ReadStmtFromStream. + Record.skipInts(1); + VisitOMPExecutableDirective(D); +} + void ASTStmtReader::VisitOMPOrderedDirective(OMPOrderedDirective *D) { VisitStmt(D); // The NumClauses field was read in ReadStmtFromStream. @@ -2846,6 +2870,11 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { Context, /*NumArgs=*/Record[ASTStmtReader::NumExprFields], Empty); break; + case EXPR_RECOVERY: + S = RecoveryExpr::CreateEmpty( + Context, /*NumArgs=*/Record[ASTStmtReader::NumExprFields]); + break; + case EXPR_MEMBER: S = MemberExpr::CreateEmpty(Context, Record[ASTStmtReader::NumExprFields], Record[ASTStmtReader::NumExprFields + 1], @@ -3209,6 +3238,11 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { Context, Record[ASTStmtReader::NumStmtFields], Empty); break; + case STMT_OMP_SCAN_DIRECTIVE: + S = OMPScanDirective::CreateEmpty( + Context, Record[ASTStmtReader::NumStmtFields], Empty); + break; + case STMT_OMP_ORDERED_DIRECTIVE: S = OMPOrderedDirective::CreateEmpty( Context, Record[ASTStmtReader::NumStmtFields], Empty); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index c96e46543dbad..f7c58ed11d9f4 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -500,6 +500,7 @@ void ASTWriter::WriteTypeAbbrevs() { Abv->Add(BitCodeAbbrevOp(0)); // ProducesResult Abv->Add(BitCodeAbbrevOp(0)); // NoCallerSavedRegs Abv->Add(BitCodeAbbrevOp(0)); // NoCfCheck + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); // CmseNSCall // FunctionProtoType Abv->Add(BitCodeAbbrevOp(0)); // IsVariadic Abv->Add(BitCodeAbbrevOp(0)); // HasTrailingReturn @@ -756,6 +757,7 @@ void ASTWriter::WriteBlockInfoBlock() { RECORD(DELETE_EXPRS_TO_ANALYZE); RECORD(CUDA_PRAGMA_FORCE_HOST_DEVICE_DEPTH); RECORD(PP_CONDITIONAL_STACK); + RECORD(DECLS_TO_CHECK_FOR_DEFERRED_DIAGS); // SourceManager Block. 
BLOCK(SOURCE_MANAGER_BLOCK); @@ -4671,6 +4673,11 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema &SemaRef, StringRef isysroot, Buffer.data(), Buffer.size()); } + // Build a record containing all of the DeclsToCheckForDeferredDiags. + RecordData DeclsToCheckForDeferredDiags; + for (auto *D : SemaRef.DeclsToCheckForDeferredDiags) + AddDeclRef(D, DeclsToCheckForDeferredDiags); + RecordData DeclUpdatesOffsetsRecord; // Keep writing types, declarations, and declaration update records @@ -4762,6 +4769,11 @@ ASTFileSignature ASTWriter::WriteASTCore(Sema &SemaRef, StringRef isysroot, if (!SemaDeclRefs.empty()) Stream.EmitRecord(SEMA_DECL_REFS, SemaDeclRefs); + // Write the record containing decls to be checked for deferred diags. + if (!DeclsToCheckForDeferredDiags.empty()) + Stream.EmitRecord(DECLS_TO_CHECK_FOR_DEFERRED_DIAGS, + DeclsToCheckForDeferredDiags); + // Write the record containing CUDA-specific declaration references. if (!CUDASpecialDeclRefs.empty()) Stream.EmitRecord(CUDA_SPECIAL_DECL_REFS, CUDASpecialDeclRefs); @@ -6231,7 +6243,9 @@ void OMPClauseWriter::VisitOMPReductionClause(OMPReductionClause *C) { Record.push_back(C->varlist_size()); VisitOMPClauseWithPostUpdate(C); Record.AddSourceLocation(C->getLParenLoc()); + Record.AddSourceLocation(C->getModifierLoc()); Record.AddSourceLocation(C->getColonLoc()); + Record.writeEnum(C->getModifier()); Record.AddNestedNameSpecifierLoc(C->getQualifierLoc()); Record.AddDeclarationNameInfo(C->getNameInfo()); for (auto *VE : C->varlists()) @@ -6593,15 +6607,29 @@ void OMPClauseWriter::VisitOMPNontemporalClause(OMPNontemporalClause *C) { Record.AddStmt(E); } +void OMPClauseWriter::VisitOMPInclusiveClause(OMPInclusiveClause *C) { + Record.push_back(C->varlist_size()); + Record.AddSourceLocation(C->getLParenLoc()); + for (auto *VE : C->varlists()) + Record.AddStmt(VE); +} + +void OMPClauseWriter::VisitOMPExclusiveClause(OMPExclusiveClause *C) { + Record.push_back(C->varlist_size()); + Record.AddSourceLocation(C->getLParenLoc()); + for (auto *VE : C->varlists()) + Record.AddStmt(VE); +} + void OMPClauseWriter::VisitOMPOrderClause(OMPOrderClause *C) { Record.writeEnum(C->getKind()); Record.AddSourceLocation(C->getLParenLoc()); Record.AddSourceLocation(C->getKindKwLoc()); } -void ASTRecordWriter::writeOMPTraitInfo(const OMPTraitInfo &TI) { - writeUInt32(TI.Sets.size()); - for (const auto &Set : TI.Sets) { +void ASTRecordWriter::writeOMPTraitInfo(const OMPTraitInfo *TI) { + writeUInt32(TI->Sets.size()); + for (const auto &Set : TI->Sets) { writeEnum(Set.Kind); writeUInt32(Set.Selectors.size()); for (const auto &Selector : Set.Selectors) { diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index f7d89299e140c..ddda3931d3199 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -2280,6 +2280,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //ValueDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //InstantiationDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //UnexpandedParamPack + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //ContainsErrors Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetValueKind Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetObjectKind //DeclRefExpr @@ -2303,6 +2304,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //ValueDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 
1)); //InstantiationDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //UnexpandedParamPack + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //ContainsErrors Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetValueKind Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetObjectKind //Integer Literal @@ -2321,6 +2323,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //ValueDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //InstantiationDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //UnexpandedParamPack + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //ContainsErrors Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetValueKind Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetObjectKind //Character Literal @@ -2339,6 +2342,7 @@ void ASTWriter::WriteDeclAbbrevs() { Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //ValueDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //InstantiationDependent Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //UnexpandedParamPack + Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 1)); //ContainsErrors Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetValueKind Abv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 3)); //GetObjectKind // CastExpr diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 588977525b65a..d64e2330850a1 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -540,6 +540,7 @@ void ASTStmtWriter::VisitExpr(Expr *E) { Record.push_back(E->isValueDependent()); Record.push_back(E->isInstantiationDependent()); Record.push_back(E->containsUnexpandedParameterPack()); + Record.push_back(E->containsErrors()); Record.push_back(E->getValueKind()); Record.push_back(E->getObjectKind()); } @@ -785,6 +786,16 @@ void ASTStmtWriter::VisitCallExpr(CallExpr *E) { Code = serialization::EXPR_CALL; } +void ASTStmtWriter::VisitRecoveryExpr(RecoveryExpr *E) { + VisitExpr(E); + Record.push_back(std::distance(E->children().begin(), E->children().end())); + Record.AddSourceLocation(E->getBeginLoc()); + Record.AddSourceLocation(E->getEndLoc()); + for (Stmt *Child : E->children()) + Record.AddStmt(Child); + Code = serialization::EXPR_RECOVERY; +} + void ASTStmtWriter::VisitMemberExpr(MemberExpr *E) { VisitExpr(E); @@ -2329,6 +2340,13 @@ void ASTStmtWriter::VisitOMPDepobjDirective(OMPDepobjDirective *D) { Code = serialization::STMT_OMP_DEPOBJ_DIRECTIVE; } +void ASTStmtWriter::VisitOMPScanDirective(OMPScanDirective *D) { + VisitStmt(D); + Record.push_back(D->getNumClauses()); + VisitOMPExecutableDirective(D); + Code = serialization::STMT_OMP_SCAN_DIRECTIVE; +} + void ASTStmtWriter::VisitOMPOrderedDirective(OMPOrderedDirective *D) { VisitStmt(D); Record.push_back(D->getNumClauses()); diff --git a/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp b/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp index 955a40b0d4b75..9a813b0a1a456 100644 --- a/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/IteratorModeling.cpp @@ -86,6 +86,15 @@ class IteratorModeling : public Checker, check::Bind, check::LiveSymbols, check::DeadSymbols> { + using AdvanceFn = void (IteratorModeling::*)(CheckerContext &, const Expr *, + SVal, SVal, SVal) const; + + void handleOverloadedOperator(CheckerContext &C, const CallEvent &Call, + OverloadedOperatorKind Op) const; + void 
handleAdvanceLikeFunction(CheckerContext &C, const CallEvent &Call, + const Expr *OrigExpr, + const AdvanceFn *Handler) const; + void handleComparison(CheckerContext &C, const Expr *CE, SVal RetVal, const SVal &LVal, const SVal &RVal, OverloadedOperatorKind Op) const; @@ -99,13 +108,39 @@ class IteratorModeling void handleRandomIncrOrDecr(CheckerContext &C, const Expr *CE, OverloadedOperatorKind Op, const SVal &RetVal, const SVal &LHS, const SVal &RHS) const; + void handleAdvance(CheckerContext &C, const Expr *CE, SVal RetVal, SVal Iter, + SVal Amount) const; + void handlePrev(CheckerContext &C, const Expr *CE, SVal RetVal, SVal Iter, + SVal Amount) const; + void handleNext(CheckerContext &C, const Expr *CE, SVal RetVal, SVal Iter, + SVal Amount) const; void assignToContainer(CheckerContext &C, const Expr *CE, const SVal &RetVal, const MemRegion *Cont) const; + bool noChangeInAdvance(CheckerContext &C, SVal Iter, const Expr *CE) const; void printState(raw_ostream &Out, ProgramStateRef State, const char *NL, const char *Sep) const override; + // std::advance, std::prev & std::next + CallDescriptionMap AdvanceLikeFunctions = { + // template + // void advance(InputIt& it, Distance n); + {{{"std", "advance"}, 2}, &IteratorModeling::handleAdvance}, + + // template + // BidirIt prev( + // BidirIt it, + // typename std::iterator_traits::difference_type n = 1); + {{{"std", "prev"}, 2}, &IteratorModeling::handlePrev}, + + // template + // ForwardIt next( + // ForwardIt it, + // typename std::iterator_traits::difference_type n = 1); + {{{"std", "next"}, 2}, &IteratorModeling::handleNext}, + }; + public: - IteratorModeling() {} + IteratorModeling() = default; void checkPostCall(const CallEvent &Call, CheckerContext &C) const; void checkBind(SVal Loc, SVal Val, const Stmt *S, CheckerContext &C) const; @@ -123,6 +158,7 @@ ProgramStateRef relateSymbols(ProgramStateRef State, SymbolRef Sym1, SymbolRef Sym2, bool Equal); bool isBoundThroughLazyCompoundVal(const Environment &Env, const MemRegion *Reg); +const ExplodedNode *findCallEnter(const ExplodedNode *Node, const Expr *Call); } // namespace @@ -135,101 +171,52 @@ void IteratorModeling::checkPostCall(const CallEvent &Call, if (Func->isOverloadedOperator()) { const auto Op = Func->getOverloadedOperator(); - if (isSimpleComparisonOperator(Op)) { - const auto *OrigExpr = Call.getOriginExpr(); - if (!OrigExpr) - return; - - if (const auto *InstCall = dyn_cast(&Call)) { - handleComparison(C, OrigExpr, Call.getReturnValue(), - InstCall->getCXXThisVal(), Call.getArgSVal(0), Op); - return; - } - - handleComparison(C, OrigExpr, Call.getReturnValue(), Call.getArgSVal(0), - Call.getArgSVal(1), Op); - return; - } else if (isRandomIncrOrDecrOperator(Func->getOverloadedOperator())) { - const auto *OrigExpr = Call.getOriginExpr(); - if (!OrigExpr) - return; - - if (const auto *InstCall = dyn_cast(&Call)) { - if (Call.getNumArgs() >= 1 && - Call.getArgExpr(0)->getType()->isIntegralOrEnumerationType()) { - handleRandomIncrOrDecr(C, OrigExpr, Func->getOverloadedOperator(), - Call.getReturnValue(), - InstCall->getCXXThisVal(), Call.getArgSVal(0)); - return; - } - } else { - if (Call.getNumArgs() >= 2 && - Call.getArgExpr(1)->getType()->isIntegralOrEnumerationType()) { - handleRandomIncrOrDecr(C, OrigExpr, Func->getOverloadedOperator(), - Call.getReturnValue(), Call.getArgSVal(0), - Call.getArgSVal(1)); - return; - } - } - } else if (isIncrementOperator(Func->getOverloadedOperator())) { - if (const auto *InstCall = dyn_cast(&Call)) { - handleIncrement(C, 
Call.getReturnValue(), InstCall->getCXXThisVal(), - Call.getNumArgs()); - return; - } + handleOverloadedOperator(C, Call, Op); + return; + } - handleIncrement(C, Call.getReturnValue(), Call.getArgSVal(0), - Call.getNumArgs()); - return; - } else if (isDecrementOperator(Func->getOverloadedOperator())) { - if (const auto *InstCall = dyn_cast(&Call)) { - handleDecrement(C, Call.getReturnValue(), InstCall->getCXXThisVal(), - Call.getNumArgs()); - return; - } + const auto *OrigExpr = Call.getOriginExpr(); + if (!OrigExpr) + return; - handleDecrement(C, Call.getReturnValue(), Call.getArgSVal(0), - Call.getNumArgs()); - return; - } - } else { - if (!isIteratorType(Call.getResultType())) - return; + const AdvanceFn *Handler = AdvanceLikeFunctions.lookup(Call); + if (Handler) { + handleAdvanceLikeFunction(C, Call, OrigExpr, Handler); + return; + } - const auto *OrigExpr = Call.getOriginExpr(); - if (!OrigExpr) - return; + if (!isIteratorType(Call.getResultType())) + return; - auto State = C.getState(); + auto State = C.getState(); - // Already bound to container? - if (getIteratorPosition(State, Call.getReturnValue())) - return; + // Already bound to container? + if (getIteratorPosition(State, Call.getReturnValue())) + return; - // Copy-like and move constructors - if (isa(&Call) && Call.getNumArgs() == 1) { - if (const auto *Pos = getIteratorPosition(State, Call.getArgSVal(0))) { - State = setIteratorPosition(State, Call.getReturnValue(), *Pos); - if (cast(Func)->isMoveConstructor()) { - State = removeIteratorPosition(State, Call.getArgSVal(0)); - } - C.addTransition(State); - return; + // Copy-like and move constructors + if (isa(&Call) && Call.getNumArgs() == 1) { + if (const auto *Pos = getIteratorPosition(State, Call.getArgSVal(0))) { + State = setIteratorPosition(State, Call.getReturnValue(), *Pos); + if (cast(Func)->isMoveConstructor()) { + State = removeIteratorPosition(State, Call.getArgSVal(0)); } + C.addTransition(State); + return; } + } - // Assumption: if return value is an iterator which is not yet bound to a - // container, then look for the first iterator argument, and - // bind the return value to the same container. This approach - // works for STL algorithms. - // FIXME: Add a more conservative mode - for (unsigned i = 0; i < Call.getNumArgs(); ++i) { - if (isIteratorType(Call.getArgExpr(i)->getType())) { - if (const auto *Pos = getIteratorPosition(State, Call.getArgSVal(i))) { - assignToContainer(C, OrigExpr, Call.getReturnValue(), - Pos->getContainer()); - return; - } + // Assumption: if return value is an iterator which is not yet bound to a + // container, then look for the first iterator argument, and + // bind the return value to the same container. This approach + // works for STL algorithms. 
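The restructured checkPostCall above splits operator handling into handleOverloadedOperator and routes std::advance, std::prev and std::next through a CallDescriptionMap whose values are pointer-to-member handlers. A reduced standalone sketch of that dispatch pattern follows; std::map keyed by a plain qualified name stands in for the CallDescriptionMap, and the handler bodies are placeholders:

#include <cassert>
#include <iostream>
#include <map>
#include <string>

class IteratorModel {
  // Pointer-to-member type for the per-function handlers, analogous to the
  // checker's AdvanceFn typedef.
  using Handler = void (IteratorModel::*)(int Amount) const;

  void handleAdvance(int Amount) const { std::cout << "advance " << Amount << "\n"; }
  void handlePrev(int Amount) const { std::cout << "prev " << Amount << "\n"; }
  void handleNext(int Amount) const { std::cout << "next " << Amount << "\n"; }

  // Stand-in for the call-description table: qualified callee name -> handler.
  const std::map<std::string, Handler> AdvanceLikeFunctions = {
      {"std::advance", &IteratorModel::handleAdvance},
      {"std::prev", &IteratorModel::handlePrev},
      {"std::next", &IteratorModel::handleNext},
  };

public:
  // Mirrors the lookup-then-invoke step in checkPostCall. The real map lookup
  // yields a pointer to the stored member pointer, hence the extra '*' in the
  // patch's (this->**Handler)(...) call.
  bool modelCall(const std::string &Callee, int Amount) const {
    auto It = AdvanceLikeFunctions.find(Callee);
    if (It == AdvanceLikeFunctions.end())
      return false;
    (this->*(It->second))(Amount);
    return true;
  }
};

int main() {
  IteratorModel M;
  assert(M.modelCall("std::next", 2));      // dispatched to handleNext
  assert(!M.modelCall("std::distance", 0)); // not an advance-like function
  return 0;
}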
+ // FIXME: Add a more conservative mode + for (unsigned i = 0; i < Call.getNumArgs(); ++i) { + if (isIteratorType(Call.getArgExpr(i)->getType())) { + if (const auto *Pos = getIteratorPosition(State, Call.getArgSVal(i))) { + assignToContainer(C, OrigExpr, Call.getReturnValue(), + Pos->getContainer()); + return; } } } @@ -310,6 +297,91 @@ void IteratorModeling::checkDeadSymbols(SymbolReaper &SR, C.addTransition(State); } +void +IteratorModeling::handleOverloadedOperator(CheckerContext &C, + const CallEvent &Call, + OverloadedOperatorKind Op) const { + if (isSimpleComparisonOperator(Op)) { + const auto *OrigExpr = Call.getOriginExpr(); + if (!OrigExpr) + return; + + if (const auto *InstCall = dyn_cast(&Call)) { + handleComparison(C, OrigExpr, Call.getReturnValue(), + InstCall->getCXXThisVal(), Call.getArgSVal(0), Op); + return; + } + + handleComparison(C, OrigExpr, Call.getReturnValue(), Call.getArgSVal(0), + Call.getArgSVal(1), Op); + return; + } else if (isRandomIncrOrDecrOperator(Op)) { + const auto *OrigExpr = Call.getOriginExpr(); + if (!OrigExpr) + return; + + if (const auto *InstCall = dyn_cast(&Call)) { + if (Call.getNumArgs() >= 1 && + Call.getArgExpr(0)->getType()->isIntegralOrEnumerationType()) { + handleRandomIncrOrDecr(C, OrigExpr, Op, Call.getReturnValue(), + InstCall->getCXXThisVal(), Call.getArgSVal(0)); + return; + } + } else { + if (Call.getNumArgs() >= 2 && + Call.getArgExpr(1)->getType()->isIntegralOrEnumerationType()) { + handleRandomIncrOrDecr(C, OrigExpr, Op, Call.getReturnValue(), + Call.getArgSVal(0), Call.getArgSVal(1)); + return; + } + } + } else if (isIncrementOperator(Op)) { + if (const auto *InstCall = dyn_cast(&Call)) { + handleIncrement(C, Call.getReturnValue(), InstCall->getCXXThisVal(), + Call.getNumArgs()); + return; + } + + handleIncrement(C, Call.getReturnValue(), Call.getArgSVal(0), + Call.getNumArgs()); + return; + } else if (isDecrementOperator(Op)) { + if (const auto *InstCall = dyn_cast(&Call)) { + handleDecrement(C, Call.getReturnValue(), InstCall->getCXXThisVal(), + Call.getNumArgs()); + return; + } + + handleDecrement(C, Call.getReturnValue(), Call.getArgSVal(0), + Call.getNumArgs()); + return; + } +} + +void +IteratorModeling::handleAdvanceLikeFunction(CheckerContext &C, + const CallEvent &Call, + const Expr *OrigExpr, + const AdvanceFn *Handler) const { + if (!C.wasInlined) { + (this->**Handler)(C, OrigExpr, Call.getReturnValue(), + Call.getArgSVal(0), Call.getArgSVal(1)); + return; + } + + // If std::advance() was inlined, but a non-standard function it calls inside + // was not, then we have to model it explicitly + const auto *IdInfo = cast(Call.getDecl())->getIdentifier(); + if (IdInfo) { + if (IdInfo->getName() == "advance") { + if (noChangeInAdvance(C, Call.getArgSVal(0), OrigExpr)) { + (this->**Handler)(C, OrigExpr, Call.getReturnValue(), + Call.getArgSVal(0), Call.getArgSVal(1)); + } + } + } +} + void IteratorModeling::handleComparison(CheckerContext &C, const Expr *CE, SVal RetVal, const SVal &LVal, const SVal &RVal, @@ -481,6 +553,22 @@ void IteratorModeling::handleRandomIncrOrDecr(CheckerContext &C, } } +void IteratorModeling::handleAdvance(CheckerContext &C, const Expr *CE, + SVal RetVal, SVal Iter, + SVal Amount) const { + handleRandomIncrOrDecr(C, CE, OO_PlusEqual, RetVal, Iter, Amount); +} + +void IteratorModeling::handlePrev(CheckerContext &C, const Expr *CE, + SVal RetVal, SVal Iter, SVal Amount) const { + handleRandomIncrOrDecr(C, CE, OO_Minus, RetVal, Iter, Amount); +} + +void IteratorModeling::handleNext(CheckerContext &C, 
const Expr *CE, + SVal RetVal, SVal Iter, SVal Amount) const { + handleRandomIncrOrDecr(C, CE, OO_Plus, RetVal, Iter, Amount); +} + void IteratorModeling::assignToContainer(CheckerContext &C, const Expr *CE, const SVal &RetVal, const MemRegion *Cont) const { @@ -493,6 +581,31 @@ void IteratorModeling::assignToContainer(CheckerContext &C, const Expr *CE, C.addTransition(State); } +bool IteratorModeling::noChangeInAdvance(CheckerContext &C, SVal Iter, + const Expr *CE) const { + // Compare the iterator position before and after the call. (To be called + // from `checkPostCall()`.) + const auto StateAfter = C.getState(); + + const auto *PosAfter = getIteratorPosition(StateAfter, Iter); + // If we have no position after the call of `std::advance`, then we are not + // interested. (Modeling of an inlined `std::advance()` should not remove the + // position in any case.) + if (!PosAfter) + return false; + + const ExplodedNode *N = findCallEnter(C.getPredecessor(), CE); + assert(N && "Any call should have a `CallEnter` node."); + + const auto StateBefore = N->getState(); + const auto *PosBefore = getIteratorPosition(StateBefore, Iter); + + assert(PosBefore && "`std::advance() should not create new iterator " + "position but change existing ones"); + + return PosBefore->getOffset() == PosAfter->getOffset(); +} + void IteratorModeling::printState(raw_ostream &Out, ProgramStateRef State, const char *NL, const char *Sep) const { auto SymbolMap = State->get(); @@ -584,6 +697,20 @@ bool isBoundThroughLazyCompoundVal(const Environment &Env, return false; } +const ExplodedNode *findCallEnter(const ExplodedNode *Node, const Expr *Call) { + while (Node) { + ProgramPoint PP = Node->getLocation(); + if (auto Enter = PP.getAs()) { + if (Enter->getCallExpr() == Call) + break; + } + + Node = Node->getFirstPred(); + } + + return Node; +} + } // namespace void ento::registerIteratorModeling(CheckerManager &mgr) { diff --git a/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp index bd8b84d464b66..f9b493bf9bb03 100644 --- a/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/IteratorRangeChecker.cpp @@ -31,18 +31,30 @@ class IteratorRangeChecker std::unique_ptr OutOfRangeBugType; - void verifyDereference(CheckerContext &C, const SVal &Val) const; - void verifyIncrement(CheckerContext &C, const SVal &Iter) const; - void verifyDecrement(CheckerContext &C, const SVal &Iter) const; + void verifyDereference(CheckerContext &C, SVal Val) const; + void verifyIncrement(CheckerContext &C, SVal Iter) const; + void verifyDecrement(CheckerContext &C, SVal Iter) const; void verifyRandomIncrOrDecr(CheckerContext &C, OverloadedOperatorKind Op, - const SVal &LHS, const SVal &RHS) const; - void reportBug(const StringRef &Message, const SVal &Val, - CheckerContext &C, ExplodedNode *ErrNode) const; + SVal LHS, SVal RHS) const; + void verifyAdvance(CheckerContext &C, SVal LHS, SVal RHS) const; + void verifyPrev(CheckerContext &C, SVal LHS, SVal RHS) const; + void verifyNext(CheckerContext &C, SVal LHS, SVal RHS) const; + void reportBug(const StringRef &Message, SVal Val, CheckerContext &C, + ExplodedNode *ErrNode) const; + public: IteratorRangeChecker(); void checkPreCall(const CallEvent &Call, CheckerContext &C) const; + using AdvanceFn = void (IteratorRangeChecker::*)(CheckerContext &, SVal, + SVal) const; + + CallDescriptionMap AdvanceFunctions = { + {{{"std", "advance"}, 2}, &IteratorRangeChecker::verifyAdvance}, + 
{{{"std", "prev"}, 2}, &IteratorRangeChecker::verifyPrev}, + {{{"std", "next"}, 2}, &IteratorRangeChecker::verifyNext}, + }; }; bool isPastTheEnd(ProgramStateRef State, const IteratorPosition &Pos); @@ -107,11 +119,23 @@ void IteratorRangeChecker::checkPreCall(const CallEvent &Call, verifyDereference(C, Call.getArgSVal(0)); } } + } else { + const AdvanceFn *Verifier = AdvanceFunctions.lookup(Call); + if (Verifier) { + if (Call.getNumArgs() > 1) { + (this->**Verifier)(C, Call.getArgSVal(0), Call.getArgSVal(1)); + } else { + auto &BVF = C.getSValBuilder().getBasicValueFactory(); + (this->**Verifier)( + C, Call.getArgSVal(0), + nonloc::ConcreteInt(BVF.getValue(llvm::APSInt::get(1)))); + } + } } } void IteratorRangeChecker::verifyDereference(CheckerContext &C, - const SVal &Val) const { + SVal Val) const { auto State = C.getState(); const auto *Pos = getIteratorPosition(State, Val); if (Pos && isPastTheEnd(State, *Pos)) { @@ -123,24 +147,21 @@ void IteratorRangeChecker::verifyDereference(CheckerContext &C, } } -void IteratorRangeChecker::verifyIncrement(CheckerContext &C, - const SVal &Iter) const { +void IteratorRangeChecker::verifyIncrement(CheckerContext &C, SVal Iter) const { auto &BVF = C.getSValBuilder().getBasicValueFactory(); verifyRandomIncrOrDecr(C, OO_Plus, Iter, nonloc::ConcreteInt(BVF.getValue(llvm::APSInt::get(1)))); } -void IteratorRangeChecker::verifyDecrement(CheckerContext &C, - const SVal &Iter) const { +void IteratorRangeChecker::verifyDecrement(CheckerContext &C, SVal Iter) const { auto &BVF = C.getSValBuilder().getBasicValueFactory(); verifyRandomIncrOrDecr(C, OO_Minus, Iter, nonloc::ConcreteInt(BVF.getValue(llvm::APSInt::get(1)))); } void IteratorRangeChecker::verifyRandomIncrOrDecr(CheckerContext &C, - OverloadedOperatorKind Op, - const SVal &LHS, - const SVal &RHS) const { + OverloadedOperatorKind Op, + SVal LHS, SVal RHS) const { auto State = C.getState(); auto Value = RHS; @@ -180,9 +201,24 @@ void IteratorRangeChecker::verifyRandomIncrOrDecr(CheckerContext &C, } } -void IteratorRangeChecker::reportBug(const StringRef &Message, - const SVal &Val, CheckerContext &C, - ExplodedNode *ErrNode) const { +void IteratorRangeChecker::verifyAdvance(CheckerContext &C, SVal LHS, + SVal RHS) const { + verifyRandomIncrOrDecr(C, OO_PlusEqual, LHS, RHS); +} + +void IteratorRangeChecker::verifyPrev(CheckerContext &C, SVal LHS, + SVal RHS) const { + verifyRandomIncrOrDecr(C, OO_Minus, LHS, RHS); +} + +void IteratorRangeChecker::verifyNext(CheckerContext &C, SVal LHS, + SVal RHS) const { + verifyRandomIncrOrDecr(C, OO_Plus, LHS, RHS); +} + +void IteratorRangeChecker::reportBug(const StringRef &Message, SVal Val, + CheckerContext &C, + ExplodedNode *ErrNode) const { auto R = std::make_unique(*OutOfRangeBugType, Message, ErrNode); R->markInteresting(Val); diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 210a4ff199c63..ff296e7ea46b2 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// // // This checker improves modeling of a few simple library functions. -// It does not generate warnings. // // This checker provides a specification format - `Summary' - and // contains descriptions of some library functions in this format. 
Each @@ -51,6 +50,7 @@ //===----------------------------------------------------------------------===// #include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h" +#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h" #include "clang/StaticAnalyzer/Core/Checker.h" #include "clang/StaticAnalyzer/Core/CheckerManager.h" #include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h" @@ -61,7 +61,8 @@ using namespace clang; using namespace clang::ento; namespace { -class StdLibraryFunctionsChecker : public Checker { +class StdLibraryFunctionsChecker + : public Checker { /// Below is a series of typedefs necessary to define function specs. /// We avoid nesting types here because each additional qualifier /// would need to be repeated in every function spec. @@ -87,6 +88,15 @@ class StdLibraryFunctionsChecker : public Checker { typedef uint32_t ArgNo; static const ArgNo Ret; + class ValueConstraint; + + // Pointer to the ValueConstraint. We need a copyable, polymorphic and + // default initialize able type (vector needs that). A raw pointer was good, + // however, we cannot default initialize that. unique_ptr makes the Summary + // class non-copyable, therefore not an option. Releasing the copyability + // requirement would render the initialization of the Summary map infeasible. + using ValueConstraintPtr = std::shared_ptr; + /// Polymorphic base class that represents a constraint on a given argument /// (or return value) of a function. Derived classes implement different kind /// of constraints, e.g range constraints or correlation between two @@ -99,6 +109,9 @@ class StdLibraryFunctionsChecker : public Checker { /// is returned then the constraint is not feasible. virtual ProgramStateRef apply(ProgramStateRef State, const CallEvent &Call, const Summary &Summary) const = 0; + virtual ValueConstraintPtr negate() const { + llvm_unreachable("Not implemented"); + }; ArgNo getArgNo() const { return ArgN; } protected: @@ -139,6 +152,19 @@ class StdLibraryFunctionsChecker : public Checker { } llvm_unreachable("Unknown range kind!"); } + + ValueConstraintPtr negate() const override { + RangeConstraint Tmp(*this); + switch (Kind) { + case OutOfRange: + Tmp.Kind = WithinRange; + break; + case WithinRange: + Tmp.Kind = OutOfRange; + break; + } + return std::make_shared(Tmp); + } }; class ComparisonConstraint : public ValueConstraint { @@ -155,22 +181,54 @@ class StdLibraryFunctionsChecker : public Checker { const Summary &Summary) const override; }; - // Pointer to the ValueConstraint. We need a copyable, polymorphic and - // default initialize able type (vector needs that). A raw pointer was good, - // however, we cannot default initialize that. unique_ptr makes the Summary - // class non-copyable, therefore not an option. Releasing the copyability - // requirement would render the initialization of the Summary map infeasible. - using ValueConstraintPtr = std::shared_ptr; + class NotNullConstraint : public ValueConstraint { + using ValueConstraint::ValueConstraint; + // This variable has a role when we negate the constraint. 
+ bool CannotBeNull = true; + + public: + ProgramStateRef apply(ProgramStateRef State, const CallEvent &Call, + const Summary &Summary) const override { + SVal V = getArgSVal(Call, getArgNo()); + DefinedOrUnknownSVal L = V.castAs(); + if (!L.getAs()) + return State; + + return State->assume(L, CannotBeNull); + } + + ValueConstraintPtr negate() const override { + NotNullConstraint Tmp(*this); + Tmp.CannotBeNull = !this->CannotBeNull; + return std::make_shared(Tmp); + } + }; + /// The complete list of constraints that defines a single branch. typedef std::vector ConstraintSet; using ArgTypes = std::vector; using Cases = std::vector; - /// Includes information about function prototype (which is necessary to - /// ensure we're modeling the right function and casting values properly), - /// approach to invalidation, and a list of branches - essentially, a list - /// of list of ranges - essentially, a list of lists of lists of segments. + /// Includes information about + /// * function prototype (which is necessary to + /// ensure we're modeling the right function and casting values properly), + /// * approach to invalidation, + /// * a list of branches - a list of list of ranges - + /// A branch represents a path in the exploded graph of a function (which + /// is a tree). So, a branch is a series of assumptions. In other words, + /// branches represent split states and additional assumptions on top of + /// the splitting assumption. + /// For example, consider the branches in `isalpha(x)` + /// Branch 1) + /// x is in range ['A', 'Z'] or in ['a', 'z'] + /// then the return value is not 0. (I.e. out-of-range [0, 0]) + /// Branch 2) + /// x is out-of-range ['A', 'Z'] and out-of-range ['a', 'z'] + /// then the return value is 0. + /// * a list of argument constraints, that must be true on every branch. + /// If these constraints are not satisfied that means a fatal error + /// usually resulting in undefined behaviour. struct Summary { const ArgTypes ArgTys; const QualType RetTy; @@ -185,6 +243,10 @@ class StdLibraryFunctionsChecker : public Checker { CaseConstraints.push_back(std::move(CS)); return *this; } + Summary &ArgConstraint(ValueConstraintPtr VC) { + ArgConstraints.push_back(VC); + return *this; + } private: static void assertTypeSuitableForSummary(QualType T) { @@ -192,9 +254,6 @@ class StdLibraryFunctionsChecker : public Checker { "We should have had no significant void types in the spec"); assert(T.isCanonical() && "We should only have canonical types in the spec"); - // FIXME: lift this assert (but not the ones above!) - assert(T->isIntegralOrEnumerationType() && - "We only support integral ranges in the spec"); } public: @@ -220,6 +279,8 @@ class StdLibraryFunctionsChecker : public Checker { // lazily, and it doesn't change after initialization. mutable llvm::StringMap FunctionSummaryMap; + mutable std::unique_ptr BT_InvalidArg; + // Auxiliary functions to support ArgNo within all structures // in a unified manner. 
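As the comments above explain, constraints are stored behind a shared_ptr alias so that the per-summary vectors stay copyable and default-constructible while the constraint objects remain polymorphic, and each constraint can hand back its own negation. A small self-contained sketch of that object model, with toy range and not-null constraints over plain ints rather than the checker's real ValueConstraint hierarchy:

#include <cassert>
#include <memory>
#include <vector>

class Constraint;
using ConstraintPtr = std::shared_ptr<Constraint>;

// Polymorphic base: a predicate over an argument value plus its negation.
class Constraint {
public:
  virtual ~Constraint() = default;
  virtual bool holds(int Arg) const = 0;
  virtual ConstraintPtr negate() const = 0;
};

// "Within range" flips to "out of range" when negated.
class RangeConstraint : public Constraint {
  int Lo, Hi;
  bool Within;
public:
  RangeConstraint(int Lo, int Hi, bool Within = true)
      : Lo(Lo), Hi(Hi), Within(Within) {}
  bool holds(int Arg) const override {
    bool In = Lo <= Arg && Arg <= Hi;
    return Within ? In : !In;
  }
  ConstraintPtr negate() const override {
    return std::make_shared<RangeConstraint>(Lo, Hi, !Within);
  }
};

// "Must not be null" keeps a flag that the negation flips, like the
// CannotBeNull member above.
class NotNullConstraint : public Constraint {
  bool CannotBeNull = true;
public:
  bool holds(int Arg) const override { return CannotBeNull ? Arg != 0 : Arg == 0; }
  ConstraintPtr negate() const override {
    auto Copy = std::make_shared<NotNullConstraint>(*this);
    Copy->CannotBeNull = !CannotBeNull;
    return Copy;
  }
};

int main() {
  // shared_ptr keeps the vector copyable even though the elements are abstract.
  std::vector<ConstraintPtr> ArgConstraints = {
      std::make_shared<RangeConstraint>('A', 'Z'),
      std::make_shared<NotNullConstraint>()};
  assert(ArgConstraints[0]->holds('Q') && !ArgConstraints[0]->negate()->holds('Q'));
  assert(ArgConstraints[1]->holds(7) && ArgConstraints[1]->negate()->holds(0));
  return 0;
}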
static QualType getArgType(const Summary &Summary, ArgNo ArgN) { @@ -238,15 +299,37 @@ class StdLibraryFunctionsChecker : public Checker { } public: + void checkPreCall(const CallEvent &Call, CheckerContext &C) const; void checkPostCall(const CallEvent &Call, CheckerContext &C) const; bool evalCall(const CallEvent &Call, CheckerContext &C) const; + enum CheckKind { CK_StdCLibraryFunctionArgsChecker, CK_NumCheckKinds }; + DefaultBool ChecksEnabled[CK_NumCheckKinds]; + CheckerNameRef CheckNames[CK_NumCheckKinds]; + private: Optional findFunctionSummary(const FunctionDecl *FD, const CallExpr *CE, CheckerContext &C) const; + Optional findFunctionSummary(const CallEvent &Call, + CheckerContext &C) const; void initFunctionSummaries(CheckerContext &C) const; + + void reportBug(const CallEvent &Call, ExplodedNode *N, + CheckerContext &C) const { + if (!ChecksEnabled[CK_StdCLibraryFunctionArgsChecker]) + return; + // TODO Add detailed diagnostic. + StringRef Msg = "Function argument constraint is not satisfied"; + if (!BT_InvalidArg) + BT_InvalidArg = std::make_unique( + CheckNames[CK_StdCLibraryFunctionArgsChecker], + "Unsatisfied argument constraints", categories::LogicError); + auto R = std::make_unique(*BT_InvalidArg, Msg, N); + bugreporter::trackExpressionValue(N, Call.getArgExpr(0), *R); + C.emitReport(std::move(R)); + } }; const StdLibraryFunctionsChecker::ArgNo StdLibraryFunctionsChecker::Ret = @@ -360,17 +443,37 @@ ProgramStateRef StdLibraryFunctionsChecker::ComparisonConstraint::apply( return State; } -void StdLibraryFunctionsChecker::checkPostCall(const CallEvent &Call, - CheckerContext &C) const { - const FunctionDecl *FD = dyn_cast_or_null(Call.getDecl()); - if (!FD) +void StdLibraryFunctionsChecker::checkPreCall(const CallEvent &Call, + CheckerContext &C) const { + Optional FoundSummary = findFunctionSummary(Call, C); + if (!FoundSummary) return; - const CallExpr *CE = dyn_cast_or_null(Call.getOriginExpr()); - if (!CE) - return; + const Summary &Summary = *FoundSummary; + ProgramStateRef State = C.getState(); + + for (const ValueConstraintPtr& VC : Summary.ArgConstraints) { + ProgramStateRef SuccessSt = VC->apply(State, Call, Summary); + ProgramStateRef FailureSt = VC->negate()->apply(State, Call, Summary); + // The argument constraint is not satisfied. + if (FailureSt && !SuccessSt) { + if (ExplodedNode *N = C.generateErrorNode(State)) + reportBug(Call, N, C); + break; + } else { + // Apply the constraint even if we cannot reason about the argument. This + // means both SuccessSt and FailureSt can be true. If we weren't applying + // the constraint that would mean that symbolic execution continues on a + // code whose behaviour is undefined. 
+ assert(SuccessSt); + C.addTransition(SuccessSt); + } + } +} - Optional FoundSummary = findFunctionSummary(FD, CE, C); +void StdLibraryFunctionsChecker::checkPostCall(const CallEvent &Call, + CheckerContext &C) const { + Optional FoundSummary = findFunctionSummary(Call, C); if (!FoundSummary) return; @@ -394,15 +497,7 @@ void StdLibraryFunctionsChecker::checkPostCall(const CallEvent &Call, bool StdLibraryFunctionsChecker::evalCall(const CallEvent &Call, CheckerContext &C) const { - const auto *FD = dyn_cast_or_null(Call.getDecl()); - if (!FD) - return false; - - const auto *CE = dyn_cast_or_null(Call.getOriginExpr()); - if (!CE) - return false; - - Optional FoundSummary = findFunctionSummary(FD, CE, C); + Optional FoundSummary = findFunctionSummary(Call, C); if (!FoundSummary) return false; @@ -411,6 +506,7 @@ bool StdLibraryFunctionsChecker::evalCall(const CallEvent &Call, case EvalCallAsPure: { ProgramStateRef State = C.getState(); const LocationContext *LC = C.getLocationContext(); + const auto *CE = cast_or_null(Call.getOriginExpr()); SVal V = C.getSValBuilder().conjureSymbolVal( CE, LC, CE->getType().getCanonicalType(), C.blockCount()); State = State->BindExpr(CE, LC, V); @@ -490,6 +586,18 @@ StdLibraryFunctionsChecker::findFunctionSummary(const FunctionDecl *FD, return None; } +Optional +StdLibraryFunctionsChecker::findFunctionSummary(const CallEvent &Call, + CheckerContext &C) const { + const FunctionDecl *FD = dyn_cast_or_null(Call.getDecl()); + if (!FD) + return None; + const CallExpr *CE = dyn_cast_or_null(Call.getOriginExpr()); + if (!CE) + return None; + return findFunctionSummary(FD, CE, C); +} + void StdLibraryFunctionsChecker::initFunctionSummaries( CheckerContext &C) const { if (!FunctionSummaryMap.empty()) @@ -512,6 +620,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( const QualType LongTy = ACtx.LongTy; const QualType LongLongTy = ACtx.LongLongTy; const QualType SizeTy = ACtx.getSizeType(); + const QualType VoidPtrTy = ACtx.VoidPtrTy; // void *T + const QualType ConstVoidPtrTy = + ACtx.getPointerType(ACtx.VoidTy.withConst()); // const void *T const RangeInt IntMax = BVF.getMaxValue(IntTy).getLimitedValue(); const RangeInt LongMax = BVF.getMaxValue(LongTy).getLimitedValue(); @@ -582,9 +693,11 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( return IntRangeVector{std::pair{v, v}}; }; auto LessThanOrEq = BO_LE; + auto NotNull = [&](ArgNo ArgN) { + return std::make_shared(ArgN); + }; using RetType = QualType; - // Templates for summaries that are reused by many functions. 
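The new checkPreCall above reports a bug only when an argument constraint is infeasible while its negation is still feasible, and otherwise assumes the constraint so that later reasoning starts from the constrained state. A standalone sketch of that decision rule over a toy state, a single inclusive value range standing in for ProgramStateRef, with illustrative names:

#include <algorithm>
#include <cassert>
#include <optional>

// Toy "program state": the inclusive range an argument may take.
struct State { int Lo, Hi; };

// Intersect the state with [Lo, Hi]; an empty intersection means the
// constraint is infeasible in this state.
std::optional<State> assumeInRange(State S, int Lo, int Hi) {
  State R{std::max(S.Lo, Lo), std::min(S.Hi, Hi)};
  if (R.Lo > R.Hi)
    return std::nullopt;
  return R;
}

enum class Outcome { Warn, Assume };

// Mirrors the SuccessSt/FailureSt decision in checkPreCall: warn only when the
// constraint cannot hold while its negation can; otherwise keep the
// constrained state so later reasoning builds on it.
Outcome checkArgConstraint(State S, int Lo, int Hi, State &Out) {
  std::optional<State> SuccessSt = assumeInRange(S, Lo, Hi);
  // The negated constraint is feasible whenever the state admits values
  // outside [Lo, Hi].
  bool FailureFeasible = S.Lo < Lo || S.Hi > Hi;
  if (FailureFeasible && !SuccessSt)
    return Outcome::Warn; // the argument can only violate the constraint
  Out = *SuccessSt;       // apply the constraint even if not fully decidable
  return Outcome::Assume;
}

int main() {
  State Out{0, 0};
  // isalpha-style constraint: the argument must be EOF (-1) or an unsigned
  // char value, i.e. within [-1, 255].
  assert(checkArgConstraint({0, 1000}, -1, 255, Out) == Outcome::Assume);
  assert(checkArgConstraint({300, 400}, -1, 255, Out) == Outcome::Warn);
  return 0;
}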
auto Getc = [&]() { return Summary(ArgTypes{Irrelevant}, RetType{IntTy}, NoEvalCall) @@ -598,11 +711,20 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( ReturnValueCondition(WithinRange, Range(-1, Max))}); }; auto Fread = [&]() { - return Summary(ArgTypes{Irrelevant, Irrelevant, SizeTy, Irrelevant}, + return Summary(ArgTypes{VoidPtrTy, Irrelevant, SizeTy, Irrelevant}, + RetType{SizeTy}, NoEvalCall) + .Case({ + ReturnValueCondition(LessThanOrEq, ArgNo(2)), + }) + .ArgConstraint(NotNull(ArgNo(0))); + }; + auto Fwrite = [&]() { + return Summary(ArgTypes{ConstVoidPtrTy, Irrelevant, SizeTy, Irrelevant}, RetType{SizeTy}, NoEvalCall) .Case({ ReturnValueCondition(LessThanOrEq, ArgNo(2)), - }); + }) + .ArgConstraint(NotNull(ArgNo(0))); }; auto Getline = [&](RetType R, RangeInt Max) { return Summary(ArgTypes{Irrelevant, Irrelevant, Irrelevant}, RetType{R}, @@ -612,6 +734,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( FunctionSummaryMap = { // The isascii() family of functions. + // The behavior is undefined if the value of the argument is not + // representable as unsigned char or is not equal to EOF. See e.g. C99 + // 7.4.1.2 The isalpha function (p: 181-182). { "isalnum", Summaries{ @@ -631,7 +756,9 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( {'A', 'Z'}, {'a', 'z'}, {128, UCharRangeMax}}), - ReturnValueCondition(WithinRange, SingleValue(0))})}, + ReturnValueCondition(WithinRange, SingleValue(0))}) + .ArgConstraint(ArgumentCondition( + 0U, WithinRange, {{EOFv, EOFv}, {0, UCharRangeMax}}))}, }, { "isalpha", @@ -799,7 +926,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( {"write", Summaries{Read(IntTy, IntMax), Read(LongTy, LongMax), Read(LongLongTy, LongLongMax)}}, {"fread", Summaries{Fread()}}, - {"fwrite", Summaries{Fread()}}, + {"fwrite", Summaries{Fwrite()}}, // getline()-like functions either fail or read at least the delimiter. {"getline", Summaries{Getline(IntTy, IntMax), Getline(LongTy, LongMax), Getline(LongLongTy, LongLongMax)}}, @@ -809,13 +936,22 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( } void ento::registerStdCLibraryFunctionsChecker(CheckerManager &mgr) { - // If this checker grows large enough to support C++, Objective-C, or other - // standard libraries, we could use multiple register...Checker() functions, - // which would register various checkers with the help of the same Checker - // class, turning on different function summaries. mgr.registerChecker(); } bool ento::shouldRegisterStdCLibraryFunctionsChecker(const LangOptions &LO) { return true; } + +#define REGISTER_CHECKER(name) \ + void ento::register##name(CheckerManager &mgr) { \ + StdLibraryFunctionsChecker *checker = \ + mgr.getChecker(); \ + checker->ChecksEnabled[StdLibraryFunctionsChecker::CK_##name] = true; \ + checker->CheckNames[StdLibraryFunctionsChecker::CK_##name] = \ + mgr.getCurrentCheckerName(); \ + } \ + \ + bool ento::shouldRegister##name(const LangOptions &LO) { return true; } + +REGISTER_CHECKER(StdCLibraryFunctionArgsChecker) diff --git a/clang/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp b/clang/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp index 99e16752b51a4..01ac2bc83bb6b 100644 --- a/clang/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp +++ b/clang/lib/StaticAnalyzer/Core/AnalyzerOptions.cpp @@ -134,9 +134,9 @@ StringRef AnalyzerOptions::getCheckerStringOption(StringRef CheckerName, CheckerName = CheckerName.substr(0, Pos); } while (!CheckerName.empty() && SearchInParents); - assert(false && "Unknown checker option! 
Did you call getChecker*Option " - "with incorrect parameters? User input must've been " - "verified by CheckerRegistry."); + llvm_unreachable("Unknown checker option! Did you call getChecker*Option " + "with incorrect parameters? User input must've been " + "verified by CheckerRegistry."); return ""; } diff --git a/clang/lib/StaticAnalyzer/Core/CMakeLists.txt b/clang/lib/StaticAnalyzer/Core/CMakeLists.txt index 081d922ede800..dc2a6279b737c 100644 --- a/clang/lib/StaticAnalyzer/Core/CMakeLists.txt +++ b/clang/lib/StaticAnalyzer/Core/CMakeLists.txt @@ -45,6 +45,7 @@ add_clang_library(clangStaticAnalyzerCore SValBuilder.cpp SVals.cpp SymbolManager.cpp + TextDiagnostics.cpp WorkList.cpp LINK_LIBS @@ -56,5 +57,6 @@ add_clang_library(clangStaticAnalyzerCore clangFrontend clangLex clangRewrite + clangToolingCore ) diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index 1b13c49713ba1..b9adee87436a9 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1225,6 +1225,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::UnresolvedLookupExprClass: case Stmt::UnresolvedMemberExprClass: case Stmt::TypoExprClass: + case Stmt::RecoveryExprClass: case Stmt::CXXNoexceptExprClass: case Stmt::PackExpansionExprClass: case Stmt::SubstNonTypeTemplateParmPackExprClass: @@ -1258,6 +1259,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, case Stmt::OMPTaskgroupDirectiveClass: case Stmt::OMPFlushDirectiveClass: case Stmt::OMPDepobjDirectiveClass: + case Stmt::OMPScanDirectiveClass: case Stmt::OMPOrderedDirectiveClass: case Stmt::OMPAtomicDirectiveClass: case Stmt::OMPTargetDirectiveClass: diff --git a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp index 002b6070ddcd1..184fdcfb3d4b7 100644 --- a/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp +++ b/clang/lib/StaticAnalyzer/Core/HTMLDiagnostics.cpp @@ -66,11 +66,9 @@ class HTMLDiagnostics : public PathDiagnosticConsumer { const bool SupportsCrossFileDiagnostics; public: - HTMLDiagnostics(AnalyzerOptions &AnalyzerOpts, - const std::string& prefix, - const Preprocessor &pp, - bool supportsMultipleFiles) - : Directory(prefix), PP(pp), AnalyzerOpts(AnalyzerOpts), + HTMLDiagnostics(AnalyzerOptions &AnalyzerOpts, const std::string &OutputDir, + const Preprocessor &pp, bool supportsMultipleFiles) + : Directory(OutputDir), PP(pp), AnalyzerOpts(AnalyzerOpts), SupportsCrossFileDiagnostics(supportsMultipleFiles) {} ~HTMLDiagnostics() override { FlushDiagnostics(nullptr); } @@ -136,16 +134,45 @@ class HTMLDiagnostics : public PathDiagnosticConsumer { void ento::createHTMLDiagnosticConsumer( AnalyzerOptions &AnalyzerOpts, PathDiagnosticConsumers &C, - const std::string &prefix, const Preprocessor &PP, - const cross_tu::CrossTranslationUnitContext &) { - C.push_back(new HTMLDiagnostics(AnalyzerOpts, prefix, PP, true)); + const std::string &OutputDir, const Preprocessor &PP, + const cross_tu::CrossTranslationUnitContext &CTU) { + + // FIXME: HTML is currently our default output type, but if the output + // directory isn't specified, it acts like if it was in the minimal text + // output mode. This doesn't make much sense, we should have the minimal text + // as our default. In the case of backward compatibility concerns, this could + // be preserved with -analyzer-config-compatibility-mode=true. 
+ createTextMinimalPathDiagnosticConsumer(AnalyzerOpts, C, OutputDir, PP, CTU); + + // TODO: Emit an error here. + if (OutputDir.empty()) + return; + + C.push_back(new HTMLDiagnostics(AnalyzerOpts, OutputDir, PP, true)); } void ento::createHTMLSingleFileDiagnosticConsumer( + AnalyzerOptions &AnalyzerOpts, PathDiagnosticConsumers &C, + const std::string &OutputDir, const Preprocessor &PP, + const cross_tu::CrossTranslationUnitContext &CTU) { + + // TODO: Emit an error here. + if (OutputDir.empty()) + return; + + C.push_back(new HTMLDiagnostics(AnalyzerOpts, OutputDir, PP, false)); + createTextMinimalPathDiagnosticConsumer(AnalyzerOpts, C, OutputDir, PP, CTU); +} + +void ento::createPlistHTMLDiagnosticConsumer( AnalyzerOptions &AnalyzerOpts, PathDiagnosticConsumers &C, const std::string &prefix, const Preprocessor &PP, - const cross_tu::CrossTranslationUnitContext &) { - C.push_back(new HTMLDiagnostics(AnalyzerOpts, prefix, PP, false)); + const cross_tu::CrossTranslationUnitContext &CTU) { + createHTMLDiagnosticConsumer( + AnalyzerOpts, C, std::string(llvm::sys::path::parent_path(prefix)), PP, + CTU); + createPlistMultiFileDiagnosticConsumer(AnalyzerOpts, C, prefix, PP, CTU); + createTextMinimalPathDiagnosticConsumer(AnalyzerOpts, C, prefix, PP, CTU); } //===----------------------------------------------------------------------===// diff --git a/clang/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp index babd1401a5a73..9b6369aee7a87 100644 --- a/clang/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp +++ b/clang/lib/StaticAnalyzer/Core/PlistDiagnostics.cpp @@ -45,8 +45,8 @@ namespace { AnalyzerOptions &AnOpts; const bool SupportsCrossFileDiagnostics; public: - PlistDiagnostics(AnalyzerOptions &AnalyzerOpts, const std::string &prefix, - const Preprocessor &PP, + PlistDiagnostics(AnalyzerOptions &AnalyzerOpts, + const std::string &OutputFile, const Preprocessor &PP, const cross_tu::CrossTranslationUnitContext &CTU, bool supportsMultipleFiles); @@ -582,19 +582,32 @@ PlistDiagnostics::PlistDiagnostics( void ento::createPlistDiagnosticConsumer( AnalyzerOptions &AnalyzerOpts, PathDiagnosticConsumers &C, - const std::string &s, const Preprocessor &PP, + const std::string &OutputFile, const Preprocessor &PP, const cross_tu::CrossTranslationUnitContext &CTU) { - C.push_back(new PlistDiagnostics(AnalyzerOpts, s, PP, CTU, + + // TODO: Emit an error here. + if (OutputFile.empty()) + return; + + C.push_back(new PlistDiagnostics(AnalyzerOpts, OutputFile, PP, CTU, /*supportsMultipleFiles*/ false)); + createTextMinimalPathDiagnosticConsumer(AnalyzerOpts, C, OutputFile, PP, CTU); } void ento::createPlistMultiFileDiagnosticConsumer( AnalyzerOptions &AnalyzerOpts, PathDiagnosticConsumers &C, - const std::string &s, const Preprocessor &PP, + const std::string &OutputFile, const Preprocessor &PP, const cross_tu::CrossTranslationUnitContext &CTU) { - C.push_back(new PlistDiagnostics(AnalyzerOpts, s, PP, CTU, + + // TODO: Emit an error here. 
+ if (OutputFile.empty()) + return; + + C.push_back(new PlistDiagnostics(AnalyzerOpts, OutputFile, PP, CTU, /*supportsMultipleFiles*/ true)); + createTextMinimalPathDiagnosticConsumer(AnalyzerOpts, C, OutputFile, PP, CTU); } + void PlistDiagnostics::FlushDiagnosticsImpl( std::vector &Diags, FilesMade *filesMade) { diff --git a/clang/lib/StaticAnalyzer/Core/SarifDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/SarifDiagnostics.cpp index eed45aed620f1..8c2e856015768 100644 --- a/clang/lib/StaticAnalyzer/Core/SarifDiagnostics.cpp +++ b/clang/lib/StaticAnalyzer/Core/SarifDiagnostics.cpp @@ -50,8 +50,14 @@ class SarifDiagnostics : public PathDiagnosticConsumer { void ento::createSarifDiagnosticConsumer( AnalyzerOptions &AnalyzerOpts, PathDiagnosticConsumers &C, const std::string &Output, const Preprocessor &PP, - const cross_tu::CrossTranslationUnitContext &) { + const cross_tu::CrossTranslationUnitContext &CTU) { + + // TODO: Emit an error here. + if (Output.empty()) + return; + C.push_back(new SarifDiagnostics(AnalyzerOpts, Output, PP.getLangOpts())); + createTextMinimalPathDiagnosticConsumer(AnalyzerOpts, C, Output, PP, CTU); } static StringRef getFileName(const FileEntry &FE) { diff --git a/clang/lib/StaticAnalyzer/Core/TextDiagnostics.cpp b/clang/lib/StaticAnalyzer/Core/TextDiagnostics.cpp new file mode 100644 index 0000000000000..07e5685e604cb --- /dev/null +++ b/clang/lib/StaticAnalyzer/Core/TextDiagnostics.cpp @@ -0,0 +1,148 @@ +//===--- TextDiagnostics.cpp - Text Diagnostics for Paths -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the TextDiagnostics object. +// +//===----------------------------------------------------------------------===// + +#include "clang/Analysis/PathDiagnostic.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Basic/Version.h" +#include "clang/CrossTU/CrossTranslationUnit.h" +#include "clang/Frontend/ASTUnit.h" +#include "clang/Lex/Preprocessor.h" +#include "clang/Rewrite/Core/Rewriter.h" +#include "clang/StaticAnalyzer/Core/AnalyzerOptions.h" +#include "clang/StaticAnalyzer/Core/PathDiagnosticConsumers.h" +#include "clang/Tooling/Core/Replacement.h" +#include "clang/Tooling/Tooling.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" + +using namespace clang; +using namespace ento; +using namespace tooling; + +namespace { +/// Emits minimal diagnostics (report message + notes) for the 'none' output +/// type to the standard error, or to complement many others. Emits detailed +/// diagnostics in textual format for the 'text' output type.
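Across the plist, SARIF and HTML factories above, every output type now also registers the minimal text consumer, and a missing output path makes the file-based consumer drop out silently for now. A stripped-down sketch of that registration shape, modelled on the HTML variant (the plist and SARIF factories instead return before registering anything; the TODOs note this should eventually become an error), with toy consumer types in place of PathDiagnosticConsumer:

#include <memory>
#include <string>
#include <vector>

// Toy stand-ins for the path-diagnostic consumer machinery.
struct Consumer { virtual ~Consumer() = default; };
struct MinimalTextConsumer : Consumer {};
struct FileBasedConsumer : Consumer { std::string OutputFile; };

using Consumers = std::vector<std::unique_ptr<Consumer>>;

// The minimal text consumer is always added; the file-based consumer is
// skipped when no output path was given.
void createFileBasedConsumer(Consumers &C, const std::string &OutputDir) {
  C.push_back(std::make_unique<MinimalTextConsumer>());
  if (OutputDir.empty())
    return;
  auto FC = std::make_unique<FileBasedConsumer>();
  FC->OutputFile = OutputDir;
  C.push_back(std::move(FC));
}

int main() {
  Consumers C;
  createFileBasedConsumer(C, "");           // only the minimal text consumer
  createFileBasedConsumer(C, "report_dir"); // minimal text + file-based
  return C.size() == 3 ? 0 : 1;
}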
+class TextDiagnostics : public PathDiagnosticConsumer { + DiagnosticsEngine &DiagEng; + LangOptions LO; + const bool IncludePath = false; + const bool ShouldEmitAsError = false; + const bool ApplyFixIts = false; + +public: + TextDiagnostics(DiagnosticsEngine &DiagEng, LangOptions LO, + bool ShouldIncludePath, const AnalyzerOptions &AnOpts) + : DiagEng(DiagEng), LO(LO), IncludePath(ShouldIncludePath), + ShouldEmitAsError(AnOpts.AnalyzerWerror), + ApplyFixIts(AnOpts.ShouldApplyFixIts) {} + ~TextDiagnostics() override {} + + StringRef getName() const override { return "TextDiagnostics"; } + + bool supportsLogicalOpControlFlow() const override { return true; } + bool supportsCrossFileDiagnostics() const override { return true; } + + PathGenerationScheme getGenerationScheme() const override { + return IncludePath ? Minimal : None; + } + + void FlushDiagnosticsImpl(std::vector &Diags, + FilesMade *filesMade) override { + unsigned WarnID = + ShouldEmitAsError + ? DiagEng.getCustomDiagID(DiagnosticsEngine::Error, "%0") + : DiagEng.getCustomDiagID(DiagnosticsEngine::Warning, "%0"); + unsigned NoteID = DiagEng.getCustomDiagID(DiagnosticsEngine::Note, "%0"); + SourceManager &SM = DiagEng.getSourceManager(); + + Replacements Repls; + auto reportPiece = [&](unsigned ID, FullSourceLoc Loc, StringRef String, + ArrayRef Ranges, + ArrayRef Fixits) { + if (!ApplyFixIts) { + DiagEng.Report(Loc, ID) << String << Ranges << Fixits; + return; + } + + DiagEng.Report(Loc, ID) << String << Ranges; + for (const FixItHint &Hint : Fixits) { + Replacement Repl(SM, Hint.RemoveRange, Hint.CodeToInsert); + + if (llvm::Error Err = Repls.add(Repl)) { + llvm::errs() << "Error applying replacement " << Repl.toString() + << ": " << Err << "\n"; + } + } + }; + + for (std::vector::iterator I = Diags.begin(), + E = Diags.end(); + I != E; ++I) { + const PathDiagnostic *PD = *I; + reportPiece(WarnID, PD->getLocation().asLocation(), + PD->getShortDescription(), PD->path.back()->getRanges(), + PD->path.back()->getFixits()); + + // First, add extra notes, even if paths should not be included. + for (const auto &Piece : PD->path) { + if (!isa(Piece.get())) + continue; + + reportPiece(NoteID, Piece->getLocation().asLocation(), + Piece->getString(), Piece->getRanges(), Piece->getFixits()); + } + + if (!IncludePath) + continue; + + // Then, add the path notes if necessary. 
+ PathPieces FlatPath = PD->path.flatten(/*ShouldFlattenMacros=*/true); + for (const auto &Piece : FlatPath) { + if (isa(Piece.get())) + continue; + + reportPiece(NoteID, Piece->getLocation().asLocation(), + Piece->getString(), Piece->getRanges(), Piece->getFixits()); + } + } + + if (!ApplyFixIts || Repls.empty()) + return; + + Rewriter Rewrite(SM, LO); + if (!applyAllReplacements(Repls, Rewrite)) { + llvm::errs() << "An error occurred while applying fix-its.\n"; + } + + Rewrite.overwriteChangedFiles(); + } +}; +} // end anonymous namespace + +void ento::createTextPathDiagnosticConsumer( + AnalyzerOptions &AnalyzerOpts, PathDiagnosticConsumers &C, + const std::string &Prefix, const clang::Preprocessor &PP, + const cross_tu::CrossTranslationUnitContext &CTU) { + C.emplace_back(new TextDiagnostics(PP.getDiagnostics(), PP.getLangOpts(), + /*ShouldIncludePath*/ true, AnalyzerOpts)); +} + +void ento::createTextMinimalPathDiagnosticConsumer( + AnalyzerOptions &AnalyzerOpts, PathDiagnosticConsumers &C, + const std::string &Prefix, const clang::Preprocessor &PP, + const cross_tu::CrossTranslationUnitContext &CTU) { + C.emplace_back(new TextDiagnostics(PP.getDiagnostics(), PP.getLangOpts(), + /*ShouldIncludePath*/ false, + AnalyzerOpts)); +} diff --git a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp index a908aede68bb9..2e3aa0669061f 100644 --- a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp +++ b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp @@ -34,8 +34,6 @@ #include "clang/StaticAnalyzer/Core/PathSensitive/AnalysisManager.h" #include "clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h" #include "clang/StaticAnalyzer/Frontend/CheckerRegistration.h" -#include "clang/Tooling/Core/Replacement.h" -#include "clang/Tooling/Tooling.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/FileSystem.h" @@ -49,7 +47,6 @@ using namespace clang; using namespace ento; -using namespace tooling; #define DEBUG_TYPE "AnalysisConsumer" @@ -64,126 +61,6 @@ STATISTIC(NumVisitedBlocksInAnalyzedFunctions, STATISTIC(PercentReachableBlocks, "The % of reachable basic blocks."); STATISTIC(MaxCFGSize, "The maximum number of basic blocks in a function."); -//===----------------------------------------------------------------------===// -// Special PathDiagnosticConsumers.
-//===----------------------------------------------------------------------===// - -void ento::createPlistHTMLDiagnosticConsumer( - AnalyzerOptions &AnalyzerOpts, PathDiagnosticConsumers &C, - const std::string &prefix, const Preprocessor &PP, - const cross_tu::CrossTranslationUnitContext &CTU) { - createHTMLDiagnosticConsumer( - AnalyzerOpts, C, std::string(llvm::sys::path::parent_path(prefix)), PP, - CTU); - createPlistMultiFileDiagnosticConsumer(AnalyzerOpts, C, prefix, PP, CTU); -} - -void ento::createTextPathDiagnosticConsumer( - AnalyzerOptions &AnalyzerOpts, PathDiagnosticConsumers &C, - const std::string &Prefix, const clang::Preprocessor &PP, - const cross_tu::CrossTranslationUnitContext &CTU) { - llvm_unreachable("'text' consumer should be enabled on ClangDiags"); -} - -namespace { -class ClangDiagPathDiagConsumer : public PathDiagnosticConsumer { - DiagnosticsEngine &Diag; - LangOptions LO; - - bool IncludePath = false; - bool ShouldEmitAsError = false; - bool ApplyFixIts = false; - -public: - ClangDiagPathDiagConsumer(DiagnosticsEngine &Diag, LangOptions LO) - : Diag(Diag), LO(LO) {} - ~ClangDiagPathDiagConsumer() override {} - StringRef getName() const override { return "ClangDiags"; } - - bool supportsLogicalOpControlFlow() const override { return true; } - bool supportsCrossFileDiagnostics() const override { return true; } - - PathGenerationScheme getGenerationScheme() const override { - return IncludePath ? Minimal : None; - } - - void enablePaths() { IncludePath = true; } - void enableWerror() { ShouldEmitAsError = true; } - void enableApplyFixIts() { ApplyFixIts = true; } - - void FlushDiagnosticsImpl(std::vector &Diags, - FilesMade *filesMade) override { - unsigned WarnID = - ShouldEmitAsError - ? Diag.getCustomDiagID(DiagnosticsEngine::Error, "%0") - : Diag.getCustomDiagID(DiagnosticsEngine::Warning, "%0"); - unsigned NoteID = Diag.getCustomDiagID(DiagnosticsEngine::Note, "%0"); - SourceManager &SM = Diag.getSourceManager(); - - Replacements Repls; - auto reportPiece = [&](unsigned ID, FullSourceLoc Loc, StringRef String, - ArrayRef Ranges, - ArrayRef Fixits) { - if (!ApplyFixIts) { - Diag.Report(Loc, ID) << String << Ranges << Fixits; - return; - } - - Diag.Report(Loc, ID) << String << Ranges; - for (const FixItHint &Hint : Fixits) { - Replacement Repl(SM, Hint.RemoveRange, Hint.CodeToInsert); - - if (llvm::Error Err = Repls.add(Repl)) { - llvm::errs() << "Error applying replacement " << Repl.toString() - << ": " << Err << "\n"; - } - } - }; - - for (std::vector::iterator I = Diags.begin(), - E = Diags.end(); - I != E; ++I) { - const PathDiagnostic *PD = *I; - reportPiece(WarnID, PD->getLocation().asLocation(), - PD->getShortDescription(), PD->path.back()->getRanges(), - PD->path.back()->getFixits()); - - // First, add extra notes, even if paths should not be included. - for (const auto &Piece : PD->path) { - if (!isa(Piece.get())) - continue; - - reportPiece(NoteID, Piece->getLocation().asLocation(), - Piece->getString(), Piece->getRanges(), Piece->getFixits()); - } - - if (!IncludePath) - continue; - - // Then, add the path notes if necessary. 
- PathPieces FlatPath = PD->path.flatten(/*ShouldFlattenMacros=*/true); - for (const auto &Piece : FlatPath) { - if (isa(Piece.get())) - continue; - - reportPiece(NoteID, Piece->getLocation().asLocation(), - Piece->getString(), Piece->getRanges(), Piece->getFixits()); - } - } - - if (!ApplyFixIts || Repls.empty()) - return; - - Rewriter Rewrite(SM, LO); - if (!applyAllReplacements(Repls, Rewrite)) { - llvm::errs() << "An error occured during applying fix-it.\n"; - } - - Rewrite.overwriteChangedFiles(); - } -}; -} // end anonymous namespace - //===----------------------------------------------------------------------===// // AnalysisConsumer declaration. //===----------------------------------------------------------------------===// @@ -269,31 +146,16 @@ class AnalysisConsumer : public AnalysisASTConsumer, } void DigestAnalyzerOptions() { - if (Opts->AnalysisDiagOpt != PD_NONE) { - // Create the PathDiagnosticConsumer. - ClangDiagPathDiagConsumer *clangDiags = - new ClangDiagPathDiagConsumer(PP.getDiagnostics(), PP.getLangOpts()); - PathConsumers.push_back(clangDiags); - - if (Opts->AnalyzerWerror) - clangDiags->enableWerror(); - - if (Opts->ShouldApplyFixIts) - clangDiags->enableApplyFixIts(); - - if (Opts->AnalysisDiagOpt == PD_TEXT) { - clangDiags->enablePaths(); - - } else if (!OutDir.empty()) { - switch (Opts->AnalysisDiagOpt) { - default: + switch (Opts->AnalysisDiagOpt) { + case PD_NONE: + break; #define ANALYSIS_DIAGNOSTICS(NAME, CMDFLAG, DESC, CREATEFN) \ case PD_##NAME: \ CREATEFN(*Opts.get(), PathConsumers, OutDir, PP, CTU); \ break; #include "clang/StaticAnalyzer/Core/Analyses.def" - } - } + default: + llvm_unreachable("Unkown analyzer output type!"); } // Create the analyzer component creators. @@ -329,20 +191,19 @@ class AnalysisConsumer : public AnalysisASTConsumer, else if (Mode == AM_Path) { llvm::errs() << " (Path, "; switch (IMode) { - case ExprEngine::Inline_Minimal: - llvm::errs() << " Inline_Minimal"; - break; - case ExprEngine::Inline_Regular: - llvm::errs() << " Inline_Regular"; - break; + case ExprEngine::Inline_Minimal: + llvm::errs() << " Inline_Minimal"; + break; + case ExprEngine::Inline_Regular: + llvm::errs() << " Inline_Regular"; + break; } llvm::errs() << ")"; - } - else + } else assert(Mode == (AM_Syntax | AM_Path) && "Unexpected mode!"); - llvm::errs() << ": " << Loc.getFilename() << ' ' - << getFunctionName(D) << '\n'; + llvm::errs() << ": " << Loc.getFilename() << ' ' << getFunctionName(D) + << '\n'; } } @@ -485,7 +346,7 @@ class AnalysisConsumer : public AnalysisASTConsumer, /// Print \p S to stderr if \c Opts->AnalyzerDisplayProgress is set. 
void reportAnalyzerProgress(StringRef S); -}; +}; // namespace } // end anonymous namespace diff --git a/clang/lib/StaticAnalyzer/Frontend/CMakeLists.txt b/clang/lib/StaticAnalyzer/Frontend/CMakeLists.txt index 6f1151ab0c111..5e7dd8f18cd73 100644 --- a/clang/lib/StaticAnalyzer/Frontend/CMakeLists.txt +++ b/clang/lib/StaticAnalyzer/Frontend/CMakeLists.txt @@ -21,6 +21,4 @@ add_clang_library(clangStaticAnalyzerFrontend clangLex clangStaticAnalyzerCheckers clangStaticAnalyzerCore - clangRewrite - clangToolingCore ) diff --git a/clang/lib/Tooling/Syntax/BuildTree.cpp b/clang/lib/Tooling/Syntax/BuildTree.cpp index 9ebf7d29d8eda..82c87ba02b744 100644 --- a/clang/lib/Tooling/Syntax/BuildTree.cpp +++ b/clang/lib/Tooling/Syntax/BuildTree.cpp @@ -18,12 +18,15 @@ #include "clang/Basic/LLVM.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/SourceManager.h" +#include "clang/Basic/Specifiers.h" #include "clang/Basic/TokenKinds.h" #include "clang/Lex/Lexer.h" #include "clang/Tooling/Syntax/Nodes.h" #include "clang/Tooling/Syntax/Tokens.h" #include "clang/Tooling/Syntax/Tree.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallVector.h" @@ -33,6 +36,7 @@ #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" +#include #include using namespace clang; @@ -144,6 +148,30 @@ static SourceRange getDeclaratorRange(const SourceManager &SM, TypeLoc T, return SourceRange(Start, End); } +namespace { +/// All AST hierarchy roots that can be represented as pointers. +using ASTPtr = llvm::PointerUnion; +/// Maintains a mapping from AST to syntax tree nodes. This class will get more +/// complicated as we support more kinds of AST nodes, e.g. TypeLocs. +/// FIXME: expose this as public API. +class ASTToSyntaxMapping { +public: + void add(ASTPtr From, syntax::Tree *To) { + assert(To != nullptr); + assert(!From.isNull()); + + bool Added = Nodes.insert({From, To}).second; + (void)Added; + assert(Added && "mapping added twice"); + } + + syntax::Tree *find(ASTPtr P) const { return Nodes.lookup(P); } + +private: + llvm::DenseMap Nodes; +}; +} // namespace + /// A helper class for constructing the syntax tree while traversing a clang /// AST. /// @@ -171,7 +199,18 @@ class syntax::TreeBuilder { /// Populate children for \p New node, assuming it covers tokens from \p /// Range. - void foldNode(llvm::ArrayRef Range, syntax::Tree *New); + void foldNode(llvm::ArrayRef Range, syntax::Tree *New, + ASTPtr From) { + assert(New); + Pending.foldChildren(Arena, Range, New); + if (From) + Mapping.add(From, New); + } + void foldNode(llvm::ArrayRef Range, syntax::Tree *New, + TypeLoc L) { + // FIXME: add mapping for TypeLocs + foldNode(Range, New, nullptr); + } /// Must be called with the range of each `DeclaratorDecl`. Ensures the /// corresponding declarator nodes are covered by `SimpleDeclaration`. @@ -189,16 +228,20 @@ class syntax::TreeBuilder { /// Should be called for expressions in non-statement position to avoid /// wrapping into expression statement. void markExprChild(Expr *Child, NodeRole Role); - /// Set role for a token starting at \p Loc. void markChildToken(SourceLocation Loc, NodeRole R); /// Set role for \p T. void markChildToken(const syntax::Token *T, NodeRole R); - /// Set role for the node that spans exactly \p Range. - void markChild(llvm::ArrayRef Range, NodeRole R); + /// Set role for \p N. 
+ void markChild(syntax::Node *N, NodeRole R); + /// Set role for the syntax node matching \p N. + void markChild(ASTPtr N, NodeRole R); /// Set role for the delayed node that spans exactly \p Range. void markDelayedChild(llvm::ArrayRef Range, NodeRole R); + /// Set role for the node that may or may not be delayed. Node must span + /// exactly \p Range. + void markMaybeDelayedChild(llvm::ArrayRef Range, NodeRole R); /// Finish building the tree and consume the root node. syntax::TranslationUnit *finalize() && { @@ -215,8 +258,16 @@ class syntax::TreeBuilder { return TU; } - /// getRange() finds the syntax tokens corresponding to the passed source - /// locations. + /// Finds a token starting at \p L. The token must exist if \p L is valid. + const syntax::Token *findToken(SourceLocation L) const; + + /// Finds the syntax tokens corresponding to the \p SourceRange. + llvm::ArrayRef getRange(SourceRange Range) const { + assert(Range.isValid()); + return getRange(Range.getBegin(), Range.getEnd()); + } + + /// Finds the syntax tokens corresponding to the passed source locations. /// \p First is the start position of the first token and \p Last is the start /// position of the last token. llvm::ArrayRef getRange(SourceLocation First, @@ -227,23 +278,29 @@ class syntax::TreeBuilder { Arena.sourceManager().isBeforeInTranslationUnit(First, Last)); return llvm::makeArrayRef(findToken(First), std::next(findToken(Last))); } - llvm::ArrayRef getRange(const Decl *D) const { - auto Tokens = getRange(D->getBeginLoc(), D->getEndLoc()); - if (llvm::isa(D)) - return Tokens; - if (DeclsWithoutSemicolons.count(D)) - return Tokens; - // FIXME: do not consume trailing semicolon on function definitions. - // Most declarations own a semicolon in syntax trees, but not in clang AST. - return withTrailingSemicolon(Tokens); + + llvm::ArrayRef + getTemplateRange(const ClassTemplateSpecializationDecl *D) const { + auto Tokens = getRange(D->getSourceRange()); + return maybeAppendSemicolon(Tokens, D); + } + + llvm::ArrayRef getDeclRange(const Decl *D) const { + llvm::ArrayRef Tokens; + // We want to drop the template parameters for specializations. + if (const auto *S = llvm::dyn_cast(D)) + Tokens = getRange(S->TypeDecl::getBeginLoc(), S->getEndLoc()); + else + Tokens = getRange(D->getSourceRange()); + return maybeAppendSemicolon(Tokens, D); } llvm::ArrayRef getExprRange(const Expr *E) const { - return getRange(E->getBeginLoc(), E->getEndLoc()); + return getRange(E->getSourceRange()); } /// Find the adjusted range for the statement, consuming the trailing /// semicolon when needed. llvm::ArrayRef getStmtRange(const Stmt *S) const { - auto Tokens = getRange(S->getBeginLoc(), S->getEndLoc()); + auto Tokens = getRange(S->getSourceRange()); if (isa(S)) return Tokens; @@ -255,6 +312,18 @@ class syntax::TreeBuilder { } private: + llvm::ArrayRef + maybeAppendSemicolon(llvm::ArrayRef Tokens, + const Decl *D) const { + if (llvm::isa(D)) + return Tokens; + if (DeclsWithoutSemicolons.count(D)) + return Tokens; + // FIXME: do not consume trailing semicolon on function definitions. + // Most declarations own a semicolon in syntax trees, but not in clang AST. + return withTrailingSemicolon(Tokens); + } + llvm::ArrayRef withTrailingSemicolon(llvm::ArrayRef Tokens) const { assert(!Tokens.empty()); @@ -265,8 +334,10 @@ class syntax::TreeBuilder { return Tokens; } - /// Finds a token starting at \p L. The token must exist. 
- const syntax::Token *findToken(SourceLocation L) const; + void setRole(syntax::Node *N, NodeRole R) { + assert(N->role() == NodeRole::Detached); + N->setRole(R); + } /// A collection of trees covering the input tokens. /// When created, each tree corresponds to a single token in the file. @@ -284,7 +355,7 @@ class syntax::TreeBuilder { auto *L = new (A.allocator()) syntax::Leaf(&T); L->Original = true; L->CanModify = A.tokenBuffer().spelledForExpanded(T).hasValue(); - Trees.insert(Trees.end(), {&T, NodeAndRole{L}}); + Trees.insert(Trees.end(), {&T, L}); } } @@ -298,6 +369,15 @@ class syntax::TreeBuilder { It->second.Role = Role; } + void assignRoleMaybeDelayed(llvm::ArrayRef Range, + syntax::NodeRole Role) { + auto It = DelayedFolds.find(Range.begin()); + if (It == DelayedFolds.end()) + return assignRole(Range, Role); + assert(It->second.End == Range.end()); + It->second.Role = Role; + } + void assignRole(llvm::ArrayRef Range, syntax::NodeRole Role) { assert(!Range.empty()); @@ -307,7 +387,9 @@ class syntax::TreeBuilder { assert((std::next(It) == Trees.end() || std::next(It)->first == Range.end()) && "no child with the specified range"); - It->second.Role = Role; + assert(It->second->role() == NodeRole::Detached && + "re-assigning role for a child"); + It->second->setRole(Role); } /// Add \p Node to the forest and attach child nodes based on \p Tokens. @@ -363,7 +445,7 @@ class syntax::TreeBuilder { // EXPECTS: all tokens were consumed and are owned by a single root node. syntax::Node *finalize() && { assert(Trees.size() == 1); - auto *Root = Trees.begin()->second.Node; + auto *Root = Trees.begin()->second; Trees = {}; return Root; } @@ -377,9 +459,9 @@ class syntax::TreeBuilder { : A.tokenBuffer().expandedTokens().end() - It->first; R += std::string(llvm::formatv( - "- '{0}' covers '{1}'+{2} tokens\n", It->second.Node->kind(), + "- '{0}' covers '{1}'+{2} tokens\n", It->second->kind(), It->first->text(A.sourceManager()), CoveredTokens)); - R += It->second.Node->dump(A); + R += It->second->dump(A); } return R; } @@ -403,32 +485,25 @@ class syntax::TreeBuilder { "fold crosses boundaries of existing subtrees"); // We need to go in reverse order, because we can only prepend. - for (auto It = EndChildren; It != BeginChildren; --It) - Node->prependChildLowLevel(std::prev(It)->second.Node, - std::prev(It)->second.Role); + for (auto It = EndChildren; It != BeginChildren; --It) { + auto *C = std::prev(It)->second; + if (C->role() == NodeRole::Detached) + C->setRole(NodeRole::Unknown); + Node->prependChildLowLevel(C); + } // Mark that this node came from the AST and is backed by the source code. Node->Original = true; Node->CanModify = A.tokenBuffer().spelledForExpanded(Tokens).hasValue(); Trees.erase(BeginChildren, EndChildren); - Trees.insert({FirstToken, NodeAndRole(Node)}); + Trees.insert({FirstToken, Node}); } - /// A with a role that should be assigned to it when adding to a parent. - struct NodeAndRole { - explicit NodeAndRole(syntax::Node *Node) - : Node(Node), Role(NodeRole::Unknown) {} - - syntax::Node *Node; - NodeRole Role; - }; /// Maps from the start token to a subtree starting at that token. /// Keys in the map are pointers into the array of expanded tokens, so /// pointer order corresponds to the order of preprocessor tokens. - /// FIXME: storing the end tokens is redundant. - /// FIXME: the key of a map is redundant, it is also stored in NodeForRange. - std::map Trees; + std::map Trees; /// See documentation of `foldChildrenDelayed` for details. 
struct DelayedFold { @@ -448,6 +523,7 @@ class syntax::TreeBuilder { LocationToToken; Forest Pending; llvm::DenseSet DeclsWithoutSemicolons; + ASTToSyntaxMapping Mapping; }; namespace { @@ -460,7 +536,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { bool WalkUpFromDeclaratorDecl(DeclaratorDecl *DD) { // Ensure declarators are covered by SimpleDeclaration. - Builder.noticeDeclRange(Builder.getRange(DD)); + Builder.noticeDeclRange(Builder.getDeclRange(DD)); // Build the declarator node. SourceRange Initializer; @@ -474,10 +550,9 @@ class BuildTreeVisitor : public RecursiveASTVisitor { Builder.sourceManager(), DD->getTypeSourceInfo()->getTypeLoc(), getQualifiedNameStart(DD), Initializer); if (Declarator.isValid()) { - auto Tokens = - Builder.getRange(Declarator.getBegin(), Declarator.getEnd()); - Builder.foldNode(Tokens, new (allocator()) syntax::SimpleDeclarator); - Builder.markChild(Tokens, syntax::NodeRole::SimpleDeclaration_declarator); + auto *N = new (allocator()) syntax::SimpleDeclarator; + Builder.foldNode(Builder.getRange(Declarator), N, DD); + Builder.markChild(N, syntax::NodeRole::SimpleDeclaration_declarator); } return true; @@ -485,37 +560,83 @@ class BuildTreeVisitor : public RecursiveASTVisitor { bool WalkUpFromTypedefNameDecl(TypedefNameDecl *D) { // Ensure declarators are covered by SimpleDeclaration. - Builder.noticeDeclRange(Builder.getRange(D)); + Builder.noticeDeclRange(Builder.getDeclRange(D)); auto R = getDeclaratorRange( Builder.sourceManager(), D->getTypeSourceInfo()->getTypeLoc(), /*Name=*/D->getLocation(), /*Initializer=*/SourceRange()); if (R.isValid()) { - auto Tokens = Builder.getRange(R.getBegin(), R.getEnd()); - Builder.foldNode(Tokens, new (allocator()) syntax::SimpleDeclarator); - Builder.markChild(Tokens, syntax::NodeRole::SimpleDeclaration_declarator); + auto *N = new (allocator()) syntax::SimpleDeclarator; + Builder.foldNode(Builder.getRange(R), N, D); + Builder.markChild(N, syntax::NodeRole::SimpleDeclaration_declarator); } return true; } bool VisitDecl(Decl *D) { assert(!D->isImplicit()); - Builder.foldNode(Builder.getRange(D), - new (allocator()) syntax::UnknownDeclaration()); + Builder.foldNode(Builder.getDeclRange(D), + new (allocator()) syntax::UnknownDeclaration(), D); + return true; + } + + // RAV does not call WalkUpFrom* on explicit instantiations, so we have to + // override Traverse. + // FIXME: make RAV call WalkUpFrom* instead. + bool + TraverseClassTemplateSpecializationDecl(ClassTemplateSpecializationDecl *C) { + if (!RecursiveASTVisitor::TraverseClassTemplateSpecializationDecl(C)) + return false; + if (C->isExplicitSpecialization()) + return true; // we are only interested in explicit instantiations. + auto *Declaration = + cast(handleFreeStandingTagDecl(C)); + foldExplicitTemplateInstantiation( + Builder.getTemplateRange(C), Builder.findToken(C->getExternLoc()), + Builder.findToken(C->getTemplateKeywordLoc()), Declaration, C); + return true; + } + + bool WalkUpFromTemplateDecl(TemplateDecl *S) { + foldTemplateDeclaration( + Builder.getDeclRange(S), + Builder.findToken(S->getTemplateParameters()->getTemplateLoc()), + Builder.getDeclRange(S->getTemplatedDecl()), S); return true; } bool WalkUpFromTagDecl(TagDecl *C) { // FIXME: build the ClassSpecifier node. - if (C->isFreeStanding()) { - // Class is a declaration specifier and needs a spanning declaration node. 
- Builder.foldNode(Builder.getRange(C), - new (allocator()) syntax::SimpleDeclaration); + if (!C->isFreeStanding()) { + assert(C->getNumTemplateParameterLists() == 0); return true; } + handleFreeStandingTagDecl(C); return true; } + syntax::Declaration *handleFreeStandingTagDecl(TagDecl *C) { + assert(C->isFreeStanding()); + // Class is a declaration specifier and needs a spanning declaration node. + auto DeclarationRange = Builder.getDeclRange(C); + syntax::Declaration *Result = new (allocator()) syntax::SimpleDeclaration; + Builder.foldNode(DeclarationRange, Result, nullptr); + + // Build TemplateDeclaration nodes if we had template parameters. + auto ConsumeTemplateParameters = [&](const TemplateParameterList &L) { + const auto *TemplateKW = Builder.findToken(L.getTemplateLoc()); + auto R = llvm::makeArrayRef(TemplateKW, DeclarationRange.end()); + Result = + foldTemplateDeclaration(R, TemplateKW, DeclarationRange, nullptr); + DeclarationRange = R; + }; + if (auto *S = llvm::dyn_cast(C)) + ConsumeTemplateParameters(*S->getTemplateParameters()); + for (unsigned I = C->getNumTemplateParameterLists(); 0 < I; --I) + ConsumeTemplateParameters(*C->getTemplateParameterList(I - 1)); + return Result; + } + bool WalkUpFromTranslationUnitDecl(TranslationUnitDecl *TU) { // We do not want to call VisitDecl(), the declaration for translation // unit is built by finalize(). @@ -531,14 +652,14 @@ class BuildTreeVisitor : public RecursiveASTVisitor { Builder.markChildToken(S->getRBracLoc(), NodeRole::CloseParen); Builder.foldNode(Builder.getStmtRange(S), - new (allocator()) syntax::CompoundStatement); + new (allocator()) syntax::CompoundStatement, S); return true; } // Some statements are not yet handled by syntax trees. bool WalkUpFromStmt(Stmt *S) { Builder.foldNode(Builder.getStmtRange(S), - new (allocator()) syntax::UnknownStatement); + new (allocator()) syntax::UnknownStatement, S); return true; } @@ -576,19 +697,19 @@ class BuildTreeVisitor : public RecursiveASTVisitor { bool WalkUpFromExpr(Expr *E) { assert(!isImplicitExpr(E) && "should be handled by TraverseStmt"); Builder.foldNode(Builder.getExprRange(E), - new (allocator()) syntax::UnknownExpression); + new (allocator()) syntax::UnknownExpression, E); return true; } bool WalkUpFromNamespaceDecl(NamespaceDecl *S) { - auto Tokens = Builder.getRange(S); + auto Tokens = Builder.getDeclRange(S); if (Tokens.front().kind() == tok::coloncolon) { // Handle nested namespace definitions. Those start at '::' token, e.g. // namespace a^::b {} // FIXME: build corresponding nodes for the name of this namespace. 
return true; } - Builder.foldNode(Tokens, new (allocator()) syntax::NamespaceDefinition); + Builder.foldNode(Tokens, new (allocator()) syntax::NamespaceDefinition, S); return true; } @@ -603,7 +724,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { Builder.markChildToken(L.getLParenLoc(), syntax::NodeRole::OpenParen); Builder.markChildToken(L.getRParenLoc(), syntax::NodeRole::CloseParen); Builder.foldNode(Builder.getRange(L.getLParenLoc(), L.getRParenLoc()), - new (allocator()) syntax::ParenDeclarator); + new (allocator()) syntax::ParenDeclarator, L); return true; } @@ -614,7 +735,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { syntax::NodeRole::ArraySubscript_sizeExpression); Builder.markChildToken(L.getRBracketLoc(), syntax::NodeRole::CloseParen); Builder.foldNode(Builder.getRange(L.getLBracketLoc(), L.getRBracketLoc()), - new (allocator()) syntax::ArraySubscript); + new (allocator()) syntax::ArraySubscript, L); return true; } @@ -622,11 +743,11 @@ class BuildTreeVisitor : public RecursiveASTVisitor { Builder.markChildToken(L.getLParenLoc(), syntax::NodeRole::OpenParen); for (auto *P : L.getParams()) Builder.markDelayedChild( - Builder.getRange(P), + Builder.getDeclRange(P), syntax::NodeRole::ParametersAndQualifiers_parameter); Builder.markChildToken(L.getRParenLoc(), syntax::NodeRole::CloseParen); Builder.foldNode(Builder.getRange(L.getLParenLoc(), L.getEndLoc()), - new (allocator()) syntax::ParametersAndQualifiers); + new (allocator()) syntax::ParametersAndQualifiers, L); return true; } @@ -643,8 +764,8 @@ class BuildTreeVisitor : public RecursiveASTVisitor { bool WalkUpFromMemberPointerTypeLoc(MemberPointerTypeLoc L) { auto SR = L.getLocalSourceRange(); - Builder.foldNode(Builder.getRange(SR.getBegin(), SR.getEnd()), - new (allocator()) syntax::MemberPointer); + Builder.foldNode(Builder.getRange(SR), + new (allocator()) syntax::MemberPointer, L); return true; } @@ -653,13 +774,13 @@ class BuildTreeVisitor : public RecursiveASTVisitor { // and fold resulting nodes. 
bool WalkUpFromDeclStmt(DeclStmt *S) { Builder.foldNode(Builder.getStmtRange(S), - new (allocator()) syntax::DeclarationStatement); + new (allocator()) syntax::DeclarationStatement, S); return true; } bool WalkUpFromNullStmt(NullStmt *S) { Builder.foldNode(Builder.getStmtRange(S), - new (allocator()) syntax::EmptyStatement); + new (allocator()) syntax::EmptyStatement, S); return true; } @@ -668,7 +789,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { syntax::NodeRole::IntroducerKeyword); Builder.markStmtChild(S->getBody(), syntax::NodeRole::BodyStatement); Builder.foldNode(Builder.getStmtRange(S), - new (allocator()) syntax::SwitchStatement); + new (allocator()) syntax::SwitchStatement, S); return true; } @@ -678,7 +799,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { Builder.markExprChild(S->getLHS(), syntax::NodeRole::CaseStatement_value); Builder.markStmtChild(S->getSubStmt(), syntax::NodeRole::BodyStatement); Builder.foldNode(Builder.getStmtRange(S), - new (allocator()) syntax::CaseStatement); + new (allocator()) syntax::CaseStatement, S); return true; } @@ -687,7 +808,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { syntax::NodeRole::IntroducerKeyword); Builder.markStmtChild(S->getSubStmt(), syntax::NodeRole::BodyStatement); Builder.foldNode(Builder.getStmtRange(S), - new (allocator()) syntax::DefaultStatement); + new (allocator()) syntax::DefaultStatement, S); return true; } @@ -700,7 +821,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { Builder.markStmtChild(S->getElse(), syntax::NodeRole::IfStatement_elseStatement); Builder.foldNode(Builder.getStmtRange(S), - new (allocator()) syntax::IfStatement); + new (allocator()) syntax::IfStatement, S); return true; } @@ -708,7 +829,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { Builder.markChildToken(S->getForLoc(), syntax::NodeRole::IntroducerKeyword); Builder.markStmtChild(S->getBody(), syntax::NodeRole::BodyStatement); Builder.foldNode(Builder.getStmtRange(S), - new (allocator()) syntax::ForStatement); + new (allocator()) syntax::ForStatement, S); return true; } @@ -717,7 +838,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { syntax::NodeRole::IntroducerKeyword); Builder.markStmtChild(S->getBody(), syntax::NodeRole::BodyStatement); Builder.foldNode(Builder.getStmtRange(S), - new (allocator()) syntax::WhileStatement); + new (allocator()) syntax::WhileStatement, S); return true; } @@ -725,7 +846,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { Builder.markChildToken(S->getContinueLoc(), syntax::NodeRole::IntroducerKeyword); Builder.foldNode(Builder.getStmtRange(S), - new (allocator()) syntax::ContinueStatement); + new (allocator()) syntax::ContinueStatement, S); return true; } @@ -733,7 +854,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { Builder.markChildToken(S->getBreakLoc(), syntax::NodeRole::IntroducerKeyword); Builder.foldNode(Builder.getStmtRange(S), - new (allocator()) syntax::BreakStatement); + new (allocator()) syntax::BreakStatement, S); return true; } @@ -743,7 +864,7 @@ class BuildTreeVisitor : public RecursiveASTVisitor { Builder.markExprChild(S->getRetValue(), syntax::NodeRole::ReturnStatement_value); Builder.foldNode(Builder.getStmtRange(S), - new (allocator()) syntax::ReturnStatement); + new (allocator()) syntax::ReturnStatement, S); return true; } @@ -751,13 +872,13 @@ class BuildTreeVisitor : public RecursiveASTVisitor { Builder.markChildToken(S->getForLoc(), syntax::NodeRole::IntroducerKeyword); Builder.markStmtChild(S->getBody(), 
syntax::NodeRole::BodyStatement); Builder.foldNode(Builder.getStmtRange(S), - new (allocator()) syntax::RangeBasedForStatement); + new (allocator()) syntax::RangeBasedForStatement, S); return true; } bool WalkUpFromEmptyDecl(EmptyDecl *S) { - Builder.foldNode(Builder.getRange(S), - new (allocator()) syntax::EmptyDeclaration); + Builder.foldNode(Builder.getDeclRange(S), + new (allocator()) syntax::EmptyDeclaration, S); return true; } @@ -766,56 +887,57 @@ class BuildTreeVisitor : public RecursiveASTVisitor { syntax::NodeRole::StaticAssertDeclaration_condition); Builder.markExprChild(S->getMessage(), syntax::NodeRole::StaticAssertDeclaration_message); - Builder.foldNode(Builder.getRange(S), - new (allocator()) syntax::StaticAssertDeclaration); + Builder.foldNode(Builder.getDeclRange(S), + new (allocator()) syntax::StaticAssertDeclaration, S); return true; } bool WalkUpFromLinkageSpecDecl(LinkageSpecDecl *S) { - Builder.foldNode(Builder.getRange(S), - new (allocator()) syntax::LinkageSpecificationDeclaration); + Builder.foldNode(Builder.getDeclRange(S), + new (allocator()) syntax::LinkageSpecificationDeclaration, + S); return true; } bool WalkUpFromNamespaceAliasDecl(NamespaceAliasDecl *S) { - Builder.foldNode(Builder.getRange(S), - new (allocator()) syntax::NamespaceAliasDefinition); + Builder.foldNode(Builder.getDeclRange(S), + new (allocator()) syntax::NamespaceAliasDefinition, S); return true; } bool WalkUpFromUsingDirectiveDecl(UsingDirectiveDecl *S) { - Builder.foldNode(Builder.getRange(S), - new (allocator()) syntax::UsingNamespaceDirective); + Builder.foldNode(Builder.getDeclRange(S), + new (allocator()) syntax::UsingNamespaceDirective, S); return true; } bool WalkUpFromUsingDecl(UsingDecl *S) { - Builder.foldNode(Builder.getRange(S), - new (allocator()) syntax::UsingDeclaration); + Builder.foldNode(Builder.getDeclRange(S), + new (allocator()) syntax::UsingDeclaration, S); return true; } bool WalkUpFromUnresolvedUsingValueDecl(UnresolvedUsingValueDecl *S) { - Builder.foldNode(Builder.getRange(S), - new (allocator()) syntax::UsingDeclaration); + Builder.foldNode(Builder.getDeclRange(S), + new (allocator()) syntax::UsingDeclaration, S); return true; } bool WalkUpFromUnresolvedUsingTypenameDecl(UnresolvedUsingTypenameDecl *S) { - Builder.foldNode(Builder.getRange(S), - new (allocator()) syntax::UsingDeclaration); + Builder.foldNode(Builder.getDeclRange(S), + new (allocator()) syntax::UsingDeclaration, S); return true; } bool WalkUpFromTypeAliasDecl(TypeAliasDecl *S) { - Builder.foldNode(Builder.getRange(S), - new (allocator()) syntax::TypeAliasDeclaration); + Builder.foldNode(Builder.getDeclRange(S), + new (allocator()) syntax::TypeAliasDeclaration, S); return true; } private: /// Returns the range of the built node. 
- llvm::ArrayRef BuildTrailingReturn(FunctionProtoTypeLoc L) { + syntax::TrailingReturnType *BuildTrailingReturn(FunctionProtoTypeLoc L) { assert(L.getTypePtr()->hasTrailingReturn()); auto ReturnedType = L.getReturnLoc(); @@ -824,27 +946,58 @@ class BuildTreeVisitor : public RecursiveASTVisitor { getDeclaratorRange(this->Builder.sourceManager(), ReturnedType, /*Name=*/SourceLocation(), /*Initializer=*/SourceLocation()); - llvm::ArrayRef ReturnDeclaratorTokens; + syntax::SimpleDeclarator *ReturnDeclarator = nullptr; if (ReturnDeclaratorRange.isValid()) { - ReturnDeclaratorTokens = Builder.getRange( - ReturnDeclaratorRange.getBegin(), ReturnDeclaratorRange.getEnd()); - Builder.foldNode(ReturnDeclaratorTokens, - new (allocator()) syntax::SimpleDeclarator); + ReturnDeclarator = new (allocator()) syntax::SimpleDeclarator; + Builder.foldNode(Builder.getRange(ReturnDeclaratorRange), + ReturnDeclarator, nullptr); } // Build node for trailing return type. - auto Return = - Builder.getRange(ReturnedType.getBeginLoc(), ReturnedType.getEndLoc()); + auto Return = Builder.getRange(ReturnedType.getSourceRange()); const auto *Arrow = Return.begin() - 1; assert(Arrow->kind() == tok::arrow); auto Tokens = llvm::makeArrayRef(Arrow, Return.end()); Builder.markChildToken(Arrow, syntax::NodeRole::TrailingReturnType_arrow); - if (!ReturnDeclaratorTokens.empty()) - Builder.markChild(ReturnDeclaratorTokens, + if (ReturnDeclarator) + Builder.markChild(ReturnDeclarator, syntax::NodeRole::TrailingReturnType_declarator); - Builder.foldNode(Tokens, new (allocator()) syntax::TrailingReturnType); - return Tokens; + auto *R = new (allocator()) syntax::TrailingReturnType; + Builder.foldNode(Tokens, R, nullptr); + return R; + } + + void foldExplicitTemplateInstantiation( + ArrayRef Range, const syntax::Token *ExternKW, + const syntax::Token *TemplateKW, + syntax::SimpleDeclaration *InnerDeclaration, Decl *From) { + assert(!ExternKW || ExternKW->kind() == tok::kw_extern); + assert(TemplateKW && TemplateKW->kind() == tok::kw_template); + Builder.markChildToken( + ExternKW, + syntax::NodeRole::ExplicitTemplateInstantiation_externKeyword); + Builder.markChildToken(TemplateKW, syntax::NodeRole::IntroducerKeyword); + Builder.markChild( + InnerDeclaration, + syntax::NodeRole::ExplicitTemplateInstantiation_declaration); + Builder.foldNode( + Range, new (allocator()) syntax::ExplicitTemplateInstantiation, From); + } + + syntax::TemplateDeclaration *foldTemplateDeclaration( + ArrayRef Range, const syntax::Token *TemplateKW, + ArrayRef TemplatedDeclaration, Decl *From) { + assert(TemplateKW && TemplateKW->kind() == tok::kw_template); + Builder.markChildToken(TemplateKW, syntax::NodeRole::IntroducerKeyword); + Builder.markMaybeDelayedChild( + TemplatedDeclaration, + syntax::NodeRole::TemplateDeclaration_declaration); + + auto *N = new (allocator()) syntax::TemplateDeclaration; + Builder.foldNode(Range, N, From); + return N; } + /// A small helper to save some typing. 
llvm::BumpPtrAllocator &allocator() { return Builder.allocator(); } @@ -853,11 +1006,6 @@ class BuildTreeVisitor : public RecursiveASTVisitor { }; } // namespace -void syntax::TreeBuilder::foldNode(llvm::ArrayRef Range, - syntax::Tree *New) { - Pending.foldChildren(Arena, Range, New); -} - void syntax::TreeBuilder::noticeDeclRange(llvm::ArrayRef Range) { if (Pending.extendDelayedFold(Range)) return; @@ -881,9 +1029,15 @@ void syntax::TreeBuilder::markChildToken(const syntax::Token *T, NodeRole R) { Pending.assignRole(*T, R); } -void syntax::TreeBuilder::markChild(llvm::ArrayRef Range, - NodeRole R) { - Pending.assignRole(Range, R); +void syntax::TreeBuilder::markChild(syntax::Node *N, NodeRole R) { + assert(N); + setRole(N, R); +} + +void syntax::TreeBuilder::markChild(ASTPtr N, NodeRole R) { + auto *SN = Mapping.find(N); + assert(SN != nullptr); + setRole(SN, R); } void syntax::TreeBuilder::markDelayedChild(llvm::ArrayRef Range, @@ -891,31 +1045,42 @@ void syntax::TreeBuilder::markDelayedChild(llvm::ArrayRef Range, Pending.assignRoleDelayed(Range, R); } +void syntax::TreeBuilder::markMaybeDelayedChild( + llvm::ArrayRef Range, NodeRole R) { + Pending.assignRoleMaybeDelayed(Range, R); +} + void syntax::TreeBuilder::markStmtChild(Stmt *Child, NodeRole Role) { if (!Child) return; - auto Range = getStmtRange(Child); + syntax::Tree *ChildNode = Mapping.find(Child); + assert(ChildNode != nullptr); + // This is an expression in a statement position, consume the trailing // semicolon and form an 'ExpressionStatement' node. - if (auto *E = dyn_cast(Child)) { - Pending.assignRole(getExprRange(E), - NodeRole::ExpressionStatement_expression); - // 'getRange(Stmt)' ensures this already covers a trailing semicolon. - Pending.foldChildren(Arena, Range, - new (allocator()) syntax::ExpressionStatement); - } - Pending.assignRole(Range, Role); + if (isa(Child)) { + setRole(ChildNode, NodeRole::ExpressionStatement_expression); + ChildNode = new (allocator()) syntax::ExpressionStatement; + // (!) 'getStmtRange()' ensures this covers a trailing semicolon. 
+ Pending.foldChildren(Arena, getStmtRange(Child), ChildNode); + } + setRole(ChildNode, Role); } void syntax::TreeBuilder::markExprChild(Expr *Child, NodeRole Role) { if (!Child) return; + Child = Child->IgnoreImplicit(); - Pending.assignRole(getExprRange(Child), Role); + syntax::Tree *ChildNode = Mapping.find(Child); + assert(ChildNode != nullptr); + setRole(ChildNode, Role); } const syntax::Token *syntax::TreeBuilder::findToken(SourceLocation L) const { + if (L.isInvalid()) + return nullptr; auto It = LocationToToken.find(L.getRawEncoding()); assert(It != LocationToToken.end()); return It->second; diff --git a/clang/lib/Tooling/Syntax/Mutations.cpp b/clang/lib/Tooling/Syntax/Mutations.cpp index 72458528202e5..24048b297a112 100644 --- a/clang/lib/Tooling/Syntax/Mutations.cpp +++ b/clang/lib/Tooling/Syntax/Mutations.cpp @@ -35,7 +35,7 @@ class syntax::MutationsImpl { assert(!New->isDetached()); assert(Role != NodeRole::Detached); - New->Role = static_cast(Role); + New->setRole(Role); auto *P = Anchor->parent(); P->replaceChildRangeLowLevel(Anchor, Anchor, New); diff --git a/clang/lib/Tooling/Syntax/Nodes.cpp b/clang/lib/Tooling/Syntax/Nodes.cpp index 4f86007e39bb5..75f025e5f8536 100644 --- a/clang/lib/Tooling/Syntax/Nodes.cpp +++ b/clang/lib/Tooling/Syntax/Nodes.cpp @@ -58,6 +58,10 @@ llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, NodeKind K) { return OS << "LinkageSpecificationDeclaration"; case NodeKind::SimpleDeclaration: return OS << "SimpleDeclaration"; + case NodeKind::TemplateDeclaration: + return OS << "TemplateDeclaration"; + case NodeKind::ExplicitTemplateInstantiation: + return OS << "ExplicitTemplateInstantiation"; case NodeKind::NamespaceDefinition: return OS << "NamespaceDefinition"; case NodeKind::NamespaceAliasDefinition: @@ -118,6 +122,12 @@ llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, NodeRole R) { return OS << "StaticAssertDeclaration_message"; case syntax::NodeRole::SimpleDeclaration_declarator: return OS << "SimpleDeclaration_declarator"; + case syntax::NodeRole::TemplateDeclaration_declaration: + return OS << "TemplateDeclaration_declaration"; + case syntax::NodeRole::ExplicitTemplateInstantiation_externKeyword: + return OS << "ExplicitTemplateInstantiation_externKeyword"; + case syntax::NodeRole::ExplicitTemplateInstantiation_declaration: + return OS << "ExplicitTemplateInstantiation_declaration"; case syntax::NodeRole::ArraySubscript_sizeExpression: return OS << "ArraySubscript_sizeExpression"; case syntax::NodeRole::TrailingReturnType_arrow: @@ -281,6 +291,31 @@ syntax::SimpleDeclaration::declarators() { return Children; } +syntax::Leaf *syntax::TemplateDeclaration::templateKeyword() { + return llvm::cast_or_null( + findChild(syntax::NodeRole::IntroducerKeyword)); +} + +syntax::Declaration *syntax::TemplateDeclaration::declaration() { + return llvm::cast_or_null( + findChild(syntax::NodeRole::TemplateDeclaration_declaration)); +} + +syntax::Leaf *syntax::ExplicitTemplateInstantiation::templateKeyword() { + return llvm::cast_or_null( + findChild(syntax::NodeRole::IntroducerKeyword)); +} + +syntax::Leaf *syntax::ExplicitTemplateInstantiation::externKeyword() { + return llvm::cast_or_null( + findChild(syntax::NodeRole::ExplicitTemplateInstantiation_externKeyword)); +} + +syntax::Declaration *syntax::ExplicitTemplateInstantiation::declaration() { + return llvm::cast_or_null( + findChild(syntax::NodeRole::ExplicitTemplateInstantiation_declaration)); +} + syntax::Leaf *syntax::ParenDeclarator::lparen() { return llvm::cast_or_null( 
findChild(syntax::NodeRole::OpenParen)); diff --git a/clang/lib/Tooling/Syntax/Tree.cpp b/clang/lib/Tooling/Syntax/Tree.cpp index 9a6270ec4cce3..37579e6145b65 100644 --- a/clang/lib/Tooling/Syntax/Tree.cpp +++ b/clang/lib/Tooling/Syntax/Tree.cpp @@ -58,22 +58,33 @@ bool syntax::Leaf::classof(const Node *N) { syntax::Node::Node(NodeKind Kind) : Parent(nullptr), NextSibling(nullptr), Kind(static_cast(Kind)), - Role(static_cast(NodeRole::Detached)), Original(false), - CanModify(false) {} + Role(0), Original(false), CanModify(false) { + this->setRole(NodeRole::Detached); +} bool syntax::Node::isDetached() const { return role() == NodeRole::Detached; } +void syntax::Node::setRole(NodeRole NR) { + this->Role = static_cast(NR); +} + bool syntax::Tree::classof(const Node *N) { return N->kind() > NodeKind::Leaf; } void syntax::Tree::prependChildLowLevel(Node *Child, NodeRole Role) { - assert(Child->Parent == nullptr); - assert(Child->NextSibling == nullptr); assert(Child->role() == NodeRole::Detached); assert(Role != NodeRole::Detached); + Child->setRole(Role); + prependChildLowLevel(Child); +} + +void syntax::Tree::prependChildLowLevel(Node *Child) { + assert(Child->Parent == nullptr); + assert(Child->NextSibling == nullptr); + assert(Child->role() != NodeRole::Detached); + Child->Parent = this; Child->NextSibling = this->FirstChild; - Child->Role = static_cast(Role); this->FirstChild = Child; } @@ -94,7 +105,7 @@ void syntax::Tree::replaceChildRangeLowLevel(Node *BeforeBegin, Node *End, N != End;) { auto *Next = N->NextSibling; - N->Role = static_cast(NodeRole::Detached); + N->setRole(NodeRole::Detached); N->Parent = nullptr; N->NextSibling = nullptr; if (N->Original) diff --git a/clang/test/AST/ast-dump-arm-attr.c b/clang/test/AST/ast-dump-arm-attr.c index 41328165d210f..82a797615009a 100644 --- a/clang/test/AST/ast-dump-arm-attr.c +++ b/clang/test/AST/ast-dump-arm-attr.c @@ -1,5 +1,9 @@ // RUN: %clang_cc1 -triple arm-apple-darwin -ast-dump -ast-dump-filter Test %s | FileCheck --strict-whitespace %s +// RUN: %clang_cc1 -triple armv8m.base-none-eabi -mcmse -ast-dump -ast-dump-filter Test %s | FileCheck --strict-whitespace %s --check-prefix=CHECK-CMSE __attribute__((interrupt)) void Test(void); // CHECK: FunctionDecl{{.*}}Test // CHECK-NEXT: ARMInterruptAttr + +typedef int (*CmseTest)(int a) __attribute__((cmse_nonsecure_call)); +// CHECK-CMSE: TypedefDecl{{.*}}CmseTest{{.*}}__attribute__((cmse_nonsecure_call)) diff --git a/clang/test/AST/ast-dump-expr-errors.cpp b/clang/test/AST/ast-dump-expr-errors.cpp new file mode 100644 index 0000000000000..e623fad04f4c8 --- /dev/null +++ b/clang/test/AST/ast-dump-expr-errors.cpp @@ -0,0 +1,46 @@ +// RUN: not %clang_cc1 -triple x86_64-unknown-unknown -Wno-unused-value -fcxx-exceptions -std=gnu++17 -ast-dump -frecovery-ast %s | FileCheck -strict-whitespace %s + +// Check errors flag is set for RecoveryExpr. +// +// CHECK: VarDecl {{.*}} a +// CHECK-NEXT:`-RecoveryExpr {{.*}} contains-errors +// CHECK-NEXT: `-UnresolvedLookupExpr {{.*}} 'bar' +int a = bar(); + +// The flag propagates through more complicated calls. +// +// CHECK: VarDecl {{.*}} b +// CHECK-NEXT:`-CallExpr {{.*}} contains-errors +// CHECK-NEXT: |-UnresolvedLookupExpr {{.*}} 'bar' +// CHECK-NEXT: |-RecoveryExpr {{.*}} contains-errors +// CHECK-NEXT: | `-UnresolvedLookupExpr {{.*}} 'baz' +// CHECK-NEXT: `-RecoveryExpr {{.*}} contains-errors +// CHECK-NEXT: `-UnresolvedLookupExpr {{.*}} 'qux' +int b = bar(baz(), qux()); + +// Also propagates through more complicated expressions. 
+// +// CHECK: |-VarDecl {{.*}} c +// CHECK-NEXT:| `-BinaryOperator {{.*}} '' contains-errors '*' +// CHECK-NEXT:| |-UnaryOperator {{.*}} '' contains-errors prefix '&' +// CHECK-NEXT:| | `-ParenExpr {{.*}} '' contains-errors +// CHECK-NEXT:| | `-BinaryOperator {{.*}} '' contains-errors '+' +// CHECK-NEXT:| | |-RecoveryExpr {{.*}} '' contains-errors +// CHECK-NEXT:| | | `-UnresolvedLookupExpr {{.*}} 'bar' +// CHECK-NEXT:| | `-RecoveryExpr {{.*}} '' contains-errors +// CHECK-NEXT:| | `-UnresolvedLookupExpr {{.*}} 'baz' +int c = &(bar() + baz()) * 10; + +// Errors flag propagates even when type is not dependent anymore. +// CHECK: |-VarDecl {{.*}} d +// CHECK-NEXT:| `-CXXStaticCastExpr {{.*}} 'int' contains-errors +// CHECK-NEXT:| `-BinaryOperator {{.*}} '' contains-errors '+' +// CHECK-NEXT:| |-RecoveryExpr {{.*}} '' contains-errors +// CHECK-NEXT:| | `-UnresolvedLookupExpr {{.*}} 'bar' +// CHECK-NEXT:| `-IntegerLiteral {{.*}} 1 +int d = static_cast(bar() + 1); + +// FIXME: store initializer even when 'auto' could not be deduced. +// Expressions with errors currently do not keep initializers around. +// CHECK: `-VarDecl {{.*}} invalid e 'auto' +auto e = bar(); diff --git a/clang/test/AST/ast-dump-recovery.cpp b/clang/test/AST/ast-dump-recovery.cpp new file mode 100644 index 0000000000000..beb409edc4a6a --- /dev/null +++ b/clang/test/AST/ast-dump-recovery.cpp @@ -0,0 +1,85 @@ +// RUN: not %clang_cc1 -triple x86_64-unknown-unknown -Wno-unused-value -fcxx-exceptions -std=gnu++17 -frecovery-ast -ast-dump %s | FileCheck -strict-whitespace %s +// RUN: not %clang_cc1 -triple x86_64-unknown-unknown -Wno-unused-value -fcxx-exceptions -std=gnu++17 -fno-recovery-ast -ast-dump %s | FileCheck --check-prefix=DISABLED -strict-whitespace %s + +int some_func(int *); + +// CHECK: VarDecl {{.*}} invalid_call +// CHECK-NEXT:`-RecoveryExpr {{.*}} contains-errors +// CHECK-NEXT: |-UnresolvedLookupExpr {{.*}} 'some_func' +// CHECK-NEXT: `-IntegerLiteral {{.*}} 123 +// DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors +int invalid_call = some_func(123); + +int ambig_func(double); +int ambig_func(float); + +// CHECK: VarDecl {{.*}} ambig_call +// CHECK-NEXT:`-RecoveryExpr {{.*}} contains-errors +// CHECK-NEXT: |-UnresolvedLookupExpr {{.*}} 'ambig_func' +// CHECK-NEXT: `-IntegerLiteral {{.*}} 123 +// DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors +int ambig_call = ambig_func(123); + +// CHECK: VarDecl {{.*}} unresolved_call1 +// CHECK-NEXT:`-RecoveryExpr {{.*}} contains-errors +// CHECK-NEXT: `-UnresolvedLookupExpr {{.*}} 'bar' +// DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors +int unresolved_call1 = bar(); + +// CHECK: VarDecl {{.*}} unresolved_call2 +// CHECK-NEXT:`-CallExpr {{.*}} contains-errors +// CHECK-NEXT: |-UnresolvedLookupExpr {{.*}} 'bar' +// CHECK-NEXT: |-RecoveryExpr {{.*}} contains-errors +// CHECK-NEXT: | `-UnresolvedLookupExpr {{.*}} 'baz' +// CHECK-NEXT: `-RecoveryExpr {{.*}} contains-errors +// CHECK-NEXT: `-UnresolvedLookupExpr {{.*}} 'qux' +// DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors +int unresolved_call2 = bar(baz(), qux()); + +constexpr int a = 10; + +// CHECK: VarDecl {{.*}} postfix_inc +// CHECK-NEXT:`-RecoveryExpr {{.*}} contains-errors +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'a' +// DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors +int postfix_inc = a++; + +// CHECK: VarDecl {{.*}} prefix_inc +// CHECK-NEXT:`-RecoveryExpr {{.*}} contains-errors +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'a' +// DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors +int prefix_inc = ++a; + +// 
CHECK: VarDecl {{.*}} unary_address +// CHECK-NEXT:`-RecoveryExpr {{.*}} contains-errors +// CHECK-NEXT: `-ParenExpr {{.*}} +// CHECK-NEXT: `-BinaryOperator {{.*}} '+' +// CHECK-NEXT: |-ImplicitCastExpr +// CHECK-NEXT: | `-DeclRefExpr {{.*}} 'a' +// DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors +int unary_address = &(a + 1); + +// CHECK: VarDecl {{.*}} unary_bitinverse +// CHECK-NEXT:`-RecoveryExpr {{.*}} contains-errors +// CHECK-NEXT: `-ParenExpr {{.*}} +// CHECK-NEXT: `-BinaryOperator {{.*}} '+' +// CHECK-NEXT: |-ImplicitCastExpr +// CHECK-NEXT: | `-ImplicitCastExpr +// CHECK-NEXT: | `-DeclRefExpr {{.*}} 'a' +// DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors +int unary_bitinverse = ~(a + 0.0); + +// CHECK: VarDecl {{.*}} binary +// CHECK-NEXT:`-RecoveryExpr {{.*}} contains-errors +// CHECK-NEXT: |-DeclRefExpr {{.*}} 'a' +// CHECK-NEXT: `-CXXNullPtrLiteralExpr +// DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors +int binary = a + nullptr; + +// CHECK: VarDecl {{.*}} ternary +// CHECK-NEXT:`-RecoveryExpr {{.*}} contains-errors +// CHECK-NEXT: |-DeclRefExpr {{.*}} 'a' +// CHECK-NEXT: |-CXXNullPtrLiteralExpr +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'a' +// DISABLED-NOT: -RecoveryExpr {{.*}} contains-errors +int ternary = a ? nullptr : a; diff --git a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h index 54cfdcef7b16d..25a705bb93584 100644 --- a/clang/test/Analysis/Inputs/system-header-simulator-cxx.h +++ b/clang/test/Analysis/Inputs/system-header-simulator-cxx.h @@ -757,31 +757,66 @@ namespace std { } template -void __advance (BidirectionalIterator& it, Distance n, - std::bidirectional_iterator_tag) { +void __advance(BidirectionalIterator& it, Distance n, + std::bidirectional_iterator_tag) +#if !defined(STD_ADVANCE_INLINE_LEVEL) || STD_ADVANCE_INLINE_LEVEL > 2 +{ if (n >= 0) while(n-- > 0) ++it; else while (n++<0) --it; } +#else + ; +#endif template -void __advance (RandomAccessIterator& it, Distance n, - std::random_access_iterator_tag) { +void __advance(RandomAccessIterator& it, Distance n, + std::random_access_iterator_tag) +#if !defined(STD_ADVANCE_INLINE_LEVEL) || STD_ADVANCE_INLINE_LEVEL > 2 +{ it += n; } +#else + ; +#endif namespace std { - template - void advance (InputIterator& it, Distance n) { - __advance(it, n, typename InputIterator::iterator_category()); - } - template - BidirectionalIterator - prev (BidirectionalIterator it, - typename iterator_traits::difference_type n = - 1) { - advance(it, -n); - return it; - } +template +void advance(InputIterator& it, Distance n) +#if !defined(STD_ADVANCE_INLINE_LEVEL) || STD_ADVANCE_INLINE_LEVEL > 1 +{ + __advance(it, n, typename InputIterator::iterator_category()); +} +#else + ; +#endif + +template +BidirectionalIterator +prev(BidirectionalIterator it, + typename iterator_traits::difference_type n = + 1) +#if !defined(STD_ADVANCE_INLINE_LEVEL) || STD_ADVANCE_INLINE_LEVEL > 0 +{ + advance(it, -n); + return it; +} +#else + ; +#endif + +template +ForwardIterator +next(ForwardIterator it, + typename iterator_traits::difference_type n = + 1) +#if !defined(STD_ADVANCE_INLINE_LEVEL) || STD_ADVANCE_INLINE_LEVEL > 0 +{ + advance(it, n); + return it; +} +#else + ; +#endif template InputIt find(InputIt first, InputIt last, const T& value); diff --git a/clang/test/Analysis/analyzer-enabled-checkers.c b/clang/test/Analysis/analyzer-enabled-checkers.c index ba850ec6da48f..55f15f6d37c8f 100644 --- a/clang/test/Analysis/analyzer-enabled-checkers.c +++ 
b/clang/test/Analysis/analyzer-enabled-checkers.c @@ -7,6 +7,7 @@ // CHECK: OVERVIEW: Clang Static Analyzer Enabled Checkers List // CHECK-EMPTY: // CHECK-NEXT: apiModeling.StdCLibraryFunctions +// CHECK-NEXT: apiModeling.StdCLibraryFunctionArgs // CHECK-NEXT: apiModeling.TrustNonnull // CHECK-NEXT: apiModeling.llvm.CastValue // CHECK-NEXT: apiModeling.llvm.ReturnValue diff --git a/clang/test/Analysis/debug-CallGraph.cpp b/clang/test/Analysis/debug-CallGraph.cpp index 0f5a83b268a01..120bb38d3bb33 100644 --- a/clang/test/Analysis/debug-CallGraph.cpp +++ b/clang/test/Analysis/debug-CallGraph.cpp @@ -81,8 +81,27 @@ namespace Lambdas { } } +namespace CallDecl { + void SomeDecl(); + void SomeOtherDecl(); + void SomeDef() {} + + void Caller() { + SomeDecl(); + SomeOtherDecl(); + } + + void SomeOtherDecl() { + SomeDef(); + } +} + // CHECK:--- Call graph Dump --- -// CHECK-NEXT: {{Function: < root > calls: get5 add test_add mmm foo aaa < > bbb ddd ccc eee fff do_nothing test_single_call SomeNS::templ SomeNS::templ SomeNS::templUser Lambdas::Callee Lambdas::f1 Lambdas::f1\(\)::\(anonymous class\)::operator\(\) Lambdas::f1\(\)::\(anonymous class\)::operator\(\) $}} +// CHECK-NEXT: {{Function: < root > calls: get5 add test_add mmm foo aaa < > bbb ddd ccc eee fff do_nothing test_single_call SomeNS::templ SomeNS::templ SomeNS::templUser Lambdas::Callee Lambdas::f1 Lambdas::f1\(\)::\(anonymous class\)::operator\(\) Lambdas::f1\(\)::\(anonymous class\)::operator\(\) CallDecl::SomeDef CallDecl::Caller CallDecl::SomeDecl CallDecl::SomeOtherDecl $}} +// CHECK-NEXT: {{Function: CallDecl::Caller calls: CallDecl::SomeDecl CallDecl::SomeOtherDecl $}} +// CHECK-NEXT: {{Function: CallDecl::SomeOtherDecl calls: CallDecl::SomeDef $}} +// CHECK-NEXT: {{Function: CallDecl::SomeDecl calls: $}} +// CHECK-NEXT: {{Function: CallDecl::SomeDef calls: $}} // CHECK-NEXT: {{Function: Lambdas::f1 calls: Lambdas::f1\(\)::\(anonymous class\)::operator\(\) Lambdas::f1\(\)::\(anonymous class\)::operator\(\) $}} // CHECK-NEXT: {{Function: Lambdas::f1\(\)::\(anonymous class\)::operator\(\) calls: Lambdas::Callee $}} // CHECK-NEXT: {{Function: Lambdas::f1\(\)::\(anonymous class\)::operator\(\) calls: Lambdas::Callee $}} diff --git a/clang/test/Analysis/iterator-modelling.cpp b/clang/test/Analysis/iterator-modelling.cpp index 4e40319cedc1f..bb37a7565ca16 100644 --- a/clang/test/Analysis/iterator-modelling.cpp +++ b/clang/test/Analysis/iterator-modelling.cpp @@ -2,6 +2,12 @@ // RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,cplusplus,debug.DebugIteratorModeling,debug.ExprInspection -analyzer-config aggressive-binary-operation-simplification=true -analyzer-config c++-container-inlining=true -DINLINE=1 %s -verify +// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,cplusplus,debug.DebugIteratorModeling,debug.ExprInspection -analyzer-config aggressive-binary-operation-simplification=true -analyzer-config c++-container-inlining=true -DINLINE=1 -DSTD_ADVANCE_INLINE_LEVEL=0 %s -verify + +// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,cplusplus,debug.DebugIteratorModeling,debug.ExprInspection -analyzer-config aggressive-binary-operation-simplification=true -analyzer-config c++-container-inlining=true -DINLINE=1 -DSTD_ADVANCE_INLINE_LEVEL=1 %s -verify + +// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,cplusplus,debug.DebugIteratorModeling,debug.ExprInspection -analyzer-config aggressive-binary-operation-simplification=true -analyzer-config c++-container-inlining=true -DINLINE=1 
-DSTD_ADVANCE_INLINE_LEVEL=2 %s -verify + // RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,cplusplus,alpha.cplusplus.IteratorModeling,debug.ExprInspection -analyzer-config aggressive-binary-operation-simplification=true %s 2>&1 | FileCheck %s #include "Inputs/system-header-simulator-cxx.h" @@ -233,6 +239,68 @@ void copy_and_decrement2(const std::vector &v) { clang_analyzer_express(clang_analyzer_iterator_position(i2)); //expected-warning{{$v.end() - 1}} } +/// std::advance(), std::prev(), std::next() + +void std_advance_minus(const std::vector &v) { + auto i = v.end(); + + clang_analyzer_denote(clang_analyzer_container_end(v), "$v.end()"); + + std::advance(i, -1); + + clang_analyzer_express(clang_analyzer_iterator_position(i)); //expected-warning{{$v.end() - 1}} +} + +void std_advance_plus(const std::vector &v) { + auto i = v.begin(); + + clang_analyzer_denote(clang_analyzer_container_begin(v), "$v.begin()"); + + std::advance(i, 1); + + clang_analyzer_express(clang_analyzer_iterator_position(i)); //expected-warning{{$v.begin() + 1}} +} + +void std_prev(const std::vector &v) { + auto i = v.end(); + + clang_analyzer_denote(clang_analyzer_container_end(v), "$v.end()"); + + auto j = std::prev(i); + + clang_analyzer_express(clang_analyzer_iterator_position(j)); //expected-warning{{$v.end() - 1}} +} + +void std_prev2(const std::vector &v) { + auto i = v.end(); + + clang_analyzer_denote(clang_analyzer_container_end(v), "$v.end()"); + + auto j = std::prev(i, 2); + + clang_analyzer_express(clang_analyzer_iterator_position(j)); //expected-warning{{$v.end() - 2}} +} + +void std_next(const std::vector &v) { + auto i = v.begin(); + + clang_analyzer_denote(clang_analyzer_container_begin(v), "$v.begin()"); + + auto j = std::next(i); + + clang_analyzer_express(clang_analyzer_iterator_position(j)); //expected-warning{{$v.begin() + 1}} +} + +void std_next2(const std::vector &v) { + auto i = v.begin(); + + clang_analyzer_denote(clang_analyzer_container_begin(v), "$v.begin()"); + + auto j = std::next(i, 2); + + clang_analyzer_express(clang_analyzer_iterator_position(j)); //expected-warning{{$v.begin() + 2}} +} + //////////////////////////////////////////////////////////////////////////////// /// /// C O N T A I N E R A S S I G N M E N T S diff --git a/clang/test/Analysis/iterator-range.cpp b/clang/test/Analysis/iterator-range.cpp index 71af17a1f3d02..babcfdec99d66 100644 --- a/clang/test/Analysis/iterator-range.cpp +++ b/clang/test/Analysis/iterator-range.cpp @@ -359,6 +359,423 @@ void subscript_positive_end(const std::vector &V) { auto j = i[1]; // expected-warning{{Past-the-end iterator dereferenced}} FIXME: expect warning Iterator incremented behind the past-the-end iterator } +// +// std::advance() +// + +// std::advance() by +1 + +void advance_plus_1_begin(const std::vector &V) { + auto i = V.begin(); + std::advance(i, 1); // no-warning +} + +void advance_plus_1_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + std::advance(i, 1); // no-warning +} + +void advance_plus_1_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + std::advance(i, 1); // no-warning +} + +void advance_plus_1_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + std::advance(i, 1); // no-warning +} + +void advance_plus_1_end(const std::vector &V) { + auto i = V.end(); + std::advance(i, 1); // expected-warning{{Iterator incremented behind the past-the-end iterator}} +} + +// std::advance() by -1 + +void advance_minus_1_begin(const std::vector &V) { + auto i = V.begin(); + 
std::advance(i, -1); // expected-warning{{Iterator decremented ahead of its valid range}} +} + +void advance_minus_1_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + std::advance(i, -1); // no-warning +} + +void advance_minus_1_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + std::advance(i, -1); // no-warning +} + +void advance_minus_1_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + std::advance(i, -1); // no-warning +} + +void advance_minus_1_end(const std::vector &V) { + auto i = V.end(); + std::advance(i, -1); // no-warning +} + +// std::advance() by +2 + +void advance_plus_2_begin(const std::vector &V) { + auto i = V.begin(); + std::advance(i, 2); // no-warning +} + +void advance_plus_2_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + std::advance(i, 2); // no-warning +} + +void advance_plus_2_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + std::advance(i, 2); // no-warning +} + +void advance_plus_2_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + std::advance(i, 2); // expected-warning{{Iterator incremented behind the past-the-end iterator}} +} + +void advance_plus_2_end(const std::vector &V) { + auto i = V.end(); + std::advance(i, 2); // expected-warning{{Iterator incremented behind the past-the-end iterator}} +} + +// std::advance() by -2 + +void advance_minus_2_begin(const std::vector &V) { + auto i = V.begin(); + std::advance(i, -2); // expected-warning{{Iterator decremented ahead of its valid range}} +} + +void advance_minus_2_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + std::advance(i, -2); // expected-warning{{Iterator decremented ahead of its valid range}} +} + +void advance_minus_2_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + std::advance(i, -2); // no-warning +} + +void advance_minus_2_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + std::advance(i, -2); // no-warning +} + +void advance_minus_2_end(const std::vector &V) { + auto i = V.end(); + std::advance(i, -2); // no-warning +} + +// std::advance() by 0 + +void advance_0_begin(const std::vector &V) { + auto i = V.begin(); + std::advance(i, 0); // no-warning +} + +void advance_0_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + std::advance(i, 0); // no-warning +} + +void advance_0_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + std::advance(i, 0); // no-warning +} + +void advance_0_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + std::advance(i, 0); // no-warning +} + +void advance_0_end(const std::vector &V) { + auto i = V.end(); + std::advance(i, 0); // no-warning +} + +// +// std::next() +// + +// std::next() by +1 (default) + +void next_plus_1_begin(const std::vector &V) { + auto i = V.begin(); + auto j = std::next(i); // no-warning +} + +void next_plus_1_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + auto j = std::next(i); // no-warning +} + +void next_plus_1_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + auto j = std::next(i); // no-warning +} + +void next_plus_1_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + auto j = std::next(i); // no-warning +} + +void next_plus_1_end(const std::vector &V) { + auto i = V.end(); + auto j = std::next(i); // expected-warning{{Iterator incremented behind the past-the-end iterator}} +} + +// std::next() by -1 + +void next_minus_1_begin(const std::vector &V) { + auto i = V.begin(); + auto j = 
std::next(i, -1); // expected-warning{{Iterator decremented ahead of its valid range}} +} + +void next_minus_1_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + auto j = std::next(i, -1); // no-warning +} + +void next_minus_1_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + auto j = std::next(i, -1); // no-warning +} + +void next_minus_1_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + auto j = std::next(i, -1); // no-warning +} + +void next_minus_1_end(const std::vector &V) { + auto i = V.end(); + auto j = std::next(i, -1); // no-warning +} + +// std::next() by +2 + +void next_plus_2_begin(const std::vector &V) { + auto i = V.begin(); + auto j = std::next(i, 2); // no-warning +} + +void next_plus_2_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + auto j = std::next(i, 2); // no-warning +} + +void next_plus_2_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + auto j = std::next(i, 2); // no-warning +} + +void next_plus_2_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + auto j = std::next(i, 2); // expected-warning{{Iterator incremented behind the past-the-end iterator}} +} + +void next_plus_2_end(const std::vector &V) { + auto i = V.end(); + auto j = std::next(i, 2); // expected-warning{{Iterator incremented behind the past-the-end iterator}} +} + +// std::next() by -2 + +void next_minus_2_begin(const std::vector &V) { + auto i = V.begin(); + auto j = std::next(i, -2); // expected-warning{{Iterator decremented ahead of its valid range}} +} + +void next_minus_2_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + auto j = std::next(i, -2); // expected-warning{{Iterator decremented ahead of its valid range}} +} + +void next_minus_2_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + auto j = std::next(i, -2); // no-warning +} + +void next_minus_2_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + auto j = std::next(i, -2); // no-warning +} + +void next_minus_2_end(const std::vector &V) { + auto i = V.end(); + auto j = std::next(i, -2); // no-warning +} + +// std::next() by 0 + +void next_0_begin(const std::vector &V) { + auto i = V.begin(); + auto j = std::next(i, 0); // no-warning +} + +void next_0_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + auto j = std::next(i, 0); // no-warning +} + +void next_0_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + auto j = std::next(i, 0); // no-warning +} + +void next_0_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + auto j = std::next(i, 0); // no-warning +} + +void next_0_end(const std::vector &V) { + auto i = V.end(); + auto j = std::next(i, 0); // no-warning +} + +// +// std::prev() +// + +// std::prev() by +1 (default) + +void prev_plus_1_begin(const std::vector &V) { + auto i = V.begin(); + auto j = std::prev(i); // expected-warning{{Iterator decremented ahead of its valid range}} +} + +void prev_plus_1_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + auto j = std::prev(i); // no-warning +} + +void prev_plus_1_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + auto j = std::prev(i); // no-warning +} + +void prev_plus_1_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + auto j = std::prev(i); // no-warning +} + +void prev_plus_1_end(const std::vector &V) { + auto i = V.end(); + auto j = std::prev(i); // no-warning +} + +// std::prev() by -1 + +void prev_minus_1_begin(const std::vector &V) { + auto 
i = V.begin(); + auto j = std::prev(i, -1); // no-warning +} + +void prev_minus_1_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + auto j = std::prev(i, -1); // no-warning +} + +void prev_minus_1_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + auto j = std::prev(i, -1); // no-warning +} + +void prev_minus_1_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + auto j = std::prev(i, -1); // no-warning +} + +void prev_minus_1_end(const std::vector &V) { + auto i = V.end(); + auto j = std::prev(i, -1); // expected-warning{{Iterator incremented behind the past-the-end iterator}} +} + +// std::prev() by +2 + +void prev_plus_2_begin(const std::vector &V) { + auto i = V.begin(); + auto j = std::prev(i, 2); // expected-warning{{Iterator decremented ahead of its valid range}} +} + +void prev_plus_2_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + auto j = std::prev(i, 2); // expected-warning{{Iterator decremented ahead of its valid range}} +} + +void prev_plus_2_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + auto j = std::prev(i, 2); // no-warning +} + +void prev_plus_2_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + auto j = std::prev(i, 2); // no-warning +} + +void prev_plus_2_end(const std::vector &V) { + auto i = V.end(); + auto j = std::prev(i, 2); // no-warning +} + +// std::prev() by -2 + +void prev_minus_2_begin(const std::vector &V) { + auto i = V.begin(); + auto j = std::prev(i, -2); // no-warning +} + +void prev_minus_2_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + auto j = std::prev(i, -2); // no-warning +} + +void prev_minus_2_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + auto j = std::prev(i, -2); // no-warning +} + +void prev_minus_2_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + auto j = std::prev(i, -2); // expected-warning{{Iterator incremented behind the past-the-end iterator}} +} + +void prev_minus_2_end(const std::vector &V) { + auto i = V.end(); + auto j = std::prev(i, -2); // expected-warning{{Iterator incremented behind the past-the-end iterator}} +} + +// std::prev() by 0 + +void prev_0_begin(const std::vector &V) { + auto i = V.begin(); + auto j = std::prev(i, 0); // no-warning +} + +void prev_0_behind_begin(const std::vector &V) { + auto i = ++V.begin(); + auto j = std::prev(i, 0); // no-warning +} + +void prev_0_unknown(const std::vector &V) { + auto i = return_any_iterator(V.begin()); + auto j = std::prev(i, 0); // no-warning +} + +void prev_0_ahead_of_end(const std::vector &V) { + auto i = --V.end(); + auto j = std::prev(i, 0); // no-warning +} + +void prev_0_end(const std::vector &V) { + auto i = V.end(); + auto j = std::prev(i, 0); // no-warning +} + // // Structure member dereference operators // diff --git a/clang/test/Analysis/std-c-library-functions-arg-constraints.c b/clang/test/Analysis/std-c-library-functions-arg-constraints.c new file mode 100644 index 0000000000000..a20b90ad1ccb1 --- /dev/null +++ b/clang/test/Analysis/std-c-library-functions-arg-constraints.c @@ -0,0 +1,87 @@ +// Check the basic reporting/warning and the application of constraints. 
+// RUN: %clang_analyze_cc1 %s \ +// RUN: -analyzer-checker=core \ +// RUN: -analyzer-checker=apiModeling.StdCLibraryFunctions \ +// RUN: -analyzer-checker=apiModeling.StdCLibraryFunctionArgs \ +// RUN: -analyzer-checker=debug.ExprInspection \ +// RUN: -triple x86_64-unknown-linux-gnu \ +// RUN: -verify=report + +// Check the bugpath related to the reports. +// RUN: %clang_analyze_cc1 %s \ +// RUN: -analyzer-checker=core \ +// RUN: -analyzer-checker=apiModeling.StdCLibraryFunctions \ +// RUN: -analyzer-checker=apiModeling.StdCLibraryFunctionArgs \ +// RUN: -analyzer-checker=debug.ExprInspection \ +// RUN: -triple x86_64-unknown-linux-gnu \ +// RUN: -analyzer-output=text \ +// RUN: -verify=bugpath + +void clang_analyzer_eval(int); + +int glob; + +#define EOF -1 + +int isalnum(int); + +void test_alnum_concrete(int v) { + int ret = isalnum(256); // \ + // report-warning{{Function argument constraint is not satisfied}} \ + // bugpath-warning{{Function argument constraint is not satisfied}} \ + // bugpath-note{{Function argument constraint is not satisfied}} + (void)ret; +} + +void test_alnum_symbolic(int x) { + int ret = isalnum(x); + (void)ret; + + clang_analyzer_eval(EOF <= x && x <= 255); // \ + // report-warning{{TRUE}} \ + // bugpath-warning{{TRUE}} \ + // bugpath-note{{TRUE}} \ + // bugpath-note{{Left side of '&&' is true}} \ + // bugpath-note{{'x' is <= 255}} + +} + +void test_alnum_symbolic2(int x) { + if (x > 255) { // \ + // bugpath-note{{Assuming 'x' is > 255}} \ + // bugpath-note{{Taking true branch}} + + int ret = isalnum(x); // \ + // report-warning{{Function argument constraint is not satisfied}} \ + // bugpath-warning{{Function argument constraint is not satisfied}} \ + // bugpath-note{{Function argument constraint is not satisfied}} + + (void)ret; + } +} + +typedef struct FILE FILE; +typedef typeof(sizeof(int)) size_t; +size_t fread(void *, size_t, size_t, FILE *); +void test_notnull_concrete(FILE *fp) { + fread(0, sizeof(int), 10, fp); // \ + // report-warning{{Function argument constraint is not satisfied}} \ + // bugpath-warning{{Function argument constraint is not satisfied}} \ + // bugpath-note{{Function argument constraint is not satisfied}} +} +void test_notnull_symbolic(FILE *fp, int *buf) { + fread(buf, sizeof(int), 10, fp); + clang_analyzer_eval(buf != 0); // \ + // report-warning{{TRUE}} \ + // bugpath-warning{{TRUE}} \ + // bugpath-note{{TRUE}} \ + // bugpath-note{{'buf' is not equal to null}} +} +void test_notnull_symbolic2(FILE *fp, int *buf) { + if (!buf) // bugpath-note{{Assuming 'buf' is null}} \ + // bugpath-note{{Taking true branch}} + fread(buf, sizeof(int), 10, fp); // \ + // report-warning{{Function argument constraint is not satisfied}} \ + // bugpath-warning{{Function argument constraint is not satisfied}} \ + // bugpath-note{{Function argument constraint is not satisfied}} +} diff --git a/clang/test/Analysis/std-c-library-functions.c b/clang/test/Analysis/std-c-library-functions.c index 9fb8833175bef..3f700a7c39a45 100644 --- a/clang/test/Analysis/std-c-library-functions.c +++ b/clang/test/Analysis/std-c-library-functions.c @@ -1,8 +1,34 @@ -// RUN: %clang_analyze_cc1 -analyzer-checker=apiModeling.StdCLibraryFunctions,debug.ExprInspection -verify -analyzer-config eagerly-assume=false %s -// RUN: %clang_analyze_cc1 -triple i686-unknown-linux -analyzer-checker=apiModeling.StdCLibraryFunctions,debug.ExprInspection -verify -analyzer-config eagerly-assume=false %s -// RUN: %clang_analyze_cc1 -triple x86_64-unknown-linux 
-analyzer-checker=apiModeling.StdCLibraryFunctions,debug.ExprInspection -verify -analyzer-config eagerly-assume=false %s -// RUN: %clang_analyze_cc1 -triple armv7-a15-linux -analyzer-checker=apiModeling.StdCLibraryFunctions,debug.ExprInspection -verify -analyzer-config eagerly-assume=false %s -// RUN: %clang_analyze_cc1 -triple thumbv7-a15-linux -analyzer-checker=apiModeling.StdCLibraryFunctions,debug.ExprInspection -verify -analyzer-config eagerly-assume=false %s +// RUN: %clang_analyze_cc1 %s \ +// RUN: -analyzer-checker=core \ +// RUN: -analyzer-checker=apiModeling.StdCLibraryFunctions \ +// RUN: -analyzer-checker=debug.ExprInspection \ +// RUN: -analyzer-config eagerly-assume=false \ +// RUN: -triple i686-unknown-linux \ +// RUN: -verify + +// RUN: %clang_analyze_cc1 %s \ +// RUN: -analyzer-checker=core \ +// RUN: -analyzer-checker=apiModeling.StdCLibraryFunctions \ +// RUN: -analyzer-checker=debug.ExprInspection \ +// RUN: -analyzer-config eagerly-assume=false \ +// RUN: -triple x86_64-unknown-linux \ +// RUN: -verify + +// RUN: %clang_analyze_cc1 %s \ +// RUN: -analyzer-checker=core \ +// RUN: -analyzer-checker=apiModeling.StdCLibraryFunctions \ +// RUN: -analyzer-checker=debug.ExprInspection \ +// RUN: -analyzer-config eagerly-assume=false \ +// RUN: -triple armv7-a15-linux \ +// RUN: -verify + +// RUN: %clang_analyze_cc1 %s \ +// RUN: -analyzer-checker=core \ +// RUN: -analyzer-checker=apiModeling.StdCLibraryFunctions \ +// RUN: -analyzer-checker=debug.ExprInspection \ +// RUN: -analyzer-config eagerly-assume=false \ +// RUN: -triple thumbv7-a15-linux \ +// RUN: -verify void clang_analyzer_eval(int); @@ -52,10 +78,13 @@ void test_read_write(int fd, char *buf) { size_t fread(void *, size_t, size_t, FILE *); size_t fwrite(const void *restrict, size_t, size_t, FILE *restrict); void test_fread_fwrite(FILE *fp, int *buf) { + size_t x = fwrite(buf, sizeof(int), 10, fp); clang_analyzer_eval(x <= 10); // expected-warning{{TRUE}} + size_t y = fread(buf, sizeof(int), 10, fp); clang_analyzer_eval(y <= 10); // expected-warning{{TRUE}} + size_t z = fwrite(buf, sizeof(int), y, fp); clang_analyzer_eval(z <= y); // expected-warning{{TRUE}} } diff --git a/clang/test/CXX/drs/dr4xx.cpp b/clang/test/CXX/drs/dr4xx.cpp index 35fd15f0cc642..2c762237037d2 100644 --- a/clang/test/CXX/drs/dr4xx.cpp +++ b/clang/test/CXX/drs/dr4xx.cpp @@ -297,13 +297,11 @@ namespace dr420 { // dr420: yes void test2(T p) { p->template Y::~Y(); p->~Y(); - // FIXME: This is ill-formed, but this diagnostic is terrible. We should - // reject this in the parser. 
- p->template ~Y(); // expected-error 2{{no member named '~typename Y'}} + p->template ~Y(); // expected-error {{'template' keyword not permitted in destructor name}} } template struct Y {}; - template void test2(Y*); // expected-note {{instantiation}} - template void test2(ptr >); // expected-note {{instantiation}} + template void test2(Y*); + template void test2(ptr >); void test3(int *p, ptr q) { typedef int Int; diff --git a/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p3-2a.cpp b/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p3-2a.cpp index 023e076d50a73..f572a79327723 100644 --- a/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p3-2a.cpp +++ b/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p3-2a.cpp @@ -144,24 +144,26 @@ namespace problem_cases { bool cmp_base_derived = D() == D(); // expected-warning {{ambiguous}} template<typename T> struct CRTPBase { - bool operator==(const T&) const; // expected-note {{operator}} + bool operator==(const T&) const; // expected-note {{operator}} expected-note {{reversed}} + bool operator!=(const T&) const; // expected-note {{non-reversed}} }; struct CRTP : CRTPBase<CRTP> {}; - bool cmp_crtp = CRTP() == CRTP(); // expected-warning {{ambiguous}} + bool cmp_crtp = CRTP() == CRTP(); // expected-warning-re {{ambiguous despite there being a unique best viable function{{$}}}} + bool cmp_crtp2 = CRTP() != CRTP(); // expected-warning {{ambiguous despite there being a unique best viable function with non-reversed arguments}} - // We can select a non-rewriteable operator== for a != comparison, when there - // was a viable operator!= candidate we could have used instead. - // - // Rejecting this seems OK on balance. + // Given a choice between a rewritten and non-rewritten function with the + // same parameter types, where the rewritten function is reversed and each + // has a better conversion for one of the two arguments, prefer the + // non-rewritten one. using UBool = signed char; // ICU uses this. 
struct ICUBase { virtual UBool operator==(const ICUBase&) const; UBool operator!=(const ICUBase &arg) const { return !operator==(arg); } }; struct ICUDerived : ICUBase { - UBool operator==(const ICUBase&) const override; // expected-note {{declared here}} + UBool operator==(const ICUBase&) const override; // expected-note {{declared here}} expected-note {{ambiguity is between}} }; - bool cmp_icu = ICUDerived() != ICUDerived(); // expected-error {{not 'bool'}} + bool cmp_icu = ICUDerived() != ICUDerived(); // expected-warning {{ambiguous}} expected-warning {{'bool', not 'problem_cases::UBool'}} } #else // NO_ERRORS diff --git a/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p9-2a.cpp b/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p9-2a.cpp index fce46816cedc6..3826a4127910e 100644 --- a/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p9-2a.cpp +++ b/clang/test/CXX/over/over.match/over.match.funcs/over.match.oper/p9-2a.cpp @@ -4,18 +4,28 @@ namespace not_bool { struct X {} x; struct Y {} y; - int operator==(X, Y); // expected-note 4{{here}} + double operator==(X, Y); // expected-note 4{{here}} bool a = x == y; // ok - bool b = y == x; // expected-error {{return type 'int' of selected 'operator==' function for rewritten '==' comparison is not 'bool'}} - bool c = x != y; // expected-error {{return type 'int' of selected 'operator==' function for rewritten '!=' comparison is not 'bool'}} - bool d = y != x; // expected-error {{return type 'int' of selected 'operator==' function for rewritten '!=' comparison is not 'bool'}} + bool b = y == x; // expected-error {{return type 'double' of selected 'operator==' function for rewritten '==' comparison is not 'bool'}} + bool c = x != y; // expected-error {{return type 'double' of selected 'operator==' function for rewritten '!=' comparison is not 'bool'}} + bool d = y != x; // expected-error {{return type 'double' of selected 'operator==' function for rewritten '!=' comparison is not 'bool'}} // cv-qualifiers are OK const bool operator==(Y, X); bool e = y != x; // ok // We don't prefer a function with bool return type over one with non-bool return type. - bool f = x != y; // expected-error {{return type 'int' of selected 'operator==' function for rewritten '!=' comparison is not 'bool'}} + bool f = x != y; // expected-error {{return type 'double' of selected 'operator==' function for rewritten '!=' comparison is not 'bool'}} + + // As an extension, we permit integral and unscoped enumeration types too. + // These are used by popular C++ libraries such as ICU. 
+ struct Z {} z; + int operator==(X, Z); // expected-note {{here}} + bool g = z == x; // expected-warning {{ISO C++20 requires return type of selected 'operator==' function for rewritten '==' comparison to be 'bool', not 'int'}} + + enum E {}; + E operator==(Y, Z); // expected-note {{here}} + bool h = z == y; // expected-warning {{ISO C++20 requires return type of selected 'operator==' function for rewritten '==' comparison to be 'bool', not 'not_bool::E'}} } struct X { bool equal; }; diff --git a/clang/test/CodeGen/2006-05-19-SingleEltReturn.c b/clang/test/CodeGen/2006-05-19-SingleEltReturn.c index dfc23f84ab424..d3f9e4e00acde 100644 --- a/clang/test/CodeGen/2006-05-19-SingleEltReturn.c +++ b/clang/test/CodeGen/2006-05-19-SingleEltReturn.c @@ -24,7 +24,7 @@ struct Y bar() { // X86_32: define void @foo(%struct.Y* %P) -// X86_32: call void @bar(%struct.Y* sret %{{[^),]*}}) +// X86_32: call void @bar(%struct.Y* sret align 4 %{{[^),]*}}) -// X86_32: define void @bar(%struct.Y* noalias sret %{{[^,)]*}}) +// X86_32: define void @bar(%struct.Y* noalias sret align 4 %{{[^,)]*}}) // X86_32: ret void diff --git a/clang/test/CodeGen/aarch64-neon-2velem.c b/clang/test/CodeGen/aarch64-neon-2velem.c index 5ad06cfff8086..25c0ae4988b74 100644 --- a/clang/test/CodeGen/aarch64-neon-2velem.c +++ b/clang/test/CodeGen/aarch64-neon-2velem.c @@ -7,8 +7,10 @@ // CHECK-LABEL: @test_vmla_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -18,8 +20,10 @@ int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlaq_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -29,8 +33,10 @@ int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmla_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -40,8 +46,10 @@ int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlaq_lane_s32( // CHECK-NEXT: 
entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -51,8 +59,10 @@ int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmla_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -62,8 +72,10 @@ int16x4_t test_vmla_laneq_s16(int16x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlaq_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -73,8 +85,10 @@ int16x8_t test_vmlaq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmla_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -84,8 +98,10 @@ int32x2_t test_vmla_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlaq_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -95,8 +111,10 @@ int32x4_t test_vmlaq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: 
@test_vmls_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -106,8 +124,10 @@ int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlsq_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -117,8 +137,10 @@ int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmls_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -128,8 +150,10 @@ int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsq_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -139,8 +163,10 @@ int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmls_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -150,8 +176,10 @@ int16x4_t test_vmls_laneq_s16(int16x4_t a, int16x4_t b, 
int16x8_t v) { // CHECK-LABEL: @test_vmlsq_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -161,8 +189,10 @@ int16x8_t test_vmlsq_laneq_s16(int16x8_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmls_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -172,8 +202,10 @@ int32x2_t test_vmls_laneq_s32(int32x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsq_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -183,8 +215,10 @@ int32x4_t test_vmlsq_laneq_s32(int32x4_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmul_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) { @@ -193,8 +227,10 @@ int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vmulq_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t v) { @@ -203,8 +239,10 @@ int16x8_t test_vmulq_lane_s16(int16x8_t 
a, int16x4_t v) { // CHECK-LABEL: @test_vmul_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) { @@ -213,8 +251,10 @@ int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vmulq_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) { @@ -223,8 +263,10 @@ int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vmul_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) { @@ -233,8 +275,10 @@ uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmulq_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) { @@ -243,8 +287,10 @@ uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmul_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) { @@ -253,8 +299,10 @@ uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t v) { // CHECK-LABEL: 
@test_vmulq_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) { @@ -263,8 +311,10 @@ uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmul_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) { @@ -273,8 +323,10 @@ int16x4_t test_vmul_laneq_s16(int16x4_t a, int16x8_t v) { // CHECK-LABEL: @test_vmulq_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) { @@ -283,8 +335,10 @@ int16x8_t test_vmulq_laneq_s16(int16x8_t a, int16x8_t v) { // CHECK-LABEL: @test_vmul_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) { @@ -293,8 +347,10 @@ int32x2_t test_vmul_laneq_s32(int32x2_t a, int32x4_t v) { // CHECK-LABEL: @test_vmulq_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) { @@ -303,8 +359,10 @@ int32x4_t test_vmulq_laneq_s32(int32x4_t a, int32x4_t v) { // CHECK-LABEL: @test_vmul_laneq_u16( // 
CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) { @@ -313,8 +371,10 @@ uint16x4_t test_vmul_laneq_u16(uint16x4_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmulq_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) { @@ -323,8 +383,10 @@ uint16x8_t test_vmulq_laneq_u16(uint16x8_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmul_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) { @@ -333,8 +395,10 @@ uint32x2_t test_vmul_laneq_u32(uint32x2_t a, uint32x4_t v) { // CHECK-LABEL: @test_vmulq_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // uint32x4_t test_vmulq_laneq_u32(uint32x4_t a, uint32x4_t v) { @@ -584,10 +648,12 @@ float64_t test_vfmsd_laneq_f64(float64_t a, float64_t b, float64x2_t v) { // CHECK-LABEL: @test_vmlal_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// 
CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -597,10 +663,12 @@ int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlal_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -610,10 +678,12 @@ int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlal_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -623,10 +693,12 @@ int32x4_t test_vmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlal_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> 
[[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -637,10 +709,12 @@ int64x2_t test_vmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlal_high_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -651,10 +725,12 @@ int32x4_t test_vmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlal_high_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -665,10 +741,12 @@ int64x2_t test_vmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlal_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> 
[[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -679,10 +757,12 @@ int32x4_t test_vmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlal_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -692,10 +772,12 @@ int64x2_t test_vmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsl_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -705,10 +787,12 @@ int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlsl_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// 
CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -718,10 +802,12 @@ int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsl_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -731,10 +817,12 @@ int32x4_t test_vmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsl_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -745,10 +833,12 @@ int64x2_t test_vmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsl_high_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x 
i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -759,10 +849,12 @@ int32x4_t test_vmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlsl_high_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -773,10 +865,12 @@ int64x2_t test_vmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsl_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -787,10 +881,12 @@ int32x4_t test_vmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsl_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: 
[[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -800,10 +896,12 @@ int64x2_t test_vmlsl_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlal_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -813,10 +911,12 @@ int32x4_t test_vmlal_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlal_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -826,10 +926,12 @@ int64x2_t test_vmlal_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlal_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: 
[[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -839,10 +941,12 @@ int32x4_t test_vmlal_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlal_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -853,10 +957,12 @@ int64x2_t test_vmlal_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlal_high_lane_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -867,10 +973,12 @@ int32x4_t test_vmlal_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlal_high_lane_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: 
[[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -881,10 +989,12 @@ int64x2_t test_vmlal_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlal_high_laneq_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -895,10 +1005,12 @@ int32x4_t test_vmlal_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlal_high_laneq_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -908,10 +1020,12 @@ int64x2_t test_vmlal_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsl_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// 
CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -921,10 +1035,12 @@ int32x4_t test_vmlsl_lane_u16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlsl_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -934,10 +1050,12 @@ int64x2_t test_vmlsl_lane_u32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsl_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -947,10 +1065,12 @@ int32x4_t test_vmlsl_laneq_u16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsl_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: 
[[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -961,10 +1081,12 @@ int64x2_t test_vmlsl_laneq_u32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsl_high_lane_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -975,10 +1097,12 @@ int32x4_t test_vmlsl_high_lane_u16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlsl_high_lane_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -989,10 +1113,12 @@ int64x2_t test_vmlsl_high_lane_u32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsl_high_laneq_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] 
to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -1003,10 +1129,12 @@ int32x4_t test_vmlsl_high_laneq_u16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsl_high_laneq_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -1016,10 +1144,12 @@ int64x2_t test_vmlsl_high_laneq_u32(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmull_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { @@ -1028,10 +1158,12 @@ int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vmull_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = 
shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { @@ -1040,10 +1172,12 @@ int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vmull_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { @@ -1052,10 +1186,12 @@ uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmull_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { @@ -1065,10 +1201,12 @@ uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmull_high_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x 
i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { @@ -1078,10 +1216,12 @@ int32x4_t test_vmull_high_lane_s16(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vmull_high_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { @@ -1091,10 +1231,12 @@ int64x2_t test_vmull_high_lane_s32(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vmull_high_lane_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { @@ -1104,10 +1246,12 @@ uint32x4_t test_vmull_high_lane_u16(uint16x8_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmull_high_lane_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] 
= bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) { @@ -1116,10 +1260,12 @@ uint64x2_t test_vmull_high_lane_u32(uint32x4_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmull_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { @@ -1128,10 +1274,12 @@ int32x4_t test_vmull_laneq_s16(int16x4_t a, int16x8_t v) { // CHECK-LABEL: @test_vmull_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) { @@ -1140,10 +1288,12 @@ int64x2_t test_vmull_laneq_s32(int32x2_t a, int32x4_t v) { // CHECK-LABEL: @test_vmull_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> 
[[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) { @@ -1152,10 +1302,12 @@ uint32x4_t test_vmull_laneq_u16(uint16x4_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmull_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) { @@ -1165,10 +1317,12 @@ uint64x2_t test_vmull_laneq_u32(uint32x2_t a, uint32x4_t v) { // CHECK-LABEL: @test_vmull_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) { @@ -1178,10 +1332,12 @@ int32x4_t test_vmull_high_laneq_s16(int16x8_t a, int16x8_t v) { // CHECK-LABEL: @test_vmull_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// 
CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) { @@ -1191,10 +1347,12 @@ int64x2_t test_vmull_high_laneq_s32(int32x4_t a, int32x4_t v) { // CHECK-LABEL: @test_vmull_high_laneq_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) { @@ -1204,10 +1362,12 @@ uint32x4_t test_vmull_high_laneq_u16(uint16x8_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmull_high_laneq_u32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) { @@ -1216,11 +1376,13 @@ uint64x2_t test_vmull_high_laneq_u32(uint32x4_t a, uint32x4_t v) { // CHECK-LABEL: @test_vqdmlal_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// 
CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -1230,11 +1392,13 @@ int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlal_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -1245,11 +1409,13 @@ int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmlal_high_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -1260,11 +1426,13 @@ int32x4_t 
test_vqdmlal_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlal_high_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -1274,11 +1442,13 @@ int64x2_t test_vqdmlal_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmlsl_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -1288,11 +1458,13 @@ int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlsl_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// 
CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -1303,11 +1475,13 @@ int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmlsl_high_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -1318,11 +1492,13 @@ int32x4_t test_vqdmlsl_high_lane_s16(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlsl_high_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> 
[[VQDMLSL_V3_I]] // @@ -1332,10 +1508,12 @@ int64x2_t test_vqdmlsl_high_lane_s32(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_lane_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -1345,10 +1523,12 @@ int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vqdmull_lane_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -1358,10 +1538,12 @@ int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -1371,10 +1553,12 @@ int32x4_t 
test_vqdmull_laneq_s16(int16x4_t a, int16x8_t v) { // CHECK-LABEL: @test_vqdmull_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -1385,10 +1569,12 @@ int64x2_t test_vqdmull_laneq_s32(int32x2_t a, int32x4_t v) { // CHECK-LABEL: @test_vqdmull_high_lane_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -1399,10 +1585,12 @@ int32x4_t test_vqdmull_high_lane_s16(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vqdmull_high_lane_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) 
#4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -1413,10 +1601,12 @@ int64x2_t test_vqdmull_high_lane_s32(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -1427,10 +1617,12 @@ int32x4_t test_vqdmull_high_laneq_s16(int16x8_t a, int16x8_t v) { // CHECK-LABEL: @test_vqdmull_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -1544,8 +1736,10 @@ int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vmul_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x float> [[MUL]] // float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t v) { @@ -1568,11 +1762,12 @@ float64x1_t test_vmul_lane_f64(float64x1_t a, float64x1_t v) { return 
vmul_lane_f64(a, v, 0); } - // CHECK-LABEL: @test_vmulq_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x float> [[MUL]] // float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) { @@ -1581,8 +1776,10 @@ float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t v) { // CHECK-LABEL: @test_vmulq_lane_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x double> [[MUL]] // float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) { @@ -1591,8 +1788,10 @@ float64x2_t test_vmulq_lane_f64(float64x2_t a, float64x1_t v) { // CHECK-LABEL: @test_vmul_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x float> [[MUL]] // float32x2_t test_vmul_laneq_f32(float32x2_t a, float32x4_t v) { @@ -1614,11 +1813,12 @@ float64x1_t test_vmul_laneq_f64(float64x1_t a, float64x2_t v) { return vmul_laneq_f64(a, v, 1); } - // CHECK-LABEL: @test_vmulq_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x float> [[MUL]] // float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) { @@ -1627,8 +1827,10 @@ float32x4_t test_vmulq_laneq_f32(float32x4_t a, float32x4_t v) { // CHECK-LABEL: @test_vmulq_laneq_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> +// CHECK-NEXT: 
[[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x double> [[MUL]] // float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) { @@ -1637,10 +1839,12 @@ float64x2_t test_vmulq_laneq_f64(float64x2_t a, float64x2_t v) { // CHECK-LABEL: @test_vmulx_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) { @@ -1649,10 +1853,12 @@ float32x2_t test_vmulx_lane_f32(float32x2_t a, float32x2_t v) { // CHECK-LABEL: @test_vmulxq_lane_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) { @@ -1661,10 +1867,12 @@ float32x4_t test_vmulxq_lane_f32(float32x4_t a, float32x2_t v) { // CHECK-LABEL: @test_vmulxq_lane_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x 
double> [[LANE]]) #4 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) { @@ -1673,10 +1881,12 @@ float64x2_t test_vmulxq_lane_f64(float64x2_t a, float64x1_t v) { // CHECK-LABEL: @test_vmulx_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) { @@ -1685,10 +1895,12 @@ float32x2_t test_vmulx_laneq_f32(float32x2_t a, float32x4_t v) { // CHECK-LABEL: @test_vmulxq_laneq_f32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) { @@ -1697,10 +1909,12 @@ float32x4_t test_vmulxq_laneq_f32(float32x4_t a, float32x4_t v) { // CHECK-LABEL: @test_vmulxq_laneq_f64( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4 // CHECK-NEXT: ret <2 x 
double> [[VMULX2_I]] // float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) { @@ -1709,8 +1923,10 @@ float64x2_t test_vmulxq_laneq_f64(float64x2_t a, float64x2_t v) { // CHECK-LABEL: @test_vmla_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -1720,8 +1936,10 @@ int16x4_t test_vmla_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlaq_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -1731,8 +1949,10 @@ int16x8_t test_vmlaq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmla_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -1742,8 +1962,10 @@ int32x2_t test_vmla_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlaq_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -1753,8 +1975,10 @@ int32x4_t test_vmlaq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmla_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x 
i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -1764,8 +1988,10 @@ int16x4_t test_vmla_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlaq_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -1775,8 +2001,10 @@ int16x8_t test_vmlaq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmla_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -1786,8 +2014,10 @@ int32x2_t test_vmla_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlaq_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -1797,8 +2027,10 @@ int32x4_t test_vmlaq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmls_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -1808,8 +2040,10 @@ int16x4_t test_vmls_lane_s16_0(int16x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlsq_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> 
[[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -1819,8 +2053,10 @@ int16x8_t test_vmlsq_lane_s16_0(int16x8_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmls_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -1830,8 +2066,10 @@ int32x2_t test_vmls_lane_s32_0(int32x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsq_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -1841,8 +2079,10 @@ int32x4_t test_vmlsq_lane_s32_0(int32x4_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmls_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -1852,8 +2092,10 @@ int16x4_t test_vmls_laneq_s16_0(int16x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsq_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> 
[[SUB]] // @@ -1863,8 +2105,10 @@ int16x8_t test_vmlsq_laneq_s16_0(int16x8_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmls_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -1874,8 +2118,10 @@ int32x2_t test_vmls_laneq_s32_0(int32x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsq_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -1885,8 +2131,10 @@ int32x4_t test_vmlsq_laneq_s32_0(int32x4_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmul_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) { @@ -1895,8 +2143,10 @@ int16x4_t test_vmul_lane_s16_0(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vmulq_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) { @@ -1905,8 +2155,10 @@ int16x8_t test_vmulq_lane_s16_0(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vmul_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x 
i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) { @@ -1915,8 +2167,10 @@ int32x2_t test_vmul_lane_s32_0(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vmulq_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) { @@ -1925,8 +2179,10 @@ int32x4_t test_vmulq_lane_s32_0(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vmul_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) { @@ -1935,8 +2191,10 @@ uint16x4_t test_vmul_lane_u16_0(uint16x4_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmulq_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) { @@ -1945,8 +2203,10 @@ uint16x8_t test_vmulq_lane_u16_0(uint16x8_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmul_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) { @@ -1955,8 +2215,10 @@ uint32x2_t test_vmul_lane_u32_0(uint32x2_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmulq_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> 
[[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) { @@ -1965,8 +2227,10 @@ uint32x4_t test_vmulq_lane_u32_0(uint32x4_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmul_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) { @@ -1975,8 +2239,10 @@ int16x4_t test_vmul_laneq_s16_0(int16x4_t a, int16x8_t v) { // CHECK-LABEL: @test_vmulq_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) { @@ -1985,8 +2251,10 @@ int16x8_t test_vmulq_laneq_s16_0(int16x8_t a, int16x8_t v) { // CHECK-LABEL: @test_vmul_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) { @@ -1995,8 +2263,10 @@ int32x2_t test_vmul_laneq_s32_0(int32x2_t a, int32x4_t v) { // CHECK-LABEL: @test_vmulq_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) { @@ -2005,8 +2275,10 @@ int32x4_t test_vmulq_laneq_s32_0(int32x4_t a, int32x4_t v) { // CHECK-LABEL: @test_vmul_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> 
[[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i16> [[MUL]] // uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) { @@ -2015,8 +2287,10 @@ uint16x4_t test_vmul_laneq_u16_0(uint16x4_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmulq_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <8 x i16> [[MUL]] // uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) { @@ -2025,8 +2299,10 @@ uint16x8_t test_vmulq_laneq_u16_0(uint16x8_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmul_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x i32> [[MUL]] // uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) { @@ -2035,8 +2311,10 @@ uint32x2_t test_vmul_laneq_u32_0(uint32x2_t a, uint32x4_t v) { // CHECK-LABEL: @test_vmulq_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x i32> [[MUL]] // uint32x4_t test_vmulq_laneq_u32_0(uint32x4_t a, uint32x4_t v) { @@ -2210,10 +2488,12 @@ float64x2_t test_vfmsq_laneq_f64_0(float64x2_t a, float64x2_t b, float64x2_t v) // CHECK-LABEL: @test_vmlal_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> 
[[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2223,10 +2503,12 @@ int32x4_t test_vmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlal_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2236,10 +2518,12 @@ int64x2_t test_vmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlal_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2249,10 +2533,12 @@ int32x4_t test_vmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlal_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = 
bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2263,10 +2549,12 @@ int64x2_t test_vmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlal_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2277,10 +2565,12 @@ int32x4_t test_vmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlal_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2291,10 +2581,12 @@ int64x2_t test_vmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlal_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x 
i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2305,10 +2597,12 @@ int32x4_t test_vmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlal_high_laneq_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2318,10 +2612,12 @@ int64x2_t test_vmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsl_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2331,10 +2627,12 @@ int32x4_t test_vmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlsl_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: 
[[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2344,10 +2642,12 @@ int64x2_t test_vmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsl_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2357,10 +2657,12 @@ int32x4_t test_vmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsl_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2371,10 +2673,12 @@ int64x2_t test_vmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsl_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> 
[[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2385,10 +2689,12 @@ int32x4_t test_vmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlsl_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2399,10 +2705,12 @@ int64x2_t test_vmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsl_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2413,10 +2721,12 @@ int32x4_t test_vmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsl_high_laneq_s32_0( // CHECK-NEXT: entry: // 
CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2426,10 +2736,12 @@ int64x2_t test_vmlsl_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlal_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2439,10 +2751,12 @@ int32x4_t test_vmlal_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlal_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2452,10 +2766,12 @@ int64x2_t test_vmlal_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) { // 
CHECK-LABEL: @test_vmlal_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2465,10 +2781,12 @@ int32x4_t test_vmlal_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlal_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2479,10 +2797,12 @@ int64x2_t test_vmlal_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlal_high_lane_u16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2493,10 +2813,12 @@ int32x4_t 
test_vmlal_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlal_high_lane_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2507,10 +2829,12 @@ int64x2_t test_vmlal_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlal_high_laneq_u16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -2521,10 +2845,12 @@ int32x4_t test_vmlal_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlal_high_laneq_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// 
CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[ADD]] // @@ -2534,10 +2860,12 @@ int64x2_t test_vmlal_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsl_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2547,10 +2875,12 @@ int32x4_t test_vmlsl_lane_u16_0(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlsl_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2560,10 +2890,12 @@ int64x2_t test_vmlsl_lane_u32_0(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsl_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to 
<8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2573,10 +2905,12 @@ int32x4_t test_vmlsl_laneq_u16_0(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsl_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2587,10 +2921,12 @@ int64x2_t test_vmlsl_laneq_u32_0(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vmlsl_high_lane_u16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2601,10 +2937,12 @@ int32x4_t test_vmlsl_high_lane_u16_0(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vmlsl_high_lane_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 
x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2615,10 +2953,12 @@ int64x2_t test_vmlsl_high_lane_u32_0(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vmlsl_high_laneq_u16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -2629,10 +2969,12 @@ int32x4_t test_vmlsl_high_laneq_u16_0(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vmlsl_high_laneq_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK-NEXT: ret <2 x i64> [[SUB]] // @@ -2642,10 +2984,12 @@ int64x2_t test_vmlsl_high_laneq_u32_0(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmull_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) 
#4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) { @@ -2654,10 +2998,12 @@ int32x4_t test_vmull_lane_s16_0(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vmull_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) { @@ -2666,10 +3012,12 @@ int64x2_t test_vmull_lane_s32_0(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vmull_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) { @@ -2678,10 +3026,12 @@ uint32x4_t test_vmull_lane_u16_0(uint16x4_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmull_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast 
<8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) { @@ -2691,10 +3041,12 @@ uint64x2_t test_vmull_lane_u32_0(uint32x2_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmull_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { @@ -2704,10 +3056,12 @@ int32x4_t test_vmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vmull_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { @@ -2717,10 +3071,12 @@ int64x2_t test_vmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vmull_high_lane_u16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 
x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) { @@ -2730,10 +3086,12 @@ uint32x4_t test_vmull_high_lane_u16_0(uint16x8_t a, uint16x4_t v) { // CHECK-LABEL: @test_vmull_high_lane_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) { @@ -2742,10 +3100,12 @@ uint64x2_t test_vmull_high_lane_u32_0(uint32x4_t a, uint32x2_t v) { // CHECK-LABEL: @test_vmull_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) { @@ -2754,10 +3114,12 @@ int32x4_t test_vmull_laneq_s16_0(int16x4_t a, int16x8_t v) { // CHECK-LABEL: @test_vmull_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> 
[[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) { @@ -2766,10 +3128,12 @@ int64x2_t test_vmull_laneq_s32_0(int32x2_t a, int32x4_t v) { // CHECK-LABEL: @test_vmull_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) { @@ -2778,10 +3142,12 @@ uint32x4_t test_vmull_laneq_u16_0(uint16x4_t a, uint16x8_t v) { // CHECK-LABEL: @test_vmull_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) { @@ -2791,10 +3157,12 @@ uint64x2_t test_vmull_laneq_u32_0(uint32x2_t a, uint32x4_t v) { // CHECK-LABEL: @test_vmull_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast 
<4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { @@ -2804,10 +3172,12 @@ int32x4_t test_vmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { // CHECK-LABEL: @test_vmull_high_laneq_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { @@ -2817,10 +3187,12 @@ int64x2_t test_vmull_high_laneq_s32_0(int32x4_t a, int32x4_t v) { // CHECK-LABEL: @test_vmull_high_laneq_u16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: ret <4 x i32> [[VMULL2_I]] // uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) { @@ -2830,10 +3202,12 @@ uint32x4_t test_vmull_high_laneq_u16_0(uint16x8_t a, uint16x8_t v) { // 
CHECK-LABEL: @test_vmull_high_laneq_u32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: ret <2 x i64> [[VMULL2_I]] // uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) { @@ -2842,11 +3216,13 @@ uint64x2_t test_vmull_high_laneq_u32_0(uint32x4_t a, uint32x4_t v) { // CHECK-LABEL: @test_vqdmlal_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -2856,11 +3232,13 @@ int32x4_t test_vqdmlal_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlal_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = 
bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -2871,11 +3249,13 @@ int64x2_t test_vqdmlal_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmlal_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -2886,11 +3266,13 @@ int32x4_t test_vqdmlal_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlal_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -2900,11 +3282,13 @@ int64x2_t test_vqdmlal_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { // 
CHECK-LABEL: @test_vqdmlsl_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -2914,11 +3298,13 @@ int32x4_t test_vqdmlsl_lane_s16_0(int32x4_t a, int16x4_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlsl_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -2929,11 +3315,13 @@ int64x2_t test_vqdmlsl_lane_s32_0(int64x2_t a, int32x2_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmlsl_high_lane_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// 
CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -2944,11 +3332,13 @@ int32x4_t test_vqdmlsl_high_lane_s16_0(int32x4_t a, int16x8_t b, int16x4_t v) { // CHECK-LABEL: @test_vqdmlsl_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -2958,10 +3348,12 @@ int64x2_t test_vqdmlsl_high_lane_s32_0(int64x2_t a, int32x4_t b, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_lane_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -2971,10 +3363,12 @@ int32x4_t test_vqdmull_lane_s16_0(int16x4_t a, int16x4_t v) { // CHECK-LABEL: @test_vqdmull_lane_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = 
shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -2984,10 +3378,12 @@ int64x2_t test_vqdmull_lane_s32_0(int32x2_t a, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -2997,10 +3393,12 @@ int32x4_t test_vqdmull_laneq_s16_0(int16x4_t a, int16x8_t v) { // CHECK-LABEL: @test_vqdmull_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -3011,10 +3409,12 @@ int64x2_t test_vqdmull_laneq_s32_0(int32x2_t a, int32x4_t v) { // CHECK-LABEL: @test_vqdmull_high_lane_s16_0( // CHECK-NEXT: entry: // 
CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -3025,10 +3425,12 @@ int32x4_t test_vqdmull_high_lane_s16_0(int16x8_t a, int16x4_t v) { // CHECK-LABEL: @test_vqdmull_high_lane_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -3039,10 +3441,12 @@ int64x2_t test_vqdmull_high_lane_s32_0(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vqdmull_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[A]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: 
[[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <4 x i32> [[VQDMULL_V2_I]] // @@ -3053,10 +3457,12 @@ int32x4_t test_vqdmull_high_laneq_s16_0(int16x8_t a, int16x8_t v) { // CHECK-LABEL: @test_vqdmull_high_laneq_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[A]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> [[VQDMULL_V2_I]] // @@ -3170,8 +3576,10 @@ int32x4_t test_vqrdmulhq_lane_s32_0(int32x4_t a, int32x2_t v) { // CHECK-LABEL: @test_vmul_lane_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x float> [[MUL]] // float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) { @@ -3180,8 +3588,10 @@ float32x2_t test_vmul_lane_f32_0(float32x2_t a, float32x2_t v) { // CHECK-LABEL: @test_vmulq_lane_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x float> [[MUL]] // float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) { @@ -3190,8 +3600,10 @@ float32x4_t test_vmulq_lane_f32_0(float32x4_t a, float32x2_t v) { // CHECK-LABEL: @test_vmul_laneq_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x float> [[MUL]] // float32x2_t test_vmul_laneq_f32_0(float32x2_t a, float32x4_t v) { @@ -3215,8 +3627,10 @@ float64x1_t test_vmul_laneq_f64_0(float64x1_t a, float64x2_t v) { // CHECK-LABEL: @test_vmulq_laneq_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <4 x float> [[MUL]] // float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) { @@ -3225,8 +3639,10 @@ float32x4_t test_vmulq_laneq_f32_0(float32x4_t a, float32x4_t v) { // CHECK-LABEL: @test_vmulq_laneq_f64_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = fmul <2 x double> [[A:%.*]], [[LANE]] // CHECK-NEXT: ret <2 x double> [[MUL]] // float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) { @@ -3235,10 +3651,12 @@ float64x2_t test_vmulq_laneq_f64_0(float64x2_t a, float64x2_t v) { // CHECK-LABEL: @test_vmulx_lane_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) { @@ -3247,10 +3665,12 @@ float32x2_t test_vmulx_lane_f32_0(float32x2_t a, float32x2_t v) { // CHECK-LABEL: @test_vmulxq_lane_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[V:%.*]], <2 x float> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = 
call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) { @@ -3259,10 +3679,12 @@ float32x4_t test_vmulxq_lane_f32_0(float32x4_t a, float32x2_t v) { // CHECK-LABEL: @test_vmulxq_lane_f64_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[V:%.*]], <1 x double> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x double> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) { @@ -3271,10 +3693,12 @@ float64x2_t test_vmulxq_lane_f64_0(float64x2_t a, float64x1_t v) { // CHECK-LABEL: @test_vmulx_laneq_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x float> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x float> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fmulx.v2f32(<2 x float> [[A]], <2 x float> [[LANE]]) #4 // CHECK-NEXT: ret <2 x float> [[VMULX2_I]] // float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) { @@ -3283,10 +3707,12 @@ float32x2_t test_vmulx_laneq_f32_0(float32x2_t a, float32x4_t v) { // CHECK-LABEL: @test_vmulxq_laneq_f32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[V:%.*]], <4 x float> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> 
[[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x float> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmulx.v4f32(<4 x float> [[A]], <4 x float> [[LANE]]) #4 // CHECK-NEXT: ret <4 x float> [[VMULX2_I]] // float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) { @@ -3295,10 +3721,12 @@ float32x4_t test_vmulxq_laneq_f32_0(float32x4_t a, float32x4_t v) { // CHECK-LABEL: @test_vmulxq_laneq_f64_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[V:%.*]], <2 x double> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x double> [[SHUFFLE]] to <16 x i8> -// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x double> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x double> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x double> [[LANE]] to <16 x i8> +// CHECK-NEXT: [[VMULX2_I:%.*]] = call <2 x double> @llvm.aarch64.neon.fmulx.v2f64(<2 x double> [[A]], <2 x double> [[LANE]]) #4 // CHECK-NEXT: ret <2 x double> [[VMULX2_I]] // float64x2_t test_vmulxq_laneq_f64_0(float64x2_t a, float64x2_t v) { @@ -4461,8 +4889,10 @@ int64x2_t test_vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) { // CHECK-LABEL: @test_vmla_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -4472,8 +4902,10 @@ uint16x4_t test_vmla_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmlaq_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -4483,8 +4915,10 
@@ uint16x8_t test_vmlaq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmla_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -4494,8 +4928,10 @@ uint32x2_t test_vmla_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmlaq_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -4505,8 +4941,10 @@ uint32x4_t test_vmlaq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmla_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -4516,8 +4954,10 @@ uint16x4_t test_vmla_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmlaq_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -4527,8 +4967,10 @@ uint16x8_t test_vmlaq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmla_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x 
i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -4538,8 +4980,10 @@ uint32x2_t test_vmla_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) { // CHECK-LABEL: @test_vmlaq_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -4549,11 +4993,13 @@ uint32x4_t test_vmlaq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { // CHECK-LABEL: @test_vqdmlal_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -4563,11 +5009,13 @@ int32x4_t test_vqdmlal_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlal_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> 
@llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -4578,11 +5026,13 @@ int64x2_t test_vqdmlal_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vqdmlal_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -4593,11 +5043,13 @@ int32x4_t test_vqdmlal_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlal_high_laneq_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -4607,8 +5059,10 @@ int64x2_t test_vqdmlal_high_laneq_s32_0(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmls_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> 
zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -4618,8 +5072,10 @@ uint16x4_t test_vmls_lane_u16_0(uint16x4_t a, uint16x4_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmlsq_lane_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -4629,8 +5085,10 @@ uint16x8_t test_vmlsq_lane_u16_0(uint16x8_t a, uint16x8_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmls_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -4640,8 +5098,10 @@ uint32x2_t test_vmls_lane_u32_0(uint32x2_t a, uint32x2_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmlsq_lane_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -4651,8 +5111,10 @@ uint32x4_t test_vmlsq_lane_u32_0(uint32x4_t a, uint32x4_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmls_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -4662,8 
+5124,10 @@ uint16x4_t test_vmls_laneq_u16_0(uint16x4_t a, uint16x4_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmlsq_laneq_u16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -4673,8 +5137,10 @@ uint16x8_t test_vmlsq_laneq_u16_0(uint16x8_t a, uint16x8_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmls_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -4684,8 +5150,10 @@ uint32x2_t test_vmls_laneq_u32_0(uint32x2_t a, uint32x2_t b, uint32x4_t v) { // CHECK-LABEL: @test_vmlsq_laneq_u32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -4695,11 +5163,13 @@ uint32x4_t test_vmlsq_laneq_u32_0(uint32x4_t a, uint32x4_t b, uint32x4_t v) { // CHECK-LABEL: @test_vqdmlsl_laneq_s16_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> 
@llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -4709,11 +5179,13 @@ int32x4_t test_vqdmlsl_laneq_s16_0(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlsl_laneq_s32_0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -4724,11 +5196,13 @@ int64x2_t test_vqdmlsl_laneq_s32_0(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -4739,11 +5213,13 @@ int32x4_t test_vqdmlsl_high_laneq_s16_0(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32_0( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> zeroinitializer -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] 
to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> zeroinitializer +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -4857,8 +5333,10 @@ int32x4_t test_vqrdmulhq_laneq_s32_0(int32x4_t a, int32x4_t v) { // CHECK-LABEL: @test_vmla_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -4868,8 +5346,10 @@ uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmlaq_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -4879,8 +5359,10 @@ uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmla_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -4890,8 +5372,10 @@ uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmlaq_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// 
CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -4901,8 +5385,10 @@ uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmla_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[ADD]] // @@ -4912,8 +5398,10 @@ uint16x4_t test_vmla_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmlaq_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[ADD]] // @@ -4923,8 +5411,10 @@ uint16x8_t test_vmlaq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmla_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[ADD]] // @@ -4934,8 +5424,10 @@ uint32x2_t test_vmla_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) { // CHECK-LABEL: @test_vmlaq_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[ADD]] // @@ -4945,11 +5437,13 @@ uint32x4_t test_vmlaq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) { // CHECK-LABEL: @test_vqdmlal_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 
x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -4959,11 +5453,13 @@ int32x4_t test_vqdmlal_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlal_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -4974,11 +5470,13 @@ int64x2_t test_vqdmlal_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vqdmlal_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x 
i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLAL_V3_I]] // @@ -4989,11 +5487,13 @@ int32x4_t test_vqdmlal_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlal_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqadd.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLAL_V3_I]] // @@ -5003,8 +5503,10 @@ int64x2_t test_vqdmlal_high_laneq_s32(int64x2_t a, int32x4_t b, int32x4_t v) { // CHECK-LABEL: @test_vmls_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -5014,8 +5516,10 @@ uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmlsq_lane_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[V:%.*]], <4 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i16> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -5025,8 +5529,10 @@ uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v) { // CHECK-LABEL: @test_vmls_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> 
[[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -5036,8 +5542,10 @@ uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmlsq_lane_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[V:%.*]], <2 x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i32> [[V:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -5047,8 +5555,10 @@ uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v) { // CHECK-LABEL: @test_vmls_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i16> [[SUB]] // @@ -5058,8 +5568,10 @@ uint16x4_t test_vmls_laneq_u16(uint16x4_t a, uint16x4_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmlsq_laneq_u16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <8 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <8 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <8 x i16> [[SUB]] // @@ -5069,8 +5581,10 @@ uint16x8_t test_vmlsq_laneq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t v) { // CHECK-LABEL: @test_vmls_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <2 x i32> [[SUB]] // @@ -5080,8 +5594,10 @@ uint32x2_t test_vmls_laneq_u32(uint32x2_t a, uint32x2_t b, uint32x4_t v) { // CHECK-LABEL: @test_vmlsq_laneq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 
x i32> [[V]], <4 x i32> -// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[SHUFFLE]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] // CHECK-NEXT: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK-NEXT: ret <4 x i32> [[SUB]] // @@ -5091,11 +5607,13 @@ uint32x4_t test_vmlsq_laneq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t v) { // CHECK-LABEL: @test_vqdmlsl_laneq_s16( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> [[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -5105,11 +5623,13 @@ int32x4_t test_vqdmlsl_laneq_s16(int32x4_t a, int16x4_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlsl_laneq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // @@ -5120,11 +5640,13 @@ int64x2_t test_vqdmlsl_laneq_s32(int64x2_t a, int32x2_t b, int32x4_t v) { // CHECK-LABEL: @test_vqdmlsl_high_laneq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> [[B]], <4 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> 
[[V]], <4 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x i16> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP1]], <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i16> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqdmull.v4i32(<4 x i16> [[SHUFFLE_I]], <4 x i16> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <4 x i32> [[VQDMLSL_V3_I]] // @@ -5135,11 +5657,13 @@ int32x4_t test_vqdmlsl_high_laneq_s16(int32x4_t a, int16x8_t b, int16x8_t v) { // CHECK-LABEL: @test_vqdmlsl_high_laneq_s32( // CHECK-NEXT: entry: // CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> [[B]], <2 x i32> -// CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> [[V]], <2 x i32> -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> -// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[SHUFFLE]]) #4 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[V:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32> +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP1]], <2 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i32> [[SHUFFLE_I]] to <8 x i8> +// CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK-NEXT: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqdmull.v2i64(<2 x i32> [[SHUFFLE_I]], <2 x i32> [[LANE]]) #4 // CHECK-NEXT: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.sqsub.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #4 // CHECK-NEXT: ret <2 x i64> [[VQDMLSL_V3_I]] // diff --git a/clang/test/CodeGen/aarch64-neon-fma.c b/clang/test/CodeGen/aarch64-neon-fma.c index ae02bfbffb985..c2dd315ed9fc4 100644 --- a/clang/test/CodeGen/aarch64-neon-fma.c +++ b/clang/test/CodeGen/aarch64-neon-fma.c @@ -69,144 +69,177 @@ float64x2_t test_vmlsq_n_f64(float64x2_t a, float64x2_t b, float64_t c) { } // CHECK-LABEL: define <2 x float> @test_vmla_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// 
CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { return vmla_lane_f32(a, b, v, 0); } // CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { return vmlaq_lane_f32(a, b, v, 0); } // CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { return vmla_laneq_f32(a, b, v, 0); } // CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlaq_laneq_f32(a, b, v, 0); } // CHECK-LABEL: define <2 x float> @test_vmls_lane_f32_0(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x 
float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_lane_f32_0(float32x2_t a, float32x2_t b, float32x2_t v) { return vmls_lane_f32(a, b, v, 0); } // CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32_0(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_lane_f32_0(float32x4_t a, float32x4_t b, float32x2_t v) { return vmlsq_lane_f32(a, b, v, 0); } // CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32_0(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_laneq_f32_0(float32x2_t a, float32x2_t b, float32x4_t v) { return vmls_laneq_f32(a, b, v, 0); } // CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32_0(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> zeroinitializer -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> zeroinitializer +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_laneq_f32_0(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlsq_laneq_f32(a, b, v, 0); } // CHECK-LABEL: define <2 x float> @test_vmla_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x 
float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { return vmla_lane_f32(a, b, v, 1); } // CHECK-LABEL: define <4 x float> @test_vmlaq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { return vmlaq_lane_f32(a, b, v, 1); } // CHECK-LABEL: define <2 x float> @test_vmla_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { return vmla_laneq_f32(a, b, v, 3); } // CHECK-LABEL: define <4 x float> @test_vmlaq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[ADD]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlaq_laneq_f32(a, b, v, 3); } // CHECK-LABEL: define <2 x float> @test_vmls_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v) { return vmls_lane_f32(a, b, v, 1); } 
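Aside (not part of the patch): every hunk above follows the same shape — the lane operand `%v` is now bitcast to a byte vector and back before the `shufflevector` splat named `[[LANE]]`, and the multiply/accumulate then uses `[[LANE]]` instead of the old `[[SHUFFLE]]`. As a hedged reference point, this is the kind of source the updated tests compile; the function name `scale_accumulate` is illustrative only and does not appear in the test files.

```c
// Illustrative usage sketch of the lane intrinsics exercised above.
// Build (assumption, not from the patch): clang --target=aarch64-linux-gnu -O1 -S example.c
#include <arm_neon.h>

// a + b * v[1], then subtract b * w[3]; these map to the vmla_lane_f32 and
// vmls_laneq_f32 tests whose CHECK lines are rewritten in this diff.
float32x2_t scale_accumulate(float32x2_t a, float32x2_t b,
                             float32x2_t v, float32x4_t w) {
  float32x2_t acc = vmla_lane_f32(a, b, v, 1);  // multiply-accumulate by lane 1 of v
  return vmls_laneq_f32(acc, b, w, 3);          // multiply-subtract by lane 3 of w
}
```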
// CHECK-LABEL: define <4 x float> @test_vmlsq_lane_f32(<4 x float> %a, <4 x float> %b, <2 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %v, <2 x float> %v, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[V:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[SUB]] +// float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v) { return vmlsq_lane_f32(a, b, v, 1); } // CHECK-LABEL: define <2 x float> @test_vmls_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] -// CHECK: ret <2 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_laneq_f32(float32x2_t a, float32x2_t b, float32x4_t v) { return vmls_laneq_f32(a, b, v, 3); } // CHECK-LABEL: define <4 x float> @test_vmlsq_laneq_f32(<4 x float> %a, <4 x float> %b, <4 x float> %v) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x float> %v, <4 x float> %v, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] -// CHECK: ret <4 x float> [[SUB]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x float> [[V:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x float> +// CHECK: [[LANE:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] +// CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t v) { return vmlsq_laneq_f32(a, b, v, 3); } diff --git a/clang/test/CodeGen/aarch64-poly64.c b/clang/test/CodeGen/aarch64-poly64.c index 8c4ef23bb7e14..b7fb1db9b0ff3 100644 --- a/clang/test/CodeGen/aarch64-poly64.c +++ b/clang/test/CodeGen/aarch64-poly64.c @@ -150,22 +150,28 @@ poly64x2_t test_vmovq_n_p64(poly64_t a) { } // CHECK-LABEL: define <1 x i64> @test_vdup_lane_p64(<1 x i64> %vec) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[LANE]] poly64x1_t test_vdup_lane_p64(poly64x1_t vec) { return vdup_lane_p64(vec, 0); } // CHECK-LABEL: define <2 x i64> @test_vdupq_lane_p64(<1 x i64> %vec) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %vec, <1 x i64> %vec, <2 x i32> 
zeroinitializer -// CHECK: ret <2 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[VEC:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[LANE]] poly64x2_t test_vdupq_lane_p64(poly64x1_t vec) { return vdupq_lane_p64(vec, 0); } // CHECK-LABEL: define <2 x i64> @test_vdupq_laneq_p64(<2 x i64> %vec) #1 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i64> %vec, <2 x i64> %vec, <2 x i32> -// CHECK: ret <2 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> [[VEC:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <2 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> +// CHECK: ret <2 x i64> [[LANE]] poly64x2_t test_vdupq_laneq_p64(poly64x2_t vec) { return vdupq_laneq_p64(vec, 1); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1_shortform.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1_shortform.c new file mode 100644 index 0000000000000..90258f00de43d --- /dev/null +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1_shortform.c @@ -0,0 +1,83 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - %s -D__ARM_FEATURE_SVE | FileCheck %s + +#include +// +// ld1 +// + +svint8_t test_svld1_s8(svbool_t pg, const int8_t *base) +{ + // CHECK-LABEL: test_svld1_s8 + // CHECK: @llvm.masked.load.nxv16i8.p0nxv16i8(* %{{.*}}, i32 1, %{{.*}}, zeroinitializer) + return svld1(pg, base); +} + +svint16_t test_svld1_s16(svbool_t pg, const int16_t *base) +{ + // CHECK-LABEL: test_svld1_s16 + // CHECK: @llvm.masked.load.nxv8i16.p0nxv8i16(* %{{.*}}, i32 1, %{{.*}}, zeroinitializer) + return svld1(pg, base); +} + +svint32_t test_svld1_s32(svbool_t pg, const int32_t *base) +{ + // CHECK-LABEL: test_svld1_s32 + // CHECK: @llvm.masked.load.nxv4i32.p0nxv4i32(* %{{.*}}, i32 1, %{{.*}}, zeroinitializer) + return svld1(pg, base); +} + +svint64_t test_svld1_s64(svbool_t pg, const int64_t *base) +{ + // CHECK-LABEL: test_svld1_s64 + // CHECK: @llvm.masked.load.nxv2i64.p0nxv2i64(* %{{.*}}, i32 1, %{{.*}}, zeroinitializer) + return svld1(pg, base); +} + +svuint8_t test_svld1_u8(svbool_t pg, const uint8_t *base) +{ + // CHECK-LABEL: test_svld1_u8 + // CHECK: @llvm.masked.load.nxv16i8.p0nxv16i8(* %{{.*}}, i32 1, %{{.*}}, zeroinitializer) + return svld1(pg, base); +} + +svuint16_t test_svld1_u16(svbool_t pg, const uint16_t *base) +{ + // CHECK-LABEL: test_svld1_u16 + // CHECK: @llvm.masked.load.nxv8i16.p0nxv8i16(* %{{.*}}, i32 1, %{{.*}}, zeroinitializer) + return svld1(pg, base); +} + +svuint32_t test_svld1_u32(svbool_t pg, const uint32_t *base) +{ + // CHECK-LABEL: test_svld1_u32 + // CHECK: @llvm.masked.load.nxv4i32.p0nxv4i32(* %{{.*}}, i32 1, %{{.*}}, zeroinitializer) + return svld1(pg, base); +} + +svuint64_t test_svld1_u64(svbool_t pg, const uint64_t *base) +{ + // CHECK-LABEL: test_svld1_u64 + // CHECK: @llvm.masked.load.nxv2i64.p0nxv2i64(* %{{.*}}, i32 1, %{{.*}}, zeroinitializer) + return svld1(pg, base); +} + +svfloat16_t test_svld1_f16(svbool_t pg, const float16_t *base) +{ + // CHECK-LABEL: test_svld1_f16 + // CHECK: @llvm.masked.load.nxv8f16.p0nxv8f16(* %{{.*}}, i32 1, %{{.*}}, zeroinitializer) + return svld1(pg, base); +} + +svfloat32_t test_svld1_f32(svbool_t pg, const float32_t *base) +{ + // CHECK-LABEL: test_svld1_f32 + // CHECK: 
@llvm.masked.load.nxv4f32.p0nxv4f32(* %{{.*}}, i32 1, %{{.*}}, zeroinitializer) + return svld1(pg, base); +} + +svfloat64_t test_svld1_f64(svbool_t pg, const float64_t *base) +{ + // CHECK-LABEL: test_svld1_f64 + // CHECK: @llvm.masked.load.nxv2f64.p0nxv2f64(* %{{.*}}, i32 1, %{{.*}}, zeroinitializer) + return svld1(pg, base); +} diff --git a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c index fc339faa6cdbc..59b3dfec80cb9 100644 --- a/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c +++ b/clang/test/CodeGen/aarch64-v8.2a-neon-intrinsics.c @@ -1086,32 +1086,40 @@ float16_t test_vfmsh_laneq_f16(float16_t a, float16_t b, float16x8_t c) { } // CHECK-LABEL: test_vmul_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]] // CHECK: ret <4 x half> [[MUL]] float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) { return vmul_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmulq_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]] // CHECK: ret <8 x half> [[MUL]] float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) { - return vmulq_lane_f16(a, b, 7); + return vmulq_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmul_laneq_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]] // CHECK: ret <4 x half> [[MUL]] float16x4_t test_vmul_laneq_f16(float16x4_t a, float16x8_t b) { return vmul_laneq_f16(a, b, 7); } // CHECK-LABEL: test_vmulq_laneq_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]] // CHECK: ret <8 x half> [[MUL]] float16x8_t test_vmulq_laneq_f16(float16x8_t a, float16x8_t b) { return vmulq_laneq_f16(a, b, 7); @@ -1165,33 +1173,49 @@ float16_t test_vmulh_laneq_f16(float16_t a, float16x8_t b) { } // CHECK-LABEL: test_vmulx_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> [[TMP0]]) -// CHECK: ret <4 x half> [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] 
to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8> +// CHECK: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #4 +// CHECK: ret <4 x half> [[VMULX2_I]] float16x4_t test_vmulx_lane_f16(float16x4_t a, float16x4_t b) { return vmulx_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmulxq_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> [[TMP0]]) -// CHECK: ret <8 x half> [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <8 x half> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8> +// CHECK: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #4 +// CHECK: ret <8 x half> [[VMULX2_I]] float16x8_t test_vmulxq_lane_f16(float16x8_t a, float16x4_t b) { - return vmulxq_lane_f16(a, b, 7); + return vmulxq_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmulx_laneq_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> %a, <4 x half> [[TMP0]]) -// CHECK: ret <4 x half> [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x half> [[LANE]] to <8 x i8> +// CHECK: [[VMULX2_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fmulx.v4f16(<4 x half> [[A]], <4 x half> [[LANE]]) #4 +// CHECK: ret <4 x half> [[VMULX2_I]] float16x4_t test_vmulx_laneq_f16(float16x4_t a, float16x8_t b) { return vmulx_laneq_f16(a, b, 7); } // CHECK-LABEL: test_vmulxq_laneq_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <8 x half> %b, <8 x half> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> %a, <8 x half> [[TMP0]]) -// CHECK: ret <8 x half> [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x half> [[B:%.*]] to <16 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half> +// CHECK: [[LANE:%.*]] = shufflevector <8 x half> [[TMP1]], <8 x half> [[TMP1]], <8 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <8 x half> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x half> [[LANE]] to <16 x i8> +// CHECK: [[VMULX2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmulx.v8f16(<8 x half> [[A]], <8 x half> [[LANE]]) #4 +// CHECK: ret <8 x half> [[VMULX2_I]] float16x8_t test_vmulxq_laneq_f16(float16x8_t a, float16x8_t b) { return vmulxq_laneq_f16(a, b, 7); } @@ -1473,17 +1497,21 @@ float16x8_t test_vdupq_n_f16(float16_t a) { } // CHECK-LABEL: test_vdup_lane_f16 -// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <4 x i32> -// CHECK: ret <4 x half> [[SHFL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = 
shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> +// CHECK: ret <4 x half> [[LANE]] float16x4_t test_vdup_lane_f16(float16x4_t a) { return vdup_lane_f16(a, 3); } // CHECK-LABEL: test_vdupq_lane_f16 -// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <8 x i32> -// CHECK: ret <8 x half> [[SHFL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> +// CHECK: ret <8 x half> [[LANE]] float16x8_t test_vdupq_lane_f16(float16x4_t a) { - return vdupq_lane_f16(a, 7); + return vdupq_lane_f16(a, 3); } // CHECK-LABEL: @test_vext_f16( diff --git a/clang/test/CodeGen/aarch64-varargs.c b/clang/test/CodeGen/aarch64-varargs.c index c213f5b9375ba..27bb602e75de1 100644 --- a/clang/test/CodeGen/aarch64-varargs.c +++ b/clang/test/CodeGen/aarch64-varargs.c @@ -639,7 +639,7 @@ typedef struct __attribute__((aligned(32))) { __int128 val; } overaligned_int128_struct; overaligned_int128_struct overaligned_int128_struct_test() { -// CHECK-LABEL: define void @overaligned_int128_struct_test(%struct.overaligned_int128_struct* noalias sret %agg.result) +// CHECK-LABEL: define void @overaligned_int128_struct_test(%struct.overaligned_int128_struct* noalias sret align 32 %agg.result) return va_arg(the_list, overaligned_int128_struct); // CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 @@ -853,7 +853,7 @@ typedef struct { __int128 val __attribute__((aligned(32))); } overaligned_int128_struct_member; overaligned_int128_struct_member overaligned_int128_struct_member_test() { -// CHECK-LABEL: define void @overaligned_int128_struct_member_test(%struct.overaligned_int128_struct_member* noalias sret %agg.result) +// CHECK-LABEL: define void @overaligned_int128_struct_member_test(%struct.overaligned_int128_struct_member* noalias sret align 32 %agg.result) return va_arg(the_list, overaligned_int128_struct_member); // CHECK: [[GR_OFFS:%[a-z_0-9]+]] = load i32, i32* getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 3) // CHECK: [[EARLY_ONSTACK:%[a-z_0-9]+]] = icmp sge i32 [[GR_OFFS]], 0 diff --git a/clang/test/CodeGen/aggregate-assign-call.c b/clang/test/CodeGen/aggregate-assign-call.c index d00cb90c09407..9616e6d22562f 100644 --- a/clang/test/CodeGen/aggregate-assign-call.c +++ b/clang/test/CodeGen/aggregate-assign-call.c @@ -62,8 +62,8 @@ struct S baz(int i, volatile int *j) { // O1-NEWPM: %[[TMP3:.*]] = bitcast %struct.S* %[[TMP2_ALLOCA]] to i8* // O1-NEWPM: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* nonnull %[[P]]) // - // O1-LEGACY: call void @foo_int(%struct.S* sret %[[TMP1_ALLOCA]], - // O1-NEWPM: call void @foo_int(%struct.S* nonnull sret %[[TMP1_ALLOCA]], + // O1-LEGACY: call void @foo_int(%struct.S* sret align 4 %[[TMP1_ALLOCA]], + // O1-NEWPM: call void @foo_int(%struct.S* nonnull sret align 4 %[[TMP1_ALLOCA]], // O1: call void @llvm.memcpy // O1-LEGACY: %[[P:[^ ]+]] = bitcast %struct.S* %[[TMP1_ALLOCA]] to i8* // O1-LEGACY: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* %[[P]]) @@ -71,8 +71,8 @@ struct S baz(int i, volatile int *j) { // O1-LEGACY: %[[P:[^ ]+]] = bitcast %struct.S* %[[TMP2_ALLOCA]] to i8* // O1-LEGACY: call void @llvm.lifetime.start.p0i8({{[^,]*}}, i8* %[[P]]) // O1-NEWPM: call void 
@llvm.lifetime.start.p0i8({{[^,]*}}, i8* nonnull %[[TMP3]]) - // O1-LEGACY: call void @foo_int(%struct.S* sret %[[TMP2_ALLOCA]], - // O1-NEWPM: call void @foo_int(%struct.S* nonnull sret %[[TMP2_ALLOCA]], + // O1-LEGACY: call void @foo_int(%struct.S* sret align 4 %[[TMP2_ALLOCA]], + // O1-NEWPM: call void @foo_int(%struct.S* nonnull sret align 4 %[[TMP2_ALLOCA]], // O1: call void @llvm.memcpy // O1-LEGACY: %[[P:[^ ]+]] = bitcast %struct.S* %[[TMP2_ALLOCA]] to i8* // O1-LEGACY: call void @llvm.lifetime.end.p0i8({{[^,]*}}, i8* %[[P]]) diff --git a/clang/test/CodeGen/aligned-sret.c b/clang/test/CodeGen/aligned-sret.c new file mode 100644 index 0000000000000..c459fe730163c --- /dev/null +++ b/clang/test/CodeGen/aligned-sret.c @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -triple x86_64-apple-macos %s -S -emit-llvm -o- | FileCheck %s + +typedef __attribute__((__ext_vector_type__(4),__aligned__(16))) double simd_double4; +typedef struct { simd_double4 columns[4]; } simd_double4x4; +typedef simd_double4x4 matrix_double4x4; + +// CHECK: define void @ident(%struct.simd_double4x4* noalias sret align 16 %agg.result +matrix_double4x4 ident(matrix_double4x4 x) { + return x; +} diff --git a/clang/test/CodeGen/arc/arguments.c b/clang/test/CodeGen/arc/arguments.c index fdc6037da730a..9c9b553b1be45 100644 --- a/clang/test/CodeGen/arc/arguments.c +++ b/clang/test/CodeGen/arc/arguments.c @@ -22,7 +22,7 @@ void cf1(cs1 i) {} typedef struct { int cc; } s2; -// CHECK: define void @f2(%struct.s2* noalias sret %agg.result) +// CHECK: define void @f2(%struct.s2* noalias sret align 4 %agg.result) s2 f2() { s2 foo; return foo; @@ -32,7 +32,7 @@ typedef struct { int cc; int dd; } s3; -// CHECK: define void @f3(%struct.s3* noalias sret %agg.result) +// CHECK: define void @f3(%struct.s3* noalias sret align 4 %agg.result) s3 f3() { s3 foo; return foo; @@ -128,8 +128,8 @@ void st3(s16 a, s16 b, s16 c) {} // 1 sret + 1 i32 + 2*(i32 coerce) + 4*(i32 coerce) + 1 byval s16 st4(int x, s8 a, s16 b, s16 c) { return b; } -// CHECK: define void @st4(%struct.s16* noalias sret %agg.result, i32 inreg %x, i32 inreg %a.coerce0, i32 inreg %a.coerce1, i32 inreg %b.coerce0, i32 inreg %b.coerce1, i32 inreg %b.coerce2, i32 inreg %b.coerce3, { i32, i32, i32, i32 } %c.coerce) +// CHECK: define void @st4(%struct.s16* noalias sret align 4 %agg.result, i32 inreg %x, i32 inreg %a.coerce0, i32 inreg %a.coerce1, i32 inreg %b.coerce0, i32 inreg %b.coerce1, i32 inreg %b.coerce2, i32 inreg %b.coerce3, { i32, i32, i32, i32 } %c.coerce) // 1 sret + 2*(i32 coerce) + 4*(i32 coerce) + 4*(i32 coerce) s16 st5(s8 a, s16 b, s16 c) { return b; } -// CHECK: define void @st5(%struct.s16* noalias sret %agg.result, i32 inreg %a.coerce0, i32 inreg %a.coerce1, i32 inreg %b.coerce0, i32 inreg %b.coerce1, i32 inreg %b.coerce2, i32 inreg %b.coerce3, { i32, i32, i32, i32 } %c.coerce) +// CHECK: define void @st5(%struct.s16* noalias sret align 4 %agg.result, i32 inreg %a.coerce0, i32 inreg %a.coerce1, i32 inreg %b.coerce0, i32 inreg %b.coerce1, i32 inreg %b.coerce2, i32 inreg %b.coerce3, { i32, i32, i32, i32 } %c.coerce) diff --git a/clang/test/CodeGen/arm-aapcs-vfp.c b/clang/test/CodeGen/arm-aapcs-vfp.c index 69581fcab2479..486ed6ab94fd4 100644 --- a/clang/test/CodeGen/arm-aapcs-vfp.c +++ b/clang/test/CodeGen/arm-aapcs-vfp.c @@ -125,7 +125,7 @@ void test_vfp_stack_gpr_split_1(double a, double b, double c, double d, double e // CHECK: define arm_aapcs_vfpcc void @test_vfp_stack_gpr_split_2(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, 
double %i, i32 %j, [2 x i64] %k.coerce) void test_vfp_stack_gpr_split_2(double a, double b, double c, double d, double e, double f, double g, double h, double i, int j, struct_long_long_int k) {} -// CHECK: define arm_aapcs_vfpcc void @test_vfp_stack_gpr_split_3(%struct.struct_long_long_int* noalias sret %agg.result, double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i, [2 x i64] %k.coerce) +// CHECK: define arm_aapcs_vfpcc void @test_vfp_stack_gpr_split_3(%struct.struct_long_long_int* noalias sret align 8 %agg.result, double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i, [2 x i64] %k.coerce) struct_long_long_int test_vfp_stack_gpr_split_3(double a, double b, double c, double d, double e, double f, double g, double h, double i, struct_long_long_int k) {} typedef struct { int a; int b:4; int c; } struct_int_bitfield_int; diff --git a/clang/test/CodeGen/arm-cde-gpr.c b/clang/test/CodeGen/arm-cde-gpr.c index 9a24b1540b67b..1e6893d7d2f83 100644 --- a/clang/test/CodeGen/arm-cde-gpr.c +++ b/clang/test/CodeGen/arm-cde-gpr.c @@ -11,6 +11,150 @@ // CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx1(i32 0, i32 123) // CHECK-NEXT: ret i32 [[TMP0]] // -uint32_t test_cx1() { +uint32_t test_cx1(void) { return __arm_cx1(0, 123); } + +// CHECK-LABEL: @test_cx1a( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx1a(i32 0, i32 [[ACC:%.*]], i32 345) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_cx1a(uint32_t acc) { + return __arm_cx1a(0, acc, 345); +} + +// CHECK-LABEL: @test_cx1d( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.cde.cx1d(i32 1, i32 567) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +uint64_t test_cx1d(void) { + return __arm_cx1d(1, 567); +} + +// CHECK-LABEL: @test_cx1da( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[ACC:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ACC]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.cde.cx1da(i32 0, i32 [[TMP2]], i32 [[TMP1]], i32 789) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +uint64_t test_cx1da(uint64_t acc) { + return __arm_cx1da(0, acc, 789); +} + +// CHECK-LABEL: @test_cx2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx2(i32 0, i32 [[N:%.*]], i32 11) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_cx2(uint32_t n) { + return __arm_cx2(0, n, 11); +} + +// CHECK-LABEL: @test_cx2a( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx2a(i32 1, i32 [[ACC:%.*]], i32 [[N:%.*]], i32 22) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_cx2a(uint32_t acc, uint32_t n) { + return __arm_cx2a(1, acc, n, 22); +} + +// CHECK-LABEL: @test_cx2d( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.cde.cx2d(i32 1, i32 [[N:%.*]], i32 33) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +uint64_t test_cx2d(uint32_t n) { + return __arm_cx2d(1, n, 33); +} + +// CHECK-LABEL: @test_cx2da( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[ACC:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ACC]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.cde.cx2da(i32 0, i32 [[TMP2]], i32 [[TMP1]], i32 [[N:%.*]], i32 44) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +uint64_t test_cx2da(uint64_t acc, uint32_t n) { + return __arm_cx2da(0, acc, n, 44); +} + +// CHECK-LABEL: @test_cx3( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx3(i32 0, i32 [[N:%.*]], i32 [[M:%.*]], i32 1) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_cx3(uint32_t n, uint32_t m) { + return __arm_cx3(0, n, m, 1); +} + +// CHECK-LABEL: @test_cx3a( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.cde.cx3a(i32 1, i32 [[ACC:%.*]], i32 [[N:%.*]], i32 [[M:%.*]], i32 2) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_cx3a(uint32_t acc, uint32_t n, uint32_t m) { + return __arm_cx3a(1, acc, n, m, 2); +} + +// CHECK-LABEL: @test_cx3d( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { i32, i32 } @llvm.arm.cde.cx3d(i32 1, i32 [[N:%.*]], i32 [[M:%.*]], i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { i32, i32 } [[TMP0]], 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +// CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 32 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP0]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]] +// CHECK-NEXT: ret i64 [[TMP6]] +// +uint64_t test_cx3d(uint32_t n, uint32_t m) { + return __arm_cx3d(1, n, m, 3); +} + +// CHECK-LABEL: @test_cx3da( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[ACC:%.*]], 32 +// CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[ACC]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call { i32, i32 } @llvm.arm.cde.cx3da(i32 0, i32 [[TMP2]], i32 [[TMP1]], i32 [[N:%.*]], i32 [[M:%.*]], i32 4) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { i32, i32 } [[TMP3]], 1 +// CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64 +// CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 32 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i32, i32 } [[TMP3]], 0 +// CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP7]] to i64 +// CHECK-NEXT: [[TMP9:%.*]] = or i64 [[TMP6]], [[TMP8]] +// CHECK-NEXT: ret i64 [[TMP9]] +// +uint64_t test_cx3da(uint64_t acc, uint32_t n, uint32_t m) { + return __arm_cx3da(0, acc, n, m, 4); +} diff --git 
a/clang/test/CodeGen/arm-cde-reinterpret.c b/clang/test/CodeGen/arm-cde-reinterpret.c new file mode 100644 index 0000000000000..569b51bdfdbe8 --- /dev/null +++ b/clang/test/CodeGen/arm-cde-reinterpret.c @@ -0,0 +1,78 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi \ +// RUN: -target-feature +cdecp0 -target-feature +mve.fp \ +// RUN: -mfloat-abi hard -O0 -disable-O0-optnone \ +// RUN: -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret <16 x i8> [[X:%.*]] +// +int8x16_t test_s8(uint8x16_t x) { + return __arm_vreinterpretq_s8_u8(x); +} + +// CHECK-LABEL: @test_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X:%.*]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +uint16x8_t test_u16(uint8x16_t x) { + return __arm_vreinterpretq_u16_u8(x); +} + +// CHECK-LABEL: @test_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X:%.*]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_s32(uint8x16_t x) { + return __arm_vreinterpretq_s32_u8(x); +} + +// CHECK-LABEL: @test_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X:%.*]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_u32(uint8x16_t x) { + return __arm_vreinterpretq_u32_u8(x); +} + +// CHECK-LABEL: @test_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X:%.*]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// +int64x2_t test_s64(uint8x16_t x) { + return __arm_vreinterpretq_s64_u8(x); +} + +// CHECK-LABEL: @test_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X:%.*]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// +uint64x2_t test_u64(uint8x16_t x) { + return __arm_vreinterpretq_u64_u8(x); +} + +// CHECK-LABEL: @test_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X:%.*]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// +float16x8_t test_f16(uint8x16_t x) { + return __arm_vreinterpretq_f16_u8(x); +} + +// CHECK-LABEL: @test_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X:%.*]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t test_f32(uint8x16_t x) { + return __arm_vreinterpretq_f32_u8(x); +} diff --git a/clang/test/CodeGen/arm-cde-vec.c b/clang/test/CodeGen/arm-cde-vec.c new file mode 100644 index 0000000000000..fcf9270d87f06 --- /dev/null +++ b/clang/test/CodeGen/arm-cde-vec.c @@ -0,0 +1,104 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi \ +// RUN: -target-feature +cdecp0 -target-feature +cdecp1 \ +// RUN: -target-feature +mve.fp \ +// RUN: -mfloat-abi hard -O0 -disable-O0-optnone \ +// RUN: -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vcx1q_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1q(i32 0, i32 1111) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vcx1q_u8(void) { + return __arm_vcx1q_u8(0, 1111); +} + +// CHECK-LABEL: @test_vcx1qa_1( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 1, <16 x i8> [[ACC:%.*]], i32 1112) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vcx1qa_1(uint8x16_t acc) { + return 
__arm_vcx1qa(1, acc, 1112); +} + +// CHECK-LABEL: @test_vcx1qa_2( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[ACC:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 0, <16 x i8> [[TMP0]], i32 1113) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vcx1qa_2(int32x4_t acc) { + return __arm_vcx1qa(0, acc, 1113); +} + +// CHECK-LABEL: @test_vcx2q_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> [[TMP0]], i32 111) +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +uint8x16_t test_vcx2q_u8(float16x8_t n) { + return __arm_vcx2q_u8(1, n, 111); +} + +// CHECK-LABEL: @test_vcx2q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> [[TMP0]], i32 112) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vcx2q(float32x4_t n) { + return __arm_vcx2q(1, n, 112); +} + +// CHECK-LABEL: @test_vcx2qa( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[ACC:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i64> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx2qa(i32 0, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 113) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// +float32x4_t test_vcx2qa(float32x4_t acc, int64x2_t n) { + return __arm_vcx2qa(0, acc, n, 113); +} + +// CHECK-LABEL: @test_vcx3q_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[M:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3q(i32 0, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 11) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vcx3q_u8(uint16x8_t n, int32x4_t m) { + return __arm_vcx3q_u8(0, n, m, 11); +} + +// CHECK-LABEL: @test_vcx3q( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x i64> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[M:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3q(i32 1, <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 12) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64> +// CHECK-NEXT: ret <2 x i64> [[TMP3]] +// +uint64x2_t test_vcx3q(uint64x2_t n, float32x4_t m) { + return __arm_vcx3q(1, n, m, 12); +} + +// CHECK-LABEL: @test_vcx3qa( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i16> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[M:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> [[ACC:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 13) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vcx3qa(int8x16_t acc, uint16x8_t n, float32x4_t m) { + return __arm_vcx3qa(1, acc, n, m, 13); +} diff --git a/clang/test/CodeGen/arm-cde-vfp.c b/clang/test/CodeGen/arm-cde-vfp.c new file mode 100644 index 0000000000000..fffcb716359d1 --- /dev/null +++ b/clang/test/CodeGen/arm-cde-vfp.c @@ -0,0 +1,145 @@ +// NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi \ +// RUN: -target-feature +cdecp0 -target-feature +cdecp1 \ +// RUN: -mfloat-abi hard -O0 -disable-O0-optnone \ +// RUN: -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vcx1_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.arm.cde.vcx1.f32(i32 0, i32 11) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast float [[TMP0]] to i32 +// CHECK-NEXT: ret i32 [[TMP1]] +// +uint32_t test_vcx1_u32(void) { + return __arm_vcx1_u32(0, 11); +} + +// CHECK-LABEL: @test_vcx1a_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[ACC:%.*]] to float +// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.arm.cde.vcx1a.f32(i32 1, float [[TMP0]], i32 12) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vcx1a_u32(uint32_t acc) { + return __arm_vcx1a_u32(1, acc, 12); +} + +// CHECK-LABEL: @test_vcx2_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[N:%.*]] to float +// CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.arm.cde.vcx2.f32(i32 0, float [[TMP0]], i32 21) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast float [[TMP1]] to i32 +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vcx2_u32(uint32_t n) { + return __arm_vcx2_u32(0, n, 21); +} + +// CHECK-LABEL: @test_vcx2a_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[ACC:%.*]] to float +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[N:%.*]] to float +// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.arm.cde.vcx2a.f32(i32 0, float [[TMP0]], float [[TMP1]], i32 22) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +// CHECK-NEXT: ret i32 [[TMP3]] +// +uint32_t test_vcx2a_u32(uint32_t acc, uint32_t n) { + return __arm_vcx2a_u32(0, acc, n, 22); +} + +// CHECK-LABEL: @test_vcx3_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[N:%.*]] to float +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[M:%.*]] to float +// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.arm.cde.vcx3.f32(i32 1, float [[TMP0]], float [[TMP1]], i32 3) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +// CHECK-NEXT: ret i32 [[TMP3]] +// +uint32_t test_vcx3_u32(uint32_t n, uint32_t m) { + return __arm_vcx3_u32(1, n, m, 3); +} + +// CHECK-LABEL: @test_vcx3a_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32 [[ACC:%.*]] to float +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[N:%.*]] to float +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i32 [[M:%.*]] to float +// CHECK-NEXT: [[TMP3:%.*]] = call float @llvm.arm.cde.vcx3a.f32(i32 0, float [[TMP0]], float [[TMP1]], float [[TMP2]], i32 5) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast float [[TMP3]] to i32 +// CHECK-NEXT: ret i32 [[TMP4]] +// +uint32_t test_vcx3a_u32(uint32_t acc, uint32_t n, uint32_t m) { + return __arm_vcx3a_u32(0, acc, n, m, 5); +} + +// CHECK-LABEL: @test_vcx1d_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call double @llvm.arm.cde.vcx1.f64(i32 0, i32 11) +// CHECK-NEXT: [[TMP1:%.*]] = bitcast double [[TMP0]] to i64 +// CHECK-NEXT: ret i64 [[TMP1]] +// +uint64_t test_vcx1d_u64(void) { + return __arm_vcx1d_u64(0, 11); +} + +// CHECK-LABEL: @test_vcx1da_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[ACC:%.*]] to double +// CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.arm.cde.vcx1a.f64(i32 1, double [[TMP0]], i32 12) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to i64 +// CHECK-NEXT: ret i64 
[[TMP2]] +// +uint64_t test_vcx1da_u64(uint64_t acc) { + return __arm_vcx1da_u64(1, acc, 12); +} + +// CHECK-LABEL: @test_vcx2d_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[N:%.*]] to double +// CHECK-NEXT: [[TMP1:%.*]] = call double @llvm.arm.cde.vcx2.f64(i32 0, double [[TMP0]], i32 21) +// CHECK-NEXT: [[TMP2:%.*]] = bitcast double [[TMP1]] to i64 +// CHECK-NEXT: ret i64 [[TMP2]] +// +uint64_t test_vcx2d_u64(uint64_t n) { + return __arm_vcx2d_u64(0, n, 21); +} + +// CHECK-LABEL: @test_vcx2da_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[ACC:%.*]] to double +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[N:%.*]] to double +// CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.arm.cde.vcx2a.f64(i32 0, double [[TMP0]], double [[TMP1]], i32 22) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +// CHECK-NEXT: ret i64 [[TMP3]] +// +uint64_t test_vcx2da_u64(uint64_t acc, uint64_t n) { + return __arm_vcx2da_u64(0, acc, n, 22); +} + +// CHECK-LABEL: @test_vcx3d_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[N:%.*]] to double +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[M:%.*]] to double +// CHECK-NEXT: [[TMP2:%.*]] = call double @llvm.arm.cde.vcx3.f64(i32 1, double [[TMP0]], double [[TMP1]], i32 3) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +// CHECK-NEXT: ret i64 [[TMP3]] +// +uint64_t test_vcx3d_u64(uint64_t n, uint64_t m) { + return __arm_vcx3d_u64(1, n, m, 3); +} + +// CHECK-LABEL: @test_vcx3da_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[ACC:%.*]] to double +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[N:%.*]] to double +// CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[M:%.*]] to double +// CHECK-NEXT: [[TMP3:%.*]] = call double @llvm.arm.cde.vcx3a.f64(i32 0, double [[TMP0]], double [[TMP1]], double [[TMP2]], i32 5) +// CHECK-NEXT: [[TMP4:%.*]] = bitcast double [[TMP3]] to i64 +// CHECK-NEXT: ret i64 [[TMP4]] +// +uint64_t test_vcx3da_u64(uint64_t acc, uint64_t n, uint64_t m) { + return __arm_vcx3da_u64(0, acc, n, m, 5); +} diff --git a/clang/test/CodeGen/arm-cmse-attr.c b/clang/test/CodeGen/arm-cmse-attr.c new file mode 100644 index 0000000000000..041ed3f64a7e0 --- /dev/null +++ b/clang/test/CodeGen/arm-cmse-attr.c @@ -0,0 +1,43 @@ +// RUN: %clang_cc1 -triple thumbv8m.base-none-eabi -O1 -emit-llvm %s -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-NOSE --check-prefix=CHECK +// RUN: %clang_cc1 -triple thumbebv8m.base-none-eabi -O1 -emit-llvm %s -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-NOSE --check-prefix=CHECK +// RUN: %clang_cc1 -triple thumbv8m.base-none-eabi -mcmse -O1 -emit-llvm %s -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SE --check-prefix=CHECK +// RUN: %clang_cc1 -triple thumbebv8m.base-none-eabi -mcmse -O1 -emit-llvm %s -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK-SE --check-prefix=CHECK + +typedef void (*callback_t)(void) __attribute__((cmse_nonsecure_call)); +typedef void callback2_t(void) __attribute__((cmse_nonsecure_call)); + +void f1(callback_t fptr) +{ + fptr(); +} + +void f2(callback2_t *fptr) +{ + fptr(); +} + +void f3() __attribute__((cmse_nonsecure_entry)); +void f3() +{ +} + +void f4() __attribute__((cmse_nonsecure_entry)) +{ +} + +// CHECK: define void @f1(void ()* nocapture %fptr) {{[^#]*}}#0 { +// CHECK: call void %fptr() #2 +// CHECK: define void @f2(void ()* nocapture %fptr) {{[^#]*}}#0 { +// CHECK: call void %fptr() #2 +// CHECK: define void @f3() {{[^#]*}}#1 { +// CHECK: define void @f4() 
{{[^#]*}}#1 { + +// CHECK-NOSE-NOT: cmse_nonsecure_entry +// CHECK-NOSE-NOT: cmse_nonsecure_call +// CHECK-SE: attributes #0 = { nounwind +// CHECK-SE: attributes #1 = { {{.*}} "cmse_nonsecure_entry" +// CHECK-SE: attributes #2 = { {{.*}} "cmse_nonsecure_call" diff --git a/clang/test/CodeGen/arm-cmse-call.c b/clang/test/CodeGen/arm-cmse-call.c new file mode 100644 index 0000000000000..8bbab23d9ea60 --- /dev/null +++ b/clang/test/CodeGen/arm-cmse-call.c @@ -0,0 +1,77 @@ +// RUN: %clang_cc1 -triple thumbv8m.base-none-eabi -mcmse -O1 -emit-llvm %s -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK +// RUN: %clang_cc1 -triple thumbebv8m.base-none-eabi -mcmse -O1 -emit-llvm %s -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CHECK + +typedef void fn_t(void); +fn_t s; +fn_t *p0 __attribute__((cmse_nonsecure_call)); + +typedef fn_t *pfn_t __attribute__((cmse_nonsecure_call)); +pfn_t p1; +pfn_t a0[4]; +extern pfn_t a1[]; + +typedef void (*pfn1_t)(int) __attribute__((cmse_nonsecure_call)); +pfn1_t p2; + +typedef fn_t *apfn_t[4] __attribute__((cmse_nonsecure_call)); +apfn_t a2; + +typedef pfn_t apfn1_t[4] __attribute__((cmse_nonsecure_call)); +apfn1_t a3; + +typedef void (*apfn2_t[4])(void) __attribute__((cmse_nonsecure_call)); +apfn2_t a4; + +void (*b[4])(int) __attribute__((cmse_nonsecure_call)); + +void f(int i) { + s(); +// CHECK: call void @s() #[[#A1:]] + + p0(); +// CHECK: %[[#P0:]] = load {{.*}} @p0 +// CHECK: call void %[[#P0]]() #[[#A2:]] + + p1(); +// CHECK: %[[#P1:]] = load {{.*}} @p1 +// CHECK: call void %[[#P1]]() #[[#A2]] + + p2(i); +// CHECK: %[[#P2:]] = load {{.*}} @p2 +// CHECK: call void %[[#P2]](i32 %i) #[[#A2]] + + a0[i](); +// CHECK: %[[EP0:.*]] = getelementptr {{.*}} @a0 +// CHECK: %[[#E0:]] = load {{.*}} %[[EP0]] +// CHECK: call void %[[#E0]]() #[[#A2]] + + a1[i](); +// CHECK: %[[EP1:.*]] = getelementptr {{.*}} @a1 +// CHECK: %[[#E1:]] = load {{.*}} %[[EP1]] +// CHECK: call void %[[#E1]]() #[[#A2]] + + a2[i](); +// CHECK: %[[EP2:.*]] = getelementptr {{.*}} @a2 +// CHECK: %[[#E2:]] = load {{.*}} %[[EP2]] +// CHECK: call void %[[#E2]]() #[[#A2]] + + a3[i](); +// CHECK: %[[EP3:.*]] = getelementptr {{.*}} @a3 +// CHECK: %[[#E3:]] = load {{.*}} %[[EP3]] +// CHECK: call void %[[#E3]]() #[[#A2]] + + a4[i](); +// CHECK: %[[EP4:.*]] = getelementptr {{.*}} @a4 +// CHECK: %[[#E4:]] = load {{.*}} %[[EP4]] +// CHECK: call void %[[#E4]]() #[[#A2]] + + b[i](i); +// CHECK: %[[EP5:.*]] = getelementptr {{.*}} @b +// CHECK: %[[#E5:]] = load {{.*}} %[[EP5]] +// CHECK: call void %[[#E5]](i32 %i) #[[#A2]] +} + +// CHECK: attributes #[[#A1]] = { nounwind } +// CHECK: attributes #[[#A2]] = { nounwind "cmse_nonsecure_call" diff --git a/clang/test/CodeGen/arm-homogenous.c b/clang/test/CodeGen/arm-homogenous.c index 42a9bc1c16435..d321fc974c52e 100644 --- a/clang/test/CodeGen/arm-homogenous.c +++ b/clang/test/CodeGen/arm-homogenous.c @@ -27,7 +27,7 @@ void test_union_with_first_floats(void) { void test_return_union_with_first_floats(void) { g_u_f = returns_union_with_first_floats(); } -// CHECK: declare arm_aapcs_vfpcc void @returns_union_with_first_floats(%union.union_with_first_floats* sret) +// CHECK: declare arm_aapcs_vfpcc void @returns_union_with_first_floats(%union.union_with_first_floats* sret align 4) /* This is not a homogenous aggregate - fundamental types are different */ typedef union { @@ -47,7 +47,7 @@ void test_union_with_non_first_floats(void) { void test_return_union_with_non_first_floats(void) { g_u_nf_f = returns_union_with_non_first_floats(); } -// CHECK: declare 
arm_aapcs_vfpcc void @returns_union_with_non_first_floats(%union.union_with_non_first_floats* sret) +// CHECK: declare arm_aapcs_vfpcc void @returns_union_with_non_first_floats(%union.union_with_non_first_floats* sret align 4) /* This is not a homogenous aggregate - fundamental types are different */ typedef struct { @@ -67,7 +67,7 @@ void test_struct_with_union_with_first_floats(void) { void test_return_struct_with_union_with_first_floats(void) { g_s_f = returns_struct_with_union_with_first_floats(); } -// CHECK: declare arm_aapcs_vfpcc void @returns_struct_with_union_with_first_floats(%struct.struct_with_union_with_first_floats* sret) +// CHECK: declare arm_aapcs_vfpcc void @returns_struct_with_union_with_first_floats(%struct.struct_with_union_with_first_floats* sret align 4) /* This is not a homogenous aggregate - fundamental types are different */ typedef struct { @@ -87,7 +87,7 @@ void test_struct_with_union_with_non_first_floats(void) { void test_return_struct_with_union_with_non_first_floats(void) { g_s_nf_f = returns_struct_with_union_with_non_first_floats(); } -// CHECK: declare arm_aapcs_vfpcc void @returns_struct_with_union_with_non_first_floats(%struct.struct_with_union_with_non_first_floats* sret) +// CHECK: declare arm_aapcs_vfpcc void @returns_struct_with_union_with_non_first_floats(%struct.struct_with_union_with_non_first_floats* sret align 4) /* Plain array is not a homogenous aggregate */ extern void takes_array_of_floats(float a[4]); diff --git a/clang/test/CodeGen/arm-mve-intrinsics/ternary.c b/clang/test/CodeGen/arm-mve-intrinsics/ternary.c index 90e258715d261..77eb8d41fe580 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/ternary.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/ternary.c @@ -357,6 +357,47 @@ int32x4_t test_vqdmlahq_n_s32(int32x4_t a, int32x4_t b, int32_t c) { #endif /* POLYMORPHIC */ } +// CHECK-LABEL: @test_vqdmlashq_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[ADD:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlash.v16i8(<16 x i8> [[M1:%.*]], <16 x i8> [[M2:%.*]], i32 [[TMP0]]) +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +int8x16_t test_vqdmlashq_n_s8(int8x16_t m1, int8x16_t m2, int8_t add) { +#ifdef POLYMORPHIC + return vqdmlashq(m1, m2, add); +#else /* POLYMORPHIC */ + return vqdmlashq_n_s8(m1, m2, add); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlashq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[ADD:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlash.v8i16(<8 x i16> [[M1:%.*]], <8 x i16> [[M2:%.*]], i32 [[TMP0]]) +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_vqdmlashq_n_s16(int16x8_t m1, int16x8_t m2, int16_t add) { +#ifdef POLYMORPHIC + return vqdmlashq(m1, m2, add); +#else /* POLYMORPHIC */ + return vqdmlashq_n_s16(m1, m2, add); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlashq_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlash.v4i32(<4 x i32> [[M1:%.*]], <4 x i32> [[M2:%.*]], i32 [[ADD:%.*]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vqdmlashq_n_s32(int32x4_t m1, int32x4_t m2, int32_t add) { +#ifdef POLYMORPHIC + return vqdmlashq(m1, m2, add); +#else /* POLYMORPHIC */ + return vqdmlashq_n_s32(m1, m2, add); +#endif /* POLYMORPHIC */ +} + // CHECK-LABEL: @test_vqrdmlahq_n_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32 @@ -810,6 +851,53 @@ int32x4_t test_vqdmlahq_m_n_s32(int32x4_t a, int32x4_t b, 
int32_t c, mve_pred16_ #endif /* POLYMORPHIC */ } +// CHECK-LABEL: @test_vqdmlashq_m_n_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[ADD:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.arm.mve.vqdmlash.predicated.v16i8.v16i1(<16 x i8> [[M1:%.*]], <16 x i8> [[M2:%.*]], i32 [[TMP0]], <16 x i1> [[TMP2]]) +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +int8x16_t test_vqdmlashq_m_n_s8(int8x16_t m1, int8x16_t m2, int8_t add, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmlashq_m(m1, m2, add, p); +#else /* POLYMORPHIC */ + return vqdmlashq_m_n_s8(m1, m2, add, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlashq_m_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[ADD:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.arm.mve.vqdmlash.predicated.v8i16.v8i1(<8 x i16> [[M1:%.*]], <8 x i16> [[M2:%.*]], i32 [[TMP0]], <8 x i1> [[TMP2]]) +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +int16x8_t test_vqdmlashq_m_n_s16(int16x8_t m1, int16x8_t m2, int16_t add, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmlashq_m(m1, m2, add, p); +#else /* POLYMORPHIC */ + return vqdmlashq_m_n_s16(m1, m2, add, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vqdmlashq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vqdmlash.predicated.v4i32.v4i1(<4 x i32> [[M1:%.*]], <4 x i32> [[M2:%.*]], i32 [[ADD:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vqdmlashq_m_n_s32(int32x4_t m1, int32x4_t m2, int32_t add, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vqdmlashq_m(m1, m2, add, p); +#else /* POLYMORPHIC */ + return vqdmlashq_m_n_s32(m1, m2, add, p); +#endif /* POLYMORPHIC */ +} + // CHECK-LABEL: @test_vqrdmlahq_m_n_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[C:%.*]] to i32 @@ -903,4 +991,3 @@ int32x4_t test_vqrdmlashq_m_n_s32(int32x4_t a, int32x4_t b, int32_t c, mve_pred1 return vqrdmlashq_m_n_s32(a, b, c, p); #endif /* POLYMORPHIC */ } - diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vaddv.c b/clang/test/CodeGen/arm-mve-intrinsics/vaddv.c new file mode 100644 index 0000000000000..6bacc2775881d --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/vaddv.c @@ -0,0 +1,470 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py + // RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + // RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vaddvq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> [[A:%.*]], i32 0) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vaddvq_s8(int8x16_t a) { +#ifdef POLYMORPHIC + return vaddvq(a); +#else /* POLYMORPHIC */ + return vaddvq_s8(a); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_s16( 
+// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> [[A:%.*]], i32 0) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vaddvq_s16(int16x8_t a) { +#ifdef POLYMORPHIC + return vaddvq(a); +#else /* POLYMORPHIC */ + return vaddvq_s16(a); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> [[A:%.*]], i32 0) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vaddvq_s32(int32x4_t a) { +#ifdef POLYMORPHIC + return vaddvq(a); +#else /* POLYMORPHIC */ + return vaddvq_s32(a); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> [[A:%.*]], i32 1) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vaddvq_u8(uint8x16_t a) { +#ifdef POLYMORPHIC + return vaddvq(a); +#else /* POLYMORPHIC */ + return vaddvq_u8(a); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> [[A:%.*]], i32 1) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vaddvq_u16(uint16x8_t a) { +#ifdef POLYMORPHIC + return vaddvq(a); +#else /* POLYMORPHIC */ + return vaddvq_u16(a); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> [[A:%.*]], i32 1) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vaddvq_u32(uint32x4_t a) { +#ifdef POLYMORPHIC + return vaddvq(a); +#else /* POLYMORPHIC */ + return vaddvq_u32(a); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP1]] +// +int32_t test_vaddvaq_s8(int32_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vaddvaq(a, b); +#else /* POLYMORPHIC */ + return vaddvaq_s8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP1]] +// +int32_t test_vaddvaq_s16(int32_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vaddvaq(a, b); +#else /* POLYMORPHIC */ + return vaddvaq_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP1]] +// +int32_t test_vaddvaq_s32(int32_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vaddvaq(a, b); +#else /* POLYMORPHIC */ + return vaddvaq_s32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP1]] +// +uint32_t test_vaddvaq_u8(uint32_t a, uint8x16_t b) { +#ifdef POLYMORPHIC + return vaddvaq(a, b); +#else /* POLYMORPHIC */ + return vaddvaq_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> 
[[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP1]] +// +uint32_t test_vaddvaq_u16(uint32_t a, uint16x8_t b) { +#ifdef POLYMORPHIC + return vaddvaq(a, b); +#else /* POLYMORPHIC */ + return vaddvaq_u16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP1]] +// +uint32_t test_vaddvaq_u32(uint32_t a, uint32x4_t b) { +#ifdef POLYMORPHIC + return vaddvaq(a, b); +#else /* POLYMORPHIC */ + return vaddvaq_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 0, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vaddvq_p_s8(int8x16_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddvq_p_s8(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vaddvq_p_s16(int16x8_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddvq_p_s16(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vaddvq_p_s32(int32x4_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddvq_p_s32(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_p_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> [[A:%.*]], i32 1, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vaddvq_p_u8(uint8x16_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddvq_p_u8(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvq_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vaddvq_p_u16(uint16x8_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddvq_p_u16(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: 
@test_vaddvq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vaddvq_p_u32(uint32x4_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddvq_p_u32(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP3]] +// +int32_t test_vaddvaq_p_s8(int32_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddvaq_p_s8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP3]] +// +int32_t test_vaddvaq_p_s16(int32_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddvaq_p_s16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP3]] +// +int32_t test_vaddvaq_p_s32(int32_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddvaq_p_s32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_p_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP3]] +// +uint32_t test_vaddvaq_p_u8(uint32_t a, uint8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddvaq_p_u8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP3]] +// +uint32_t 
test_vaddvaq_p_u16(uint32_t a, uint16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddvaq_p_u16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddvaq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i32 [[TMP3]] +// +uint32_t test_vaddvaq_p_u32(uint32_t a, uint32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddvaq_p_u32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> [[A:%.*]], i32 0) +// CHECK-NEXT: ret i64 [[TMP0]] +// +int64_t test_vaddlvq_s32(int32x4_t a) { +#ifdef POLYMORPHIC + return vaddlvq(a); +#else /* POLYMORPHIC */ + return vaddlvq_s32(a); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> [[A:%.*]], i32 1) +// CHECK-NEXT: ret i64 [[TMP0]] +// +uint64_t test_vaddlvq_u32(uint32x4_t a) { +#ifdef POLYMORPHIC + return vaddlvq(a); +#else /* POLYMORPHIC */ + return vaddlvq_u32(a); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvaq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i64 [[TMP1]] +// +int64_t test_vaddlvaq_s32(int64_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vaddlvaq(a, b); +#else /* POLYMORPHIC */ + return vaddlvaq_s32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvaq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[A:%.*]] +// CHECK-NEXT: ret i64 [[TMP1]] +// +uint64_t test_vaddlvaq_u32(uint64_t a, uint32x4_t b) { +#ifdef POLYMORPHIC + return vaddlvaq(a, b); +#else /* POLYMORPHIC */ + return vaddlvaq_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i64 [[TMP2]] +// +int64_t test_vaddlvq_p_s32(int32x4_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddlvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddlvq_p_s32(a, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i64 [[TMP2]] +// +uint64_t test_vaddlvq_p_u32(uint32x4_t a, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddlvq_p(a, p); +#else /* POLYMORPHIC */ + return vaddlvq_p_u32(a, p); +#endif /* POLYMORPHIC */ +} + +// 
CHECK-LABEL: @test_vaddlvaq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i64 [[TMP3]] +// +int64_t test_vaddlvaq_p_s32(int64_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddlvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddlvaq_p_s32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vaddlvaq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], [[A:%.*]] +// CHECK-NEXT: ret i64 [[TMP3]] +// +uint64_t test_vaddlvaq_p_u32(uint64_t a, uint32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vaddlvaq_p(a, b, p); +#else /* POLYMORPHIC */ + return vaddlvaq_p_u32(a, b, p); +#endif /* POLYMORPHIC */ +} + diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c b/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c index 1cf4d0ee198e0..0d484bf98f7ad 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vminvq.c @@ -1,97 +1,853 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s -// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg -sroa | FileCheck %s #include <arm_mve.h> // CHECK-LABEL: @test_vminvq_s8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.s.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 0) // CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 // CHECK-NEXT: ret i8 [[TMP2]] // -int8_t test_vminvq_s8(int8_t a, int8x16_t b) -{ +int8_t test_vminvq_s8(int8_t a, int8x16_t b) { #ifdef POLYMORPHIC - return vminvq(a, b); -#else /* POLYMORPHIC */ - return vminvq_s8(a, b); + return vminvq(a, b); +#else /* POLYMORPHIC */ + return vminvq_s8(a, b); #endif /* POLYMORPHIC */ } // CHECK-LABEL: @test_vminvq_s16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.s.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32
0) // CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 // CHECK-NEXT: ret i16 [[TMP2]] // -int16_t test_vminvq_s16(int16_t a, int16x8_t b) -{ +int16_t test_vminvq_s16(int16_t a, int16x8_t b) { #ifdef POLYMORPHIC - return vminvq(a, b); -#else /* POLYMORPHIC */ - return vminvq_s16(a, b); + return vminvq(a, b); +#else /* POLYMORPHIC */ + return vminvq_s16(a, b); #endif /* POLYMORPHIC */ } // CHECK-LABEL: @test_vminvq_s32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.minv.s.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.minv.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 0) // CHECK-NEXT: ret i32 [[TMP0]] // -int32_t test_vminvq_s32(int32_t a, int32x4_t b) -{ +int32_t test_vminvq_s32(int32_t a, int32x4_t b) { #ifdef POLYMORPHIC - return vminvq(a, b); -#else /* POLYMORPHIC */ - return vminvq_s32(a, b); + return vminvq(a, b); +#else /* POLYMORPHIC */ + return vminvq_s32(a, b); #endif /* POLYMORPHIC */ } // CHECK-LABEL: @test_vminvq_u8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.u.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 1) // CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 // CHECK-NEXT: ret i8 [[TMP2]] // -uint8_t test_vminvq_u8(uint8_t a, uint8x16_t b) -{ +uint8_t test_vminvq_u8(uint8_t a, uint8x16_t b) { #ifdef POLYMORPHIC - return vminvq(a, b); -#else /* POLYMORPHIC */ - return vminvq_u8(a, b); + return vminvq(a, b); +#else /* POLYMORPHIC */ + return vminvq_u8(a, b); #endif /* POLYMORPHIC */ } // CHECK-LABEL: @test_vminvq_u16( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32 -// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.u.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.minv.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 1) // CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 // CHECK-NEXT: ret i16 [[TMP2]] // -uint16_t test_vminvq_u16(uint16_t a, uint16x8_t b) -{ +uint16_t test_vminvq_u16(uint16_t a, uint16x8_t b) { #ifdef POLYMORPHIC - return vminvq(a, b); -#else /* POLYMORPHIC */ - return vminvq_u16(a, b); + return vminvq(a, b); +#else /* POLYMORPHIC */ + return vminvq_u16(a, b); #endif /* POLYMORPHIC */ } // CHECK-LABEL: @test_vminvq_u32( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.minv.u.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.minv.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 1) // CHECK-NEXT: ret i32 [[TMP0]] // -uint32_t test_vminvq_u32(uint32_t a, uint32x4_t b) -{ +uint32_t test_vminvq_u32(uint32_t a, uint32x4_t b) { #ifdef POLYMORPHIC - return vminvq(a, b); -#else /* POLYMORPHIC */ - return vminvq_u32(a, b); + return vminvq(a, b); +#else /* POLYMORPHIC */ + return vminvq_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxvq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxv.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +// CHECK-NEXT: ret i8 [[TMP2]] +// +int8_t test_vmaxvq_s8(int8_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vmaxvq(a, b); +#else /* POLYMORPHIC */ + return vmaxvq_s8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxvq_s16( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxv.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 0) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +int16_t test_vmaxvq_s16(int16_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vmaxvq(a, b); +#else /* POLYMORPHIC */ + return vmaxvq_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxvq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.maxv.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 0) +// CHECK-NEXT: ret i32 [[TMP0]] +// +int32_t test_vmaxvq_s32(int32_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vmaxvq(a, b); +#else /* POLYMORPHIC */ + return vmaxvq_s32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxvq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxv.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +// CHECK-NEXT: ret i8 [[TMP2]] +// +uint8_t test_vmaxvq_u8(uint8_t a, uint8x16_t b) { +#ifdef POLYMORPHIC + return vmaxvq(a, b); +#else /* POLYMORPHIC */ + return vmaxvq_u8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxvq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxv.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +uint16_t test_vmaxvq_u16(uint16_t a, uint16x8_t b) { +#ifdef POLYMORPHIC + return vmaxvq(a, b); +#else /* POLYMORPHIC */ + return vmaxvq_u16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxvq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.maxv.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 1) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vmaxvq_u32(uint32_t a, uint32x4_t b) { +#ifdef POLYMORPHIC + return vmaxvq(a, b); +#else /* POLYMORPHIC */ + return vmaxvq_u32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminavq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.minav.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +// CHECK-NEXT: ret i8 [[TMP2]] +// +uint8_t test_vminavq_s8(uint8_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vminavq(a, b); +#else /* POLYMORPHIC */ + return vminavq_s8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminavq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.minav.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +uint16_t test_vminavq_s16(uint16_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vminavq(a, b); +#else /* POLYMORPHIC */ + return vminavq_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminavq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.minav.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vminavq_s32(uint32_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vminavq(a, b); +#else /* POLYMORPHIC */ + return vminavq_s32(a, b); +#endif /* POLYMORPHIC */ 
+} + +// CHECK-LABEL: @test_vmaxavq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxav.v16i8(i32 [[TMP0]], <16 x i8> [[B:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8 +// CHECK-NEXT: ret i8 [[TMP2]] +// +uint8_t test_vmaxavq_s8(uint8_t a, int8x16_t b) { +#ifdef POLYMORPHIC + return vmaxavq(a, b); +#else /* POLYMORPHIC */ + return vmaxavq_s8(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxavq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.arm.mve.maxav.v8i16(i32 [[TMP0]], <8 x i16> [[B:%.*]]) +// CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16 +// CHECK-NEXT: ret i16 [[TMP2]] +// +uint16_t test_vmaxavq_s16(uint16_t a, int16x8_t b) { +#ifdef POLYMORPHIC + return vmaxavq(a, b); +#else /* POLYMORPHIC */ + return vmaxavq_s16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxavq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.arm.mve.maxav.v4i32(i32 [[A:%.*]], <4 x i32> [[B:%.*]]) +// CHECK-NEXT: ret i32 [[TMP0]] +// +uint32_t test_vmaxavq_s32(uint32_t a, int32x4_t b) { +#ifdef POLYMORPHIC + return vmaxavq(a, b); +#else /* POLYMORPHIC */ + return vmaxavq_s32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminnmvq_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.minnmv.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[TMP2]] to i16 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast float undef to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536 +// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]] +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float +// CHECK-NEXT: ret float [[TMP5]] +// +float16_t test_vminnmvq_f16(float16_t a, float16x8_t b) { +#ifdef POLYMORPHIC + return vminnmvq(a, b); +#else /* POLYMORPHIC */ + return vminnmvq_f16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminnmvq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.arm.mve.minnmv.f32.v4f32(float [[A:%.*]], <4 x float> [[B:%.*]]) +// CHECK-NEXT: ret float [[TMP0]] +// +float32_t test_vminnmvq_f32(float32_t a, float32x4_t b) { +#ifdef POLYMORPHIC + return vminnmvq(a, b); +#else /* POLYMORPHIC */ + return vminnmvq_f32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminnmavq_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.minnmav.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[TMP2]] to i16 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast float undef to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536 +// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]] 
+// CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float +// CHECK-NEXT: ret float [[TMP5]] +// +float16_t test_vminnmavq_f16(float16_t a, float16x8_t b) { +#ifdef POLYMORPHIC + return vminnmavq(a, b); +#else /* POLYMORPHIC */ + return vminnmavq_f16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminnmavq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.arm.mve.minnmav.f32.v4f32(float [[A:%.*]], <4 x float> [[B:%.*]]) +// CHECK-NEXT: ret float [[TMP0]] +// +float32_t test_vminnmavq_f32(float32_t a, float32x4_t b) { +#ifdef POLYMORPHIC + return vminnmavq(a, b); +#else /* POLYMORPHIC */ + return vminnmavq_f32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxnmvq_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.maxnmv.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[TMP2]] to i16 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast float undef to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536 +// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]] +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float +// CHECK-NEXT: ret float [[TMP5]] +// +float16_t test_vmaxnmvq_f16(float16_t a, float16x8_t b) { +#ifdef POLYMORPHIC + return vmaxnmvq(a, b); +#else /* POLYMORPHIC */ + return vmaxnmvq_f16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxnmvq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.arm.mve.maxnmv.f32.v4f32(float [[A:%.*]], <4 x float> [[B:%.*]]) +// CHECK-NEXT: ret float [[TMP0]] +// +float32_t test_vmaxnmvq_f32(float32_t a, float32x4_t b) { +#ifdef POLYMORPHIC + return vmaxnmvq(a, b); +#else /* POLYMORPHIC */ + return vmaxnmvq_f32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxnmavq_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[TMP2:%.*]] = call half @llvm.arm.mve.maxnmav.f16.v8f16(half [[TMP1]], <8 x half> [[B:%.*]]) +// CHECK-NEXT: [[TMP3:%.*]] = bitcast half [[TMP2]] to i16 +// CHECK-NEXT: [[TMP4:%.*]] = bitcast float undef to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP3]] to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP4]], -65536 +// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]] +// CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float +// CHECK-NEXT: ret float [[TMP5]] +// +float16_t test_vmaxnmavq_f16(float16_t a, float16x8_t b) { +#ifdef POLYMORPHIC + return vmaxnmavq(a, b); +#else /* POLYMORPHIC */ + return vmaxnmavq_f16(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxnmavq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call float @llvm.arm.mve.maxnmav.f32.v4f32(float [[A:%.*]], <4 x float> [[B:%.*]]) +// CHECK-NEXT: ret float [[TMP0]] +// +float32_t test_vmaxnmavq_f32(float32_t a, float32x4_t b) { +#ifdef POLYMORPHIC + return vmaxnmavq(a, b); +#else /* 
POLYMORPHIC */ + return vmaxnmavq_f32(a, b); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminvq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +// CHECK-NEXT: ret i8 [[TMP4]] +// +int8_t test_vminvq_p_s8(int8_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vminvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vminvq_p_s8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminvq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +// CHECK-NEXT: ret i16 [[TMP4]] +// +int16_t test_vminvq_p_s16(int16_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vminvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vminvq_p_s16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminvq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vminvq_p_s32(int32_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vminvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vminvq_p_s32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminvq_p_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +// CHECK-NEXT: ret i8 [[TMP4]] +// +uint8_t test_vminvq_p_u8(uint8_t a, uint8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vminvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vminvq_p_u8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminvq_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +// CHECK-NEXT: ret i16 [[TMP4]] +// +uint16_t test_vminvq_p_u16(uint16_t a, uint16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vminvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vminvq_p_u16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminvq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vminvq_p_u32(uint32_t a, uint32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vminvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vminvq_p_u32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxvq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 0, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +// CHECK-NEXT: ret i8 [[TMP4]] +// +int8_t test_vmaxvq_p_s8(int8_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmaxvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vmaxvq_p_s8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxvq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], i32 0, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +// CHECK-NEXT: ret i16 [[TMP4]] +// +int16_t test_vmaxvq_p_s16(int16_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmaxvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vmaxvq_p_s16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxvq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +int32_t test_vmaxvq_p_s32(int32_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmaxvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vmaxvq_p_s32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxvq_p_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], i32 1, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +// CHECK-NEXT: ret i8 [[TMP4]] +// +uint8_t test_vmaxvq_p_u8(uint8_t a, uint8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmaxvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vmaxvq_p_u8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxvq_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32 
[[TMP0]], <8 x i16> [[B:%.*]], i32 1, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +// CHECK-NEXT: ret i16 [[TMP4]] +// +uint16_t test_vmaxvq_p_u16(uint16_t a, uint16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmaxvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vmaxvq_p_u16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxvq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vmaxvq_p_u32(uint32_t a, uint32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmaxvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vmaxvq_p_u32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminavq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.minav.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +// CHECK-NEXT: ret i8 [[TMP4]] +// +uint8_t test_vminavq_p_s8(uint8_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vminavq_p(a, b, p); +#else /* POLYMORPHIC */ + return vminavq_p_s8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminavq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.minav.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +// CHECK-NEXT: ret i16 [[TMP4]] +// +uint16_t test_vminavq_p_s16(uint16_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vminavq_p(a, b, p); +#else /* POLYMORPHIC */ + return vminavq_p_s16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminavq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.minav.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vminavq_p_s32(uint32_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vminavq_p(a, b, p); +#else /* POLYMORPHIC */ + return vminavq_p_s32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxavq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i8 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxav.predicated.v16i8.v16i1(i32 [[TMP0]], <16 x i8> [[B:%.*]], <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i8 +// CHECK-NEXT: ret i8 [[TMP4]] +// +uint8_t test_vmaxavq_p_s8(uint8_t a, int8x16_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmaxavq_p(a, b, p); 
+#else /* POLYMORPHIC */ + return vmaxavq_p_s8(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxavq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[A:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.arm.mve.maxav.predicated.v8i16.v8i1(i32 [[TMP0]], <8 x i16> [[B:%.*]], <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16 +// CHECK-NEXT: ret i16 [[TMP4]] +// +uint16_t test_vmaxavq_p_s16(uint16_t a, int16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmaxavq_p(a, b, p); +#else /* POLYMORPHIC */ + return vmaxavq_p_s16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxavq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.arm.mve.maxav.predicated.v4i32.v4i1(i32 [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret i32 [[TMP2]] +// +uint32_t test_vmaxavq_p_s32(uint32_t a, int32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmaxavq_p(a, b, p); +#else /* POLYMORPHIC */ + return vmaxavq_p_s32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminnmvq_p_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.arm.mve.minnmv.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast half [[TMP4]] to i16 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast float undef to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536 +// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]] +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float +// CHECK-NEXT: ret float [[TMP7]] +// +float16_t test_vminnmvq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vminnmvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vminnmvq_p_f16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminnmvq_p_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.arm.mve.minnmv.predicated.f32.v4f32.v4i1(float [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret float [[TMP2]] +// +float32_t test_vminnmvq_p_f32(float32_t a, float32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vminnmvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vminnmvq_p_f32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminnmavq_p_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[TMP2:%.*]] = 
zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.arm.mve.minnmav.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast half [[TMP4]] to i16 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast float undef to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536 +// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]] +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float +// CHECK-NEXT: ret float [[TMP7]] +// +float16_t test_vminnmavq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vminnmavq_p(a, b, p); +#else /* POLYMORPHIC */ + return vminnmavq_p_f16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vminnmavq_p_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.arm.mve.minnmav.predicated.f32.v4f32.v4i1(float [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret float [[TMP2]] +// +float32_t test_vminnmavq_p_f32(float32_t a, float32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vminnmavq_p(a, b, p); +#else /* POLYMORPHIC */ + return vminnmavq_p_f32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxnmvq_p_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[A_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.arm.mve.maxnmv.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast half [[TMP4]] to i16 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast float undef to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536 +// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]] +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float +// CHECK-NEXT: ret float [[TMP7]] +// +float16_t test_vmaxnmvq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmaxnmvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vmaxnmvq_p_f16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxnmvq_p_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.arm.mve.maxnmv.predicated.f32.v4f32.v4i1(float [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret float [[TMP2]] +// +float32_t test_vmaxnmvq_p_f32(float32_t a, float32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmaxnmvq_p(a, b, p); +#else /* POLYMORPHIC */ + return vmaxnmvq_p_f32(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxnmavq_p_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float 
[[A_COERCE:%.*]] to i32 +// CHECK-NEXT: [[TMP_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[TMP0]] to i16 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[TMP_0_EXTRACT_TRUNC]] to half +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = call half @llvm.arm.mve.maxnmav.predicated.f16.v8f16.v8i1(half [[TMP1]], <8 x half> [[B:%.*]], <8 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = bitcast half [[TMP4]] to i16 +// CHECK-NEXT: [[TMP6:%.*]] = bitcast float undef to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i32 +// CHECK-NEXT: [[TMP2_0_INSERT_MASK:%.*]] = and i32 [[TMP6]], -65536 +// CHECK-NEXT: [[TMP2_0_INSERT_INSERT:%.*]] = or i32 [[TMP2_0_INSERT_MASK]], [[TMP2_0_INSERT_EXT]] +// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32 [[TMP2_0_INSERT_INSERT]] to float +// CHECK-NEXT: ret float [[TMP7]] +// +float16_t test_vmaxnmavq_p_f16(float16_t a, float16x8_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmaxnmavq_p(a, b, p); +#else /* POLYMORPHIC */ + return vmaxnmavq_p_f16(a, b, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmaxnmavq_p_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.arm.mve.maxnmav.predicated.f32.v4f32.v4i1(float [[A:%.*]], <4 x float> [[B:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret float [[TMP2]] +// +float32_t test_vmaxnmavq_p_f32(float32_t a, float32x4_t b, mve_pred16_t p) { +#ifdef POLYMORPHIC + return vmaxnmavq_p(a, b, p); +#else /* POLYMORPHIC */ + return vmaxnmavq_p_f32(a, b, p); #endif /* POLYMORPHIC */ } diff --git a/clang/test/CodeGen/arm-neon-range-checks.c b/clang/test/CodeGen/arm-neon-range-checks.c new file mode 100644 index 0000000000000..488dad6d59acd --- /dev/null +++ b/clang/test/CodeGen/arm-neon-range-checks.c @@ -0,0 +1,424 @@ +// RUN: %clang_cc1 -triple arm64-none-eabi -target-feature +neon -target-feature +dotprod -target-feature +v8.1a -verify %s +// RUN: %clang_cc1 -triple armv8.1a-none-eabi -target-feature +neon -target-feature +dotprod -target-feature +v8.1a -verify %s + +#include <arm_neon.h> + +void test_vdot_lane(int32x2_t r, int8x8_t a, int8x8_t b) { + vdot_lane_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vdot_lane_s32(r, a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vdot_lane_s32(r, a, b, 0); + vdot_lane_s32(r, a, b, 1); +} + +void test_vdotq_lane(int32x4_t r, int8x16_t a, int8x8_t b) { + vdotq_lane_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vdotq_lane_s32(r, a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vdotq_lane_s32(r, a, b, 0); + vdotq_lane_s32(r, a, b, 1); +} + +#if defined(__aarch64__) +void test_vdot_laneq(int32x2_t r, int8x8_t a, int8x16_t b) { + vdot_laneq_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vdot_laneq_s32(r, a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vdot_laneq_s32(r, a, b, 0); + vdot_laneq_s32(r, a, b, 3); +} + +void test_vdotq_laneq(int32x4_t r, int8x16_t a, int8x16_t b) { + vdotq_laneq_s32(r, a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vdotq_laneq_s32(r, a, b, 4); // expected-error {{argument value 4 is outside the valid
range [0, 3]}} + vdotq_laneq_s32(r, a, b, 0); + vdotq_laneq_s32(r, a, b, 3); +} +#endif + +void test_vdup_lane(int32x2_t v) { + vdup_lane_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vdup_lane_s32(v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vdup_lane_s32(v, 0); + vdup_lane_s32(v, 1); +} + +void test_vdupq_lane(int32x2_t v) { + vdupq_lane_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vdupq_lane_s32(v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vdupq_lane_s32(v, 0); + vdupq_lane_s32(v, 1); +} + +#if defined(__aarch64__) +void test_vdup_laneq(int32x4_t v) { + vdup_laneq_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vdup_laneq_s32(v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vdup_laneq_s32(v, 0); + vdup_laneq_s32(v, 3); +} + +void test_vdupq_laneq(int32x4_t v) { + vdupq_laneq_s32(v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vdupq_laneq_s32(v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vdupq_laneq_s32(v, 0); + vdupq_laneq_s32(v, 3); +} +#endif + +void test_vmla_lane(int32x2_t a, int32x2_t b, int32x2_t v) { + vmla_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmla_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmla_lane_s32(a, b, v, 0); + vmla_lane_s32(a, b, v, 1); +} + +void test_vmlaq_lane(int32x4_t a, int32x4_t b, int32x2_t v) { + vmlaq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmlaq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmlaq_lane_s32(a, b, v, 0); + vmlaq_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vmla_laneq(int32x2_t a, int32x2_t b, int32x4_t v) { + vmla_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmla_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmla_laneq_s32(a, b, v, 0); + vmla_laneq_s32(a, b, v, 3); +} + +void test_vmlaq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) { + vmlaq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmlaq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmlaq_laneq_s32(a, b, v, 0); + vmlaq_laneq_s32(a, b, v, 3); +} + +void test_vmlal_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) { + vmlal_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmlal_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmlal_high_lane_s32(a, b, v, 0); + vmlal_high_lane_s32(a, b, v, 1); +} + +void test_vmlal_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) { + vmlal_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmlal_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmlal_high_laneq_s32(a, b, v, 0); + vmlal_high_laneq_s32(a, b, v, 3); +} +#endif + +void test_vmlal_lane(int64x2_t a, int32x2_t b, int32x2_t v) { + vmlal_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + 
vmlal_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmlal_lane_s32(a, b, v, 0); + vmlal_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vmlal_laneq(int64x2_t a, int32x2_t b, int32x4_t v) { + vmlal_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmlal_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmlal_laneq_s32(a, b, v, 0); + vmlal_laneq_s32(a, b, v, 3); +} +#endif + +void test_vmls_lane(int32x2_t a, int32x2_t b, int32x2_t v) { + vmls_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmls_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmls_lane_s32(a, b, v, 0); + vmls_lane_s32(a, b, v, 1); +} + +void test_vmlsq_lane(int32x4_t a, int32x4_t b, int32x2_t v) { + vmlsq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmlsq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmlsq_lane_s32(a, b, v, 0); + vmlsq_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vmls_laneq(int32x2_t a, int32x2_t b, int32x4_t v) { + vmls_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmls_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmls_laneq_s32(a, b, v, 0); + vmls_laneq_s32(a, b, v, 3); +} + +void test_vmlsq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) { + vmlsq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmlsq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmlsq_laneq_s32(a, b, v, 0); + vmlsq_laneq_s32(a, b, v, 3); +} + +void test_vmlsl_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) { + vmlsl_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmlsl_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmlsl_high_lane_s32(a, b, v, 0); + vmlsl_high_lane_s32(a, b, v, 1); +} + +void test_vmlsl_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) { + vmlsl_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmlsl_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmlsl_high_laneq_s32(a, b, v, 0); + vmlsl_high_laneq_s32(a, b, v, 3); +} +#endif + +void test_vmlsl_lane(int64x2_t a, int32x2_t b, int32x2_t v) { + vmlsl_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmlsl_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmlsl_lane_s32(a, b, v, 0); + vmlsl_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vmlsl_laneq(int64x2_t a, int32x2_t b, int32x4_t v) { + vmlsl_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmlsl_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmlsl_laneq_s32(a, b, v, 0); + vmlsl_laneq_s32(a, b, v, 3); +} +#endif + +void test_vmull_lane(int32x2_t a, int32x2_t b) { + vmull_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmull_lane_s32(a, b, 2); // expected-error 
{{argument value 2 is outside the valid range [0, 1]}} + vmull_lane_s32(a, b, 0); + vmull_lane_s32(a, b, 1); +} + +#if defined(__aarch64__) +void test_vmull_laneq(int32x2_t a, int32x4_t b) { + vmull_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmull_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmull_laneq_s32(a, b, 0); + vmull_laneq_s32(a, b, 3); +} + +void test_vmull_high_lane(int32x4_t a, int32x2_t b) { + vmull_high_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vmull_high_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vmull_high_lane_s32(a, b, 0); + vmull_high_lane_s32(a, b, 1); +} + +void test_vmull_high_laneq(int32x4_t a, int32x4_t b) { + vmull_high_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vmull_high_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vmull_high_laneq_s32(a, b, 0); + vmull_high_laneq_s32(a, b, 3); +} + +void test_vqdmlal_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) { + vqdmlal_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmlal_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmlal_high_lane_s32(a, b, v, 0); + vqdmlal_high_lane_s32(a, b, v, 1); +} + +void test_vqdmlal_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) { + vqdmlal_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmlal_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmlal_high_laneq_s32(a, b, v, 0); + vqdmlal_high_laneq_s32(a, b, v, 3); +} +#endif + +void test_vqdmlal_lane(int64x2_t a, int32x2_t b, int32x2_t v) { + vqdmlal_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmlal_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmlal_lane_s32(a, b, v, 0); + vqdmlal_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vqdmlal_laneq(int64x2_t a, int32x2_t b, int32x4_t v) { + vqdmlal_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmlal_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmlal_laneq_s32(a, b, v, 0); + vqdmlal_laneq_s32(a, b, v, 3); +} + +void test_vqdmlsl_high_lane(int64x2_t a, int32x4_t b, int32x2_t v) { + vqdmlsl_high_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmlsl_high_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmlsl_high_lane_s32(a, b, v, 0); + vqdmlsl_high_lane_s32(a, b, v, 1); +} + +void test_vqdmlsl_high_laneq(int64x2_t a, int32x4_t b, int32x4_t v) { + vqdmlsl_high_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmlsl_high_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmlsl_high_laneq_s32(a, b, v, 0); + vqdmlsl_high_laneq_s32(a, b, v, 3); +} +#endif + +void test_vqdmlsl_lane(int64x2_t a, int32x2_t b, int32x2_t v) { + vqdmlsl_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + 
vqdmlsl_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmlsl_lane_s32(a, b, v, 0); + vqdmlsl_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vqdmlsl_laneq(int64x2_t a, int32x2_t b, int32x4_t v) { + vqdmlsl_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmlsl_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmlsl_laneq_s32(a, b, v, 0); + vqdmlsl_laneq_s32(a, b, v, 3); +} +#endif + +void test_vqdmulh_lane(int32x2_t a, int32x2_t b) { + vqdmulh_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmulh_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmulh_lane_s32(a, b, 0); + vqdmulh_lane_s32(a, b, 1); +} + +void test_vqdmulhq_lane(int32x4_t a, int32x2_t b) { + vqdmulhq_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmulhq_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmulhq_lane_s32(a, b, 0); + vqdmulhq_lane_s32(a, b, 1); +} + +#if defined(__aarch64__) +void test_vqdmulh_laneq(int32x2_t a, int32x4_t b) { + vqdmulh_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmulh_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmulh_laneq_s32(a, b, 0); + vqdmulh_laneq_s32(a, b, 3); +} + +void test_vqdmulhq_laneq(int32x4_t a, int32x4_t b) { + vqdmulhq_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmulhq_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmulhq_laneq_s32(a, b, 0); + vqdmulhq_laneq_s32(a, b, 3); +} + +void test_vqdmull_high_lane(int32x4_t a, int32x2_t b) { + vqdmull_high_lane_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmull_high_lane_s32(a, b, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmull_high_lane_s32(a, b, 0); + vqdmull_high_lane_s32(a, b, 1); +} + +void test_vqdmull_high_laneq(int32x4_t a, int32x4_t b) { + vqdmull_high_laneq_s32(a, b, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmull_high_laneq_s32(a, b, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmull_high_laneq_s32(a, b, 0); + vqdmull_high_laneq_s32(a, b, 3); +} +#endif + +void test_vqdmull_lane(int32x2_t a, int32x2_t v) { + vqdmull_lane_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqdmull_lane_s32(a, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqdmull_lane_s32(a, v, 0); + vqdmull_lane_s32(a, v, 1); +} + +#if defined(__aarch64__) +void test_vqdmull_laneq(int32x2_t a, int32x4_t v) { + vqdmull_laneq_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqdmull_laneq_s32(a, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqdmull_laneq_s32(a, v, 0); + vqdmull_laneq_s32(a, v, 3); +} +#endif + +void test_vqrdmlah_lane(int32x2_t a, int32x2_t b, int32x2_t v) { + vqrdmlah_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqrdmlah_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + 
vqrdmlah_lane_s32(a, b, v, 0); + vqrdmlah_lane_s32(a, b, v, 1); +} + +void test_vqrdmlahq_lane(int32x4_t a, int32x4_t b, int32x2_t v) { + vqrdmlahq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqrdmlahq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqrdmlahq_lane_s32(a, b, v, 0); + vqrdmlahq_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vqrdmlah_laneq(int32x2_t a, int32x2_t b, int32x4_t v) { + vqrdmlah_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqrdmlah_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqrdmlah_laneq_s32(a, b, v, 0); + vqrdmlah_laneq_s32(a, b, v, 3); +} + +void test_vqrdmlahq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) { + vqrdmlahq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqrdmlahq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqrdmlahq_laneq_s32(a, b, v, 0); + vqrdmlahq_laneq_s32(a, b, v, 3); +} +#endif + +void test_vqrdmlsh_lane(int32x2_t a, int32x2_t b, int32x2_t v) { + vqrdmlsh_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqrdmlsh_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqrdmlsh_lane_s32(a, b, v, 0); + vqrdmlsh_lane_s32(a, b, v, 1); +} + +void test_vqrdmlshq_lane(int32x4_t a, int32x4_t b, int32x2_t v) { + vqrdmlshq_lane_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqrdmlshq_lane_s32(a, b, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqrdmlshq_lane_s32(a, b, v, 0); + vqrdmlshq_lane_s32(a, b, v, 1); +} + +#if defined(__aarch64__) +void test_vqrdmlsh_laneq(int32x2_t a, int32x2_t b, int32x4_t v) { + vqrdmlsh_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqrdmlsh_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqrdmlsh_laneq_s32(a, b, v, 0); + vqrdmlsh_laneq_s32(a, b, v, 3); +} + +void test_vqrdmlshq_laneq(int32x4_t a, int32x4_t b, int32x4_t v) { + vqrdmlshq_laneq_s32(a, b, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqrdmlshq_laneq_s32(a, b, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqrdmlshq_laneq_s32(a, b, v, 0); + vqrdmlshq_laneq_s32(a, b, v, 3); +} +#endif + +void test_vqrdmulh_lane(int32x2_t a, int32x2_t v) { + vqrdmulh_lane_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqrdmulh_lane_s32(a, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqrdmulh_lane_s32(a, v, 0); + vqrdmulh_lane_s32(a, v, 1); +} + +void test_vqrdmulhq_lane(int32x4_t a, int32x2_t v) { + vqrdmulhq_lane_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + vqrdmulhq_lane_s32(a, v, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + vqrdmulhq_lane_s32(a, v, 0); + vqrdmulhq_lane_s32(a, v, 1); +} + +#if defined(__aarch64__) +void test_vqrdmulh_laneq(int32x2_t a, int32x4_t v) { + vqrdmulh_laneq_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqrdmulh_laneq_s32(a, v, 4); // expected-error {{argument value 4 is 
outside the valid range [0, 3]}} + vqrdmulh_laneq_s32(a, v, 0); + vqrdmulh_laneq_s32(a, v, 3); +} + +void test_vqrdmulhq_laneq(int32x4_t a, int32x4_t v) { + vqrdmulhq_laneq_s32(a, v, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + vqrdmulhq_laneq_s32(a, v, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + vqrdmulhq_laneq_s32(a, v, 0); + vqrdmulhq_laneq_s32(a, v, 3); +} +#endif diff --git a/clang/test/CodeGen/arm-neon-vld.c b/clang/test/CodeGen/arm-neon-vld.c index 2c7af92f4796a..8d3d61c250a92 100644 --- a/clang/test/CodeGen/arm-neon-vld.c +++ b/clang/test/CodeGen/arm-neon-vld.c @@ -9,7 +9,7 @@ // CHECK-LABEL: @test_vld1_f16_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x4x2_t, align 8 -// CHECK-A32: %struct.float16x4x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.float16x4x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* @@ -29,7 +29,7 @@ float16x4x2_t test_vld1_f16_x2(float16_t const *a) { // CHECK-LABEL: @test_vld1_f16_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x4x3_t, align 8 -// CHECK-A32: %struct.float16x4x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.float16x4x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* @@ -49,7 +49,7 @@ float16x4x3_t test_vld1_f16_x3(float16_t const *a) { // CHECK-LABEL: @test_vld1_f16_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x4x4_t, align 8 -// CHECK-A32: %struct.float16x4x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.float16x4x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.float16x4x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x4x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* @@ -69,7 +69,7 @@ float16x4x4_t test_vld1_f16_x4(float16_t const *a) { // CHECK-LABEL: @test_vld1_f32_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x2x2_t, align 8 -// CHECK-A32: %struct.float32x2x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.float32x2x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* @@ -89,7 +89,7 @@ float32x2x2_t test_vld1_f32_x2(float32_t const *a) { // CHECK-LABEL: @test_vld1_f32_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x2x3_t, align 8 -// CHECK-A32: %struct.float32x2x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.float32x2x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* @@ -108,7 +108,7 @@ float32x2x3_t test_vld1_f32_x3(float32_t const *a) { // CHECK-LABEL: @test_vld1_f32_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x2x4_t, align 8 -// CHECK-A32: %struct.float32x2x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.float32x2x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.float32x2x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x4_t* 
[[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* @@ -128,7 +128,7 @@ float32x2x4_t test_vld1_f32_x4(float32_t const *a) { // CHECK-LABEL: @test_vld1_p16_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x4x2_t, align 8 -// CHECK-A32: %struct.poly16x4x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.poly16x4x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -148,7 +148,7 @@ poly16x4x2_t test_vld1_p16_x2(poly16_t const *a) { // CHECK-LABEL: @test_vld1_p16_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x4x3_t, align 8 -// CHECK-A32: %struct.poly16x4x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.poly16x4x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -168,7 +168,7 @@ poly16x4x3_t test_vld1_p16_x3(poly16_t const *a) { // CHECK-LABEL: @test_vld1_p16_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x4x4_t, align 8 -// CHECK-A32: %struct.poly16x4x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.poly16x4x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.poly16x4x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -188,7 +188,7 @@ poly16x4x4_t test_vld1_p16_x4(poly16_t const *a) { // CHECK-LABEL: @test_vld1_p8_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x8x2_t, align 8 -// CHECK-A32: %struct.poly8x8x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.poly8x8x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i8.p0i8(i8* %a) @@ -206,7 +206,7 @@ poly8x8x2_t test_vld1_p8_x2(poly8_t const *a) { // CHECK-LABEL: @test_vld1_p8_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x8x3_t, align 8 -// CHECK-A32: %struct.poly8x8x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.poly8x8x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x3_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i8.p0i8(i8* %a) @@ -224,7 +224,7 @@ poly8x8x3_t test_vld1_p8_x3(poly8_t const *a) { // CHECK-LABEL: @test_vld1_p8_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x8x4_t, align 8 -// CHECK-A32: %struct.poly8x8x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.poly8x8x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.poly8x8x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x4_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i8.p0i8(i8* %a) @@ -242,7 +242,7 @@ poly8x8x4_t test_vld1_p8_x4(poly8_t const *a) { // CHECK-LABEL: @test_vld1_s16_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x4x2_t, align 8 -// CHECK-A32: %struct.int16x4x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int16x4x2_t* noalias sret 
align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -262,7 +262,7 @@ int16x4x2_t test_vld1_s16_x2(int16_t const *a) { // CHECK-LABEL: @test_vld1_s16_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x4x3_t, align 8 -// CHECK-A32: %struct.int16x4x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int16x4x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -282,7 +282,7 @@ int16x4x3_t test_vld1_s16_x3(int16_t const *a) { // CHECK-LABEL: @test_vld1_s16_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x4x4_t, align 8 -// CHECK-A32: %struct.int16x4x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int16x4x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int16x4x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -302,7 +302,7 @@ int16x4x4_t test_vld1_s16_x4(int16_t const *a) { // CHECK-LABEL: @test_vld1_s32_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x2x2_t, align 8 -// CHECK-A32: %struct.int32x2x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int32x2x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* @@ -322,7 +322,7 @@ int32x2x2_t test_vld1_s32_x2(int32_t const *a) { // CHECK-LABEL: @test_vld1_s32_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x2x3_t, align 8 -// CHECK-A32: %struct.int32x2x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int32x2x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* @@ -342,7 +342,7 @@ int32x2x3_t test_vld1_s32_x3(int32_t const *a) { // CHECK-LABEL: @test_vld1_s32_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x2x4_t, align 8 -// CHECK-A32: %struct.int32x2x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int32x2x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int32x2x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* @@ -362,7 +362,7 @@ int32x2x4_t test_vld1_s32_x4(int32_t const *a) { // CHECK-LABEL: @test_vld1_s64_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x1x2_t, align 8 -// CHECK-A32: %struct.int64x1x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int64x1x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int64x1x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* @@ -382,7 +382,7 @@ int64x1x2_t test_vld1_s64_x2(int64_t const *a) { // CHECK-LABEL: @test_vld1_s64_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x1x3_t, align 8 -// CHECK-A32: %struct.int64x1x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int64x1x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int64x1x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast 
%struct.int64x1x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* @@ -402,7 +402,7 @@ int64x1x3_t test_vld1_s64_x3(int64_t const *a) { // CHECK-LABEL: @test_vld1_s64_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x1x4_t, align 8 -// CHECK-A32: %struct.int64x1x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int64x1x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int64x1x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x1x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* @@ -422,7 +422,7 @@ int64x1x4_t test_vld1_s64_x4(int64_t const *a) { // CHECK-LABEL: @test_vld1_s8_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x8x2_t, align 8 -// CHECK-A32: %struct.int8x8x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int8x8x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i8.p0i8(i8* %a) @@ -440,7 +440,7 @@ int8x8x2_t test_vld1_s8_x2(int8_t const *a) { // CHECK-LABEL: @test_vld1_s8_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x8x3_t, align 8 -// CHECK-A32: %struct.int8x8x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int8x8x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x3_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i8.p0i8(i8* %a) @@ -458,7 +458,7 @@ int8x8x3_t test_vld1_s8_x3(int8_t const *a) { // CHECK-LABEL: @test_vld1_s8_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x8x4_t, align 8 -// CHECK-A32: %struct.int8x8x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int8x8x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int8x8x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x4_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i8.p0i8(i8* %a) @@ -476,7 +476,7 @@ int8x8x4_t test_vld1_s8_x4(int8_t const *a) { // CHECK-LABEL: @test_vld1_u16_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x4x2_t, align 8 -// CHECK-A32: %struct.uint16x4x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint16x4x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -496,7 +496,7 @@ uint16x4x2_t test_vld1_u16_x2(uint16_t const *a) { // CHECK-LABEL: @test_vld1_u16_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x4x3_t, align 8 -// CHECK-A32: %struct.uint16x4x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint16x4x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -516,7 +516,7 @@ uint16x4x3_t test_vld1_u16_x3(uint16_t const *a) { // CHECK-LABEL: @test_vld1_u16_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x4x4_t, align 8 -// CHECK-A32: %struct.uint16x4x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint16x4x4_t* noalias sret align 8 
[[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint16x4x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -536,7 +536,7 @@ uint16x4x4_t test_vld1_u16_x4(uint16_t const *a) { // CHECK-LABEL: @test_vld1_u32_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x2x2_t, align 8 -// CHECK-A32: %struct.uint32x2x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint32x2x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* @@ -556,7 +556,7 @@ uint32x2x2_t test_vld1_u32_x2(uint32_t const *a) { // CHECK-LABEL: @test_vld1_u32_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x2x3_t, align 8 -// CHECK-A32: %struct.uint32x2x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint32x2x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* @@ -576,7 +576,7 @@ uint32x2x3_t test_vld1_u32_x3(uint32_t const *a) { // CHECK-LABEL: @test_vld1_u32_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x2x4_t, align 8 -// CHECK-A32: %struct.uint32x2x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint32x2x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint32x2x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* @@ -596,7 +596,7 @@ uint32x2x4_t test_vld1_u32_x4(uint32_t const *a) { // CHECK-LABEL: @test_vld1_u64_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x1x2_t, align 8 -// CHECK-A32: %struct.uint64x1x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint64x1x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x2_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* @@ -616,7 +616,7 @@ uint64x1x2_t test_vld1_u64_x2(uint64_t const *a) { // CHECK-LABEL: @test_vld1_u64_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x1x3_t, align 8 -// CHECK-A32: %struct.uint64x1x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint64x1x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* @@ -636,7 +636,7 @@ uint64x1x3_t test_vld1_u64_x3(uint64_t const *a) { // CHECK-LABEL: @test_vld1_u64_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x1x4_t, align 8 -// CHECK-A32: %struct.uint64x1x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint64x1x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint64x1x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x1x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* @@ -656,7 +656,7 @@ uint64x1x4_t test_vld1_u64_x4(uint64_t const *a) { // CHECK-LABEL: @test_vld1_u8_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x8x2_t, align 8 -// CHECK-A32: %struct.uint8x8x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint8x8x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x2_t, align 8 
// CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v8i8.p0i8(i8* %a) @@ -674,7 +674,7 @@ uint8x8x2_t test_vld1_u8_x2(uint8_t const *a) { // CHECK-LABEL: @test_vld1_u8_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x8x3_t, align 8 -// CHECK-A32: %struct.uint8x8x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint8x8x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x3_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x3_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v8i8.p0i8(i8* %a) @@ -692,7 +692,7 @@ uint8x8x3_t test_vld1_u8_x3(uint8_t const *a) { // CHECK-LABEL: @test_vld1_u8_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x8x4_t, align 8 -// CHECK-A32: %struct.uint8x8x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint8x8x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint8x8x4_t, align 8 // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x4_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v8i8.p0i8(i8* %a) @@ -710,7 +710,7 @@ uint8x8x4_t test_vld1_u8_x4(uint8_t const *a) { // CHECK-LABEL: @test_vld1q_f16_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x8x2_t, align 16 -// CHECK-A32: %struct.float16x8x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.float16x8x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x2_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* @@ -730,7 +730,7 @@ float16x8x2_t test_vld1q_f16_x2(float16_t const *a) { // CHECK-LABEL: @test_vld1q_f16_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x8x3_t, align 16 -// CHECK-A32: %struct.float16x8x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.float16x8x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x3_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* @@ -750,7 +750,7 @@ float16x8x3_t test_vld1q_f16_x3(float16_t const *a) { // CHECK-LABEL: @test_vld1q_f16_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float16x8x4_t, align 16 -// CHECK-A32: %struct.float16x8x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.float16x8x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.float16x8x4_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast half* %a to i8* @@ -770,7 +770,7 @@ float16x8x4_t test_vld1q_f16_x4(float16_t const *a) { // CHECK-LABEL: @test_vld1q_f32_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x4x2_t, align 16 -// CHECK-A32: %struct.float32x4x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.float32x4x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x2_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* @@ -790,7 +790,7 @@ float32x4x2_t test_vld1q_f32_x2(float32_t const *a) { // CHECK-LABEL: @test_vld1q_f32_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca 
%struct.float32x4x3_t, align 16 -// CHECK-A32: %struct.float32x4x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.float32x4x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x3_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* @@ -810,7 +810,7 @@ float32x4x3_t test_vld1q_f32_x3(float32_t const *a) { // CHECK-LABEL: @test_vld1q_f32_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.float32x4x4_t, align 16 -// CHECK-A32: %struct.float32x4x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.float32x4x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.float32x4x4_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast float* %a to i8* @@ -830,7 +830,7 @@ float32x4x4_t test_vld1q_f32_x4(float32_t const *a) { // CHECK-LABEL: @test_vld1q_p16_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x8x2_t, align 16 -// CHECK-A32: %struct.poly16x8x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.poly16x8x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -850,7 +850,7 @@ poly16x8x2_t test_vld1q_p16_x2(poly16_t const *a) { // CHECK-LABEL: @test_vld1q_p16_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x8x3_t, align 16 -// CHECK-A32: %struct.poly16x8x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.poly16x8x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -870,7 +870,7 @@ poly16x8x3_t test_vld1q_p16_x3(poly16_t const *a) { // CHECK-LABEL: @test_vld1q_p16_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly16x8x4_t, align 16 -// CHECK-A32: %struct.poly16x8x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.poly16x8x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -890,7 +890,7 @@ poly16x8x4_t test_vld1q_p16_x4(poly16_t const *a) { // CHECK-LABEL: @test_vld1q_p8_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x16x2_t, align 16 -// CHECK-A32: %struct.poly8x16x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.poly8x16x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v16i8.p0i8(i8* %a) @@ -908,7 +908,7 @@ poly8x16x2_t test_vld1q_p8_x2(poly8_t const *a) { // CHECK-LABEL: @test_vld1q_p8_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x16x3_t, align 16 -// CHECK-A32: %struct.poly8x16x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.poly8x16x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } 
@llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v16i8.p0i8(i8* %a) @@ -926,7 +926,7 @@ poly8x16x3_t test_vld1q_p8_x3(poly8_t const *a) { // CHECK-LABEL: @test_vld1q_p8_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.poly8x16x4_t, align 16 -// CHECK-A32: %struct.poly8x16x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.poly8x16x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v16i8.p0i8(i8* %a) @@ -944,7 +944,7 @@ poly8x16x4_t test_vld1q_p8_x4(poly8_t const *a) { // CHECK-LABEL: @test_vld1q_s16_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x8x2_t, align 16 -// CHECK-A32: %struct.int16x8x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int16x8x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x2_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -964,7 +964,7 @@ int16x8x2_t test_vld1q_s16_x2(int16_t const *a) { // CHECK-LABEL: @test_vld1q_s16_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x8x3_t, align 16 -// CHECK-A32: %struct.int16x8x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int16x8x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x3_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -984,7 +984,7 @@ int16x8x3_t test_vld1q_s16_x3(int16_t const *a) { // CHECK-LABEL: @test_vld1q_s16_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int16x8x4_t, align 16 -// CHECK-A32: %struct.int16x8x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int16x8x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int16x8x4_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -1004,7 +1004,7 @@ int16x8x4_t test_vld1q_s16_x4(int16_t const *a) { // CHECK-LABEL: @test_vld1q_s32_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x4x2_t, align 16 -// CHECK-A32: %struct.int32x4x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int32x4x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x2_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* @@ -1024,7 +1024,7 @@ int32x4x2_t test_vld1q_s32_x2(int32_t const *a) { // CHECK-LABEL: @test_vld1q_s32_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x4x3_t, align 16 -// CHECK-A32: %struct.int32x4x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int32x4x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x3_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* @@ -1044,7 +1044,7 @@ int32x4x3_t test_vld1q_s32_x3(int32_t const *a) { // CHECK-LABEL: @test_vld1q_s32_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int32x4x4_t, align 16 -// CHECK-A32: %struct.int32x4x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int32x4x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int32x4x4_t, align {{16|8}} // 
CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* @@ -1064,7 +1064,7 @@ int32x4x4_t test_vld1q_s32_x4(int32_t const *a) { // CHECK-LABEL: @test_vld1q_s64_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x2x2_t, align 16 -// CHECK-A32: %struct.int64x2x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int64x2x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int64x2x2_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* @@ -1084,7 +1084,7 @@ int64x2x2_t test_vld1q_s64_x2(int64_t const *a) { // CHECK-LABEL: @test_vld1q_s64_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x2x3_t, align 16 -// CHECK-A32: %struct.int64x2x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int64x2x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int64x2x3_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* @@ -1104,7 +1104,7 @@ int64x2x3_t test_vld1q_s64_x3(int64_t const *a) { // CHECK-LABEL: @test_vld1q_s64_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int64x2x4_t, align 16 -// CHECK-A32: %struct.int64x2x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int64x2x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int64x2x4_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.int64x2x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* @@ -1124,7 +1124,7 @@ int64x2x4_t test_vld1q_s64_x4(int64_t const *a) { // CHECK-LABEL: @test_vld1q_s8_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x16x2_t, align 16 -// CHECK-A32: %struct.int8x16x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int8x16x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int8x16x2_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v16i8.p0i8(i8* %a) @@ -1142,7 +1142,7 @@ int8x16x2_t test_vld1q_s8_x2(int8_t const *a) { // CHECK-LABEL: @test_vld1q_s8_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x16x3_t, align 16 -// CHECK-A32: %struct.int8x16x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int8x16x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int8x16x3_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v16i8.p0i8(i8* %a) @@ -1160,7 +1160,7 @@ int8x16x3_t test_vld1q_s8_x3(int8_t const *a) { // CHECK-LABEL: @test_vld1q_s8_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.int8x16x4_t, align 16 -// CHECK-A32: %struct.int8x16x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.int8x16x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.int8x16x4_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v16i8.p0i8(i8* %a) @@ -1178,7 +1178,7 @@ int8x16x4_t test_vld1q_s8_x4(int8_t const *a) { // CHECK-LABEL: @test_vld1q_u16_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x8x2_t, align 16 -// 
CHECK-A32: %struct.uint16x8x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint16x8x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -1198,7 +1198,7 @@ uint16x8x2_t test_vld1q_u16_x2(uint16_t const *a) { // CHECK-LABEL: @test_vld1q_u16_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x8x3_t, align 16 -// CHECK-A32: %struct.uint16x8x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint16x8x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -1218,7 +1218,7 @@ uint16x8x3_t test_vld1q_u16_x3(uint16_t const *a) { // CHECK-LABEL: @test_vld1q_u16_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint16x8x4_t, align 16 -// CHECK-A32: %struct.uint16x8x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint16x8x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i16* %a to i8* @@ -1238,7 +1238,7 @@ uint16x8x4_t test_vld1q_u16_x4(uint16_t const *a) { // CHECK-LABEL: @test_vld1q_u32_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x4x2_t, align 16 -// CHECK-A32: %struct.uint32x4x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint32x4x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* @@ -1258,7 +1258,7 @@ uint32x4x2_t test_vld1q_u32_x2(uint32_t const *a) { // CHECK-LABEL: @test_vld1q_u32_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x4x3_t, align 16 -// CHECK-A32: %struct.uint32x4x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint32x4x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* @@ -1278,7 +1278,7 @@ uint32x4x3_t test_vld1q_u32_x3(uint32_t const *a) { // CHECK-LABEL: @test_vld1q_u32_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint32x4x4_t, align 16 -// CHECK-A32: %struct.uint32x4x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint32x4x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i32* %a to i8* @@ -1298,7 +1298,7 @@ uint32x4x4_t test_vld1q_u32_x4(uint32_t const *a) { // CHECK-LABEL: @test_vld1q_u64_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x2x2_t, align 16 -// CHECK-A32: %struct.uint64x2x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint64x2x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x2_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x2_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* @@ -1318,7 +1318,7 @@ uint64x2x2_t test_vld1q_u64_x2(uint64_t const *a) { // CHECK-LABEL: @test_vld1q_u64_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x2x3_t, 
align 16 -// CHECK-A32: %struct.uint64x2x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint64x2x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x3_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x3_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* @@ -1338,7 +1338,7 @@ uint64x2x3_t test_vld1q_u64_x3(uint64_t const *a) { // CHECK-LABEL: @test_vld1q_u64_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint64x2x4_t, align 16 -// CHECK-A32: %struct.uint64x2x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint64x2x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint64x2x4_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.uint64x2x4_t* [[__RET]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i64* %a to i8* @@ -1358,7 +1358,7 @@ uint64x2x4_t test_vld1q_u64_x4(uint64_t const *a) { // CHECK-LABEL: @test_vld1q_u8_x2( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x16x2_t, align 16 -// CHECK-A32: %struct.uint8x16x2_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint8x16x2_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x2|arm.neon.vld1x2}}.v16i8.p0i8(i8* %a) @@ -1376,7 +1376,7 @@ uint8x16x2_t test_vld1q_u8_x2(uint8_t const *a) { // CHECK-LABEL: @test_vld1q_u8_x3( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x16x3_t, align 16 -// CHECK-A32: %struct.uint8x16x3_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint8x16x3_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x3|arm.neon.vld1x3}}.v16i8.p0i8(i8* %a) @@ -1394,7 +1394,7 @@ uint8x16x3_t test_vld1q_u8_x3(uint8_t const *a) { // CHECK-LABEL: @test_vld1q_u8_x4( // CHECK-A64: [[RETVAL:%.*]] = alloca %struct.uint8x16x4_t, align 16 -// CHECK-A32: %struct.uint8x16x4_t* noalias sret [[RETVAL:%.*]], +// CHECK-A32: %struct.uint8x16x4_t* noalias sret align 8 [[RETVAL:%.*]], // CHECK: [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align {{16|8}} // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8* // CHECK: [[VLD1XN:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.{{aarch64.neon.ld1x4|arm.neon.vld1x4}}.v16i8.p0i8(i8* %a) diff --git a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c index 4b48ba01c4bcd..a0896c7aa4f22 100644 --- a/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c +++ b/clang/test/CodeGen/arm-v8.2a-neon-intrinsics.c @@ -773,19 +773,23 @@ float16x8_t test_vfmsq_f16(float16x8_t a, float16x8_t b, float16x8_t c) { } // CHECK-LABEL: test_vmul_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x half> [[A:%.*]], [[LANE]] // CHECK: ret <4 x half> [[MUL]] float16x4_t test_vmul_lane_f16(float16x4_t a, float16x4_t b) { return vmul_lane_f16(a, b, 
3); } // CHECK-LABEL: test_vmulq_lane_f16 -// CHECK: [[TMP0:%.*]] = shufflevector <4 x half> %b, <4 x half> %b, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> -// CHECK: [[MUL:%.*]] = fmul <8 x half> %a, [[TMP0]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: [[MUL:%.*]] = fmul <8 x half> [[A:%.*]], [[LANE]] // CHECK: ret <8 x half> [[MUL]] float16x8_t test_vmulq_lane_f16(float16x8_t a, float16x4_t b) { - return vmulq_lane_f16(a, b, 7); + return vmulq_lane_f16(a, b, 3); } // CHECK-LABEL: test_vmul_n_f16
@@ -939,17 +943,21 @@ float16x8_t test_vdupq_n_f16(float16_t a) { } // CHECK-LABEL: test_vdup_lane_f16 -// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK: ret <4 x half> [[SHFL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <4 x half> [[LANE]] float16x4_t test_vdup_lane_f16(float16x4_t a) { return vdup_lane_f16(a, 3); } // CHECK-LABEL: test_vdupq_lane_f16 -// CHECK: [[SHFL:%.*]] = shufflevector <4 x half> %a, <4 x half> %a, <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7> -// CHECK: ret <8 x half> [[SHFL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x half> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half> +// CHECK: [[LANE:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <8 x half> [[LANE]] float16x8_t test_vdupq_lane_f16(float16x4_t a) { - return vdupq_lane_f16(a, 7); + return vdupq_lane_f16(a, 3); } // CHECK-LABEL: @test_vext_f16(
diff --git a/clang/test/CodeGen/arm-varargs.c b/clang/test/CodeGen/arm-varargs.c index 1f5c07ef57dad..dff62568b6cae 100644 --- a/clang/test/CodeGen/arm-varargs.c +++ b/clang/test/CodeGen/arm-varargs.c @@ -24,7 +24,7 @@ struct bigstruct { }; struct bigstruct simple_struct(void) { -// CHECK-LABEL: define void @simple_struct(%struct.bigstruct* noalias sret %agg.result) +// CHECK-LABEL: define void @simple_struct(%struct.bigstruct* noalias sret align 4 %agg.result) return va_arg(the_list, struct bigstruct); // CHECK: [[CUR:%[a-z0-9._]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0), align 4 // CHECK: [[NEXT:%[a-z0-9._]+]] = getelementptr inbounds i8, i8* [[CUR]], i32 40
@@ -42,7 +42,7 @@ struct aligned_bigstruct { }; struct aligned_bigstruct simple_aligned_struct(void) { -// CHECK-LABEL: define void @simple_aligned_struct(%struct.aligned_bigstruct* noalias sret %agg.result) +// CHECK-LABEL: define void @simple_aligned_struct(%struct.aligned_bigstruct* noalias sret align 8 %agg.result) return va_arg(the_list, struct aligned_bigstruct); // CHECK: [[CUR:%[a-z0-9._]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0), align 4 // CHECK: [[CUR_INT:%[a-z0-9._]+]] = ptrtoint i8* [[CUR]] to i32
@@ -78,7 +78,7 @@ struct hfa { }; struct hfa simple_hfa(void) { -// CHECK-LABEL: define void @simple_hfa(%struct.hfa* noalias sret %agg.result) +// CHECK-LABEL: define void @simple_hfa(%struct.hfa* noalias sret align 4 %agg.result) return va_arg(the_list, struct hfa); // CHECK: [[CUR:%[a-z0-9._]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0), align 4 // CHECK: [[NEXT:%[a-z0-9._]+]] = getelementptr
inbounds i8, i8* [[CUR]], i32 8 @@ -185,7 +185,7 @@ typedef struct __attribute__((aligned(16))) { int val; } overaligned_int_struct; overaligned_int_struct overaligned_int_struct_test() { -// CHECK-LABEL: define void @overaligned_int_struct_test(%struct.overaligned_int_struct* noalias sret %agg.result) +// CHECK-LABEL: define void @overaligned_int_struct_test(%struct.overaligned_int_struct* noalias sret align 16 %agg.result) return va_arg(the_list, overaligned_int_struct); // CHECK: [[CUR:%[a-z0-9._]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0), align 4 // CHECK: [[NEXT:%[a-z0-9._]+]] = getelementptr inbounds i8, i8* [[CUR]], i32 16 @@ -201,7 +201,7 @@ typedef struct __attribute__((packed,aligned(2))) { long long val; } underaligned_long_long_struct; underaligned_long_long_struct underaligned_long_long_struct_test() { -// CHECK-LABEL: define void @underaligned_long_long_struct_test(%struct.underaligned_long_long_struct* noalias sret %agg.result) +// CHECK-LABEL: define void @underaligned_long_long_struct_test(%struct.underaligned_long_long_struct* noalias sret align 2 %agg.result) return va_arg(the_list, underaligned_long_long_struct); // CHECK: [[CUR:%[a-z0-9._]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0), align 4 // CHECK: [[NEXT:%[a-z0-9._]+]] = getelementptr inbounds i8, i8* [[CUR]], i32 8 @@ -217,7 +217,7 @@ typedef struct __attribute__((aligned(16))) { long long val; } overaligned_long_long_struct; overaligned_long_long_struct overaligned_long_long_struct_test() { -// CHECK-LABEL: define void @overaligned_long_long_struct_test(%struct.overaligned_long_long_struct* noalias sret %agg.result) +// CHECK-LABEL: define void @overaligned_long_long_struct_test(%struct.overaligned_long_long_struct* noalias sret align 16 %agg.result) return va_arg(the_list, overaligned_long_long_struct); // CHECK: [[CUR:%[a-z0-9._]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0), align 4 // CHECK: [[CUR_INT:%[a-z0-9._]+]] = ptrtoint i8* [[CUR]] to i32 @@ -259,7 +259,7 @@ typedef struct { int val __attribute__((aligned(16))); } overaligned_int_struct_member; overaligned_int_struct_member overaligned_int_struct_member_test() { -// CHECK-LABEL: define void @overaligned_int_struct_member_test(%struct.overaligned_int_struct_member* noalias sret %agg.result) +// CHECK-LABEL: define void @overaligned_int_struct_member_test(%struct.overaligned_int_struct_member* noalias sret align 16 %agg.result) return va_arg(the_list, overaligned_int_struct_member); // CHECK: [[CUR:%[a-z0-9._]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0), align 4 // CHECK: [[CUR_INT:%[a-z0-9._]+]] = ptrtoint i8* [[CUR]] to i32 @@ -279,7 +279,7 @@ typedef struct { long long val __attribute__((packed,aligned(2))); } underaligned_long_long_struct_member; underaligned_long_long_struct_member underaligned_long_long_struct_member_test() { -// CHECK-LABEL: define void @underaligned_long_long_struct_member_test(%struct.underaligned_long_long_struct_member* noalias sret %agg.result) +// CHECK-LABEL: define void @underaligned_long_long_struct_member_test(%struct.underaligned_long_long_struct_member* noalias sret align 2 %agg.result) return va_arg(the_list, underaligned_long_long_struct_member); // CHECK: [[CUR:%[a-z0-9._]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 
0, i32 0), align 4 // CHECK: [[NEXT:%[a-z0-9._]+]] = getelementptr inbounds i8, i8* [[CUR]], i32 8 @@ -295,7 +295,7 @@ typedef struct { long long val __attribute__((aligned(16))); } overaligned_long_long_struct_member; overaligned_long_long_struct_member overaligned_long_long_struct_member_test() { -// CHECK-LABEL: define void @overaligned_long_long_struct_member_test(%struct.overaligned_long_long_struct_member* noalias sret %agg.result) +// CHECK-LABEL: define void @overaligned_long_long_struct_member_test(%struct.overaligned_long_long_struct_member* noalias sret align 16 %agg.result) return va_arg(the_list, overaligned_long_long_struct_member); // CHECK: [[CUR:%[a-z0-9._]+]] = load i8*, i8** getelementptr inbounds (%struct.__va_list, %struct.__va_list* @the_list, i32 0, i32 0), align 4 // CHECK: [[CUR_INT:%[a-z0-9._]+]] = ptrtoint i8* [[CUR]] to i32 diff --git a/clang/test/CodeGen/arm-vector-arguments.c b/clang/test/CodeGen/arm-vector-arguments.c index 9bdddb72696e3..aa8e65ba366f4 100644 --- a/clang/test/CodeGen/arm-vector-arguments.c +++ b/clang/test/CodeGen/arm-vector-arguments.c @@ -9,7 +9,7 @@ #include -// CHECK: define void @f0(%struct.int8x16x2_t* noalias sret %agg.result, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) +// CHECK: define void @f0(%struct.int8x16x2_t* noalias sret align 16 %agg.result, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}) int8x16x2_t f0(int8x16_t a0, int8x16_t a1) { return vzipq_s8(a0, a1); } @@ -25,7 +25,7 @@ typedef float T_float32x16 __attribute__ ((__vector_size__ (64))); T_float32x2 f1_0(T_float32x2 a0) { return a0; } // CHECK: define <4 x float> @f1_1(<4 x float> %{{.*}}) T_float32x4 f1_1(T_float32x4 a0) { return a0; } -// CHECK: define void @f1_2(<8 x float>* noalias sret %{{.*}}, <8 x float> %{{.*}}) +// CHECK: define void @f1_2(<8 x float>* noalias sret align 32 %{{.*}}, <8 x float> %{{.*}}) T_float32x8 f1_2(T_float32x8 a0) { return a0; } -// CHECK: define void @f1_3(<16 x float>* noalias sret %{{.*}}, <16 x float> %{{.*}}) +// CHECK: define void @f1_3(<16 x float>* noalias sret align 64 %{{.*}}, <16 x float> %{{.*}}) T_float32x16 f1_3(T_float32x16 a0) { return a0; } diff --git a/clang/test/CodeGen/arm-vfp16-arguments.c b/clang/test/CodeGen/arm-vfp16-arguments.c index 32f1bb7b2a775..42d990d970862 100644 --- a/clang/test/CodeGen/arm-vfp16-arguments.c +++ b/clang/test/CodeGen/arm-vfp16-arguments.c @@ -71,6 +71,6 @@ void test_hfa(hfa_t a) {} hfa_t ghfa; hfa_t test_ret_hfa(void) { return ghfa; } -// CHECK-SOFT: define void @test_ret_hfa(%struct.hfa_t* noalias nocapture sret %agg.result) +// CHECK-SOFT: define void @test_ret_hfa(%struct.hfa_t* noalias nocapture sret align 8 %agg.result) // CHECK-HARD: define arm_aapcs_vfpcc [2 x <2 x i32>] @test_ret_hfa() // CHECK-FULL: define arm_aapcs_vfpcc %struct.hfa_t @test_ret_hfa() diff --git a/clang/test/CodeGen/arm-vfp16-arguments2.cpp b/clang/test/CodeGen/arm-vfp16-arguments2.cpp index e436a5ecd6abd..ccc81a3bfdd92 100644 --- a/clang/test/CodeGen/arm-vfp16-arguments2.cpp +++ b/clang/test/CodeGen/arm-vfp16-arguments2.cpp @@ -37,27 +37,27 @@ struct S5 : B1 { B1 M[1]; }; -// CHECK-SOFT: define void @_Z2f12S1(%struct.S1* noalias nocapture sret %agg.result, [2 x i64] %s1.coerce) +// CHECK-SOFT: define void @_Z2f12S1(%struct.S1* noalias nocapture sret align 8 %agg.result, [2 x i64] %s1.coerce) // CHECK-HARD: define arm_aapcs_vfpcc [2 x <2 x i32>] @_Z2f12S1([2 x <2 x i32>] returned %s1.coerce) // CHECK-FULL: define arm_aapcs_vfpcc %struct.S1 @_Z2f12S1(%struct.S1 returned %s1.coerce) struct S1 f1(struct S1 s1) { return s1; } -// 
CHECK-SOFT: define void @_Z2f22S2(%struct.S2* noalias nocapture sret %agg.result, [4 x i32] %s2.coerce) +// CHECK-SOFT: define void @_Z2f22S2(%struct.S2* noalias nocapture sret align 8 %agg.result, [4 x i32] %s2.coerce) // CHECK-HARD: define arm_aapcs_vfpcc [2 x <2 x i32>] @_Z2f22S2([2 x <2 x i32>] returned %s2.coerce) // CHECK-FULL: define arm_aapcs_vfpcc %struct.S2 @_Z2f22S2(%struct.S2 returned %s2.coerce) struct S2 f2(struct S2 s2) { return s2; } -// CHECK-SOFT: define void @_Z2f32S3(%struct.S3* noalias nocapture sret %agg.result, [2 x i64] %s3.coerce) +// CHECK-SOFT: define void @_Z2f32S3(%struct.S3* noalias nocapture sret align 8 %agg.result, [2 x i64] %s3.coerce) // CHECK-HARD: define arm_aapcs_vfpcc [2 x <2 x i32>] @_Z2f32S3([2 x <2 x i32>] returned %s3.coerce) // CHECK-FULL: define arm_aapcs_vfpcc %struct.S3 @_Z2f32S3(%struct.S3 returned %s3.coerce) struct S3 f3(struct S3 s3) { return s3; } -// CHECK-SOFT: define void @_Z2f42S4(%struct.S4* noalias nocapture sret %agg.result, [2 x i64] %s4.coerce) +// CHECK-SOFT: define void @_Z2f42S4(%struct.S4* noalias nocapture sret align 8 %agg.result, [2 x i64] %s4.coerce) // CHECK-HARD: define arm_aapcs_vfpcc [2 x <2 x i32>] @_Z2f42S4([2 x <2 x i32>] returned %s4.coerce) // CHECK-FULL: define arm_aapcs_vfpcc %struct.S4 @_Z2f42S4(%struct.S4 returned %s4.coerce) struct S4 f4(struct S4 s4) { return s4; } -// CHECK-SOFT: define void @_Z2f52S5(%struct.S5* noalias nocapture sret %agg.result, [2 x i64] %s5.coerce) +// CHECK-SOFT: define void @_Z2f52S5(%struct.S5* noalias nocapture sret align 8 %agg.result, [2 x i64] %s5.coerce) // CHECK-HARD: define arm_aapcs_vfpcc %struct.S5 @_Z2f52S5(%struct.S5 returned %s5.coerce) // CHECK-FULL: define arm_aapcs_vfpcc %struct.S5 @_Z2f52S5(%struct.S5 returned %s5.coerce) struct S5 f5(struct S5 s5) { return s5; } diff --git a/clang/test/CodeGen/arm64-arguments.c b/clang/test/CodeGen/arm64-arguments.c index 5c8474cae7be8..97332deb7c807 100644 --- a/clang/test/CodeGen/arm64-arguments.c +++ b/clang/test/CodeGen/arm64-arguments.c @@ -181,9 +181,9 @@ T_float32x2 f1_0(T_float32x2 a0) { return a0; } // CHECK: define <4 x float> @f1_1(<4 x float> %{{.*}}) T_float32x4 f1_1(T_float32x4 a0) { return a0; } // Vector with length bigger than 16-byte is illegal and is passed indirectly. -// CHECK: define void @f1_2(<8 x float>* noalias sret %{{.*}}, <8 x float>* %0) +// CHECK: define void @f1_2(<8 x float>* noalias sret align 16 %{{.*}}, <8 x float>* %0) T_float32x8 f1_2(T_float32x8 a0) { return a0; } -// CHECK: define void @f1_3(<16 x float>* noalias sret %{{.*}}, <16 x float>* %0) +// CHECK: define void @f1_3(<16 x float>* noalias sret align 16 %{{.*}}, <16 x float>* %0) T_float32x16 f1_3(T_float32x16 a0) { return a0; } // Testing alignment with aggregates: HFA, aggregates with size <= 16 bytes and diff --git a/clang/test/CodeGen/arm64-microsoft-arguments.cpp b/clang/test/CodeGen/arm64-microsoft-arguments.cpp index bca7cc94b3937..f5bcda756b061 100644 --- a/clang/test/CodeGen/arm64-microsoft-arguments.cpp +++ b/clang/test/CodeGen/arm64-microsoft-arguments.cpp @@ -28,8 +28,8 @@ S2 f2() { } // Pass and return for type size > 16 bytes. 
-// CHECK: define {{.*}} void @{{.*}}f3{{.*}}(%struct.S3* noalias sret %agg.result) -// CHECK: call void {{.*}}func3{{.*}}(%struct.S3* sret %agg.result, %struct.S3* %agg.tmp) +// CHECK: define {{.*}} void @{{.*}}f3{{.*}}(%struct.S3* noalias sret align 4 %agg.result) +// CHECK: call void {{.*}}func3{{.*}}(%struct.S3* sret align 4 %agg.result, %struct.S3* %agg.tmp) struct S3 { int a[5]; }; @@ -42,8 +42,8 @@ S3 f3() { // Pass and return aggregate (of size < 16 bytes) with non-trivial destructor. // Passed directly but returned indirectly. -// CHECK: define {{.*}} void {{.*}}f4{{.*}}(%struct.S4* inreg noalias sret %agg.result) -// CHECK: call void {{.*}}func4{{.*}}(%struct.S4* inreg sret %agg.result, [2 x i64] %5) +// CHECK: define {{.*}} void {{.*}}f4{{.*}}(%struct.S4* inreg noalias sret align 4 %agg.result) +// CHECK: call void {{.*}}func4{{.*}}(%struct.S4* inreg sret align 4 %agg.result, [2 x i64] %5) struct S4 { int a[3]; ~S4(); @@ -56,8 +56,8 @@ S4 f4() { } // Pass and return from instance method called from instance method. -// CHECK: define {{.*}} void @{{.*}}bar@Q1{{.*}}(%class.Q1* %this, %class.P1* inreg noalias sret %agg.result) -// CHECK: call void {{.*}}foo@P1{{.*}}(%class.P1* %ref.tmp, %class.P1* inreg sret %agg.result, i8 %1) +// CHECK: define {{.*}} void @{{.*}}bar@Q1{{.*}}(%class.Q1* %this, %class.P1* inreg noalias sret align 1 %agg.result) +// CHECK: call void {{.*}}foo@P1{{.*}}(%class.P1* %ref.tmp, %class.P1* inreg sret align 1 %agg.result, i8 %1) class P1 { public: @@ -76,7 +76,7 @@ P1 Q1::bar() { // Pass and return from instance method called from free function. // CHECK: define {{.*}} void {{.*}}bar{{.*}}() -// CHECK: call void {{.*}}foo@P2{{.*}}(%class.P2* %ref.tmp, %class.P2* inreg sret %retval, i8 %0) +// CHECK: call void {{.*}}foo@P2{{.*}}(%class.P2* %ref.tmp, %class.P2* inreg sret align 1 %retval, i8 %0) class P2 { public: P2 foo(P2 x); @@ -89,8 +89,8 @@ P2 bar() { // Pass and return an object with a user-provided constructor (passed directly, // returned indirectly) -// CHECK: define {{.*}} void @{{.*}}f5{{.*}}(%struct.S5* inreg noalias sret %agg.result) -// CHECK: call void {{.*}}func5{{.*}}(%struct.S5* inreg sret %agg.result, i64 {{.*}}) +// CHECK: define {{.*}} void @{{.*}}f5{{.*}}(%struct.S5* inreg noalias sret align 4 %agg.result) +// CHECK: call void {{.*}}func5{{.*}}(%struct.S5* inreg sret align 4 %agg.result, i64 {{.*}}) struct S5 { S5(); int x; @@ -146,8 +146,8 @@ struct S8 { int y; }; -// CHECK: define {{.*}} void {{.*}}?f8{{.*}}(%struct.S8* inreg noalias sret {{.*}}) -// CHECK: call void {{.*}}func8{{.*}}(%struct.S8* inreg sret {{.*}}, i64 {{.*}}) +// CHECK: define {{.*}} void {{.*}}?f8{{.*}}(%struct.S8* inreg noalias sret align 4 {{.*}}) +// CHECK: call void {{.*}}func8{{.*}}(%struct.S8* inreg sret align 4 {{.*}}, i64 {{.*}}) S8 func8(S8 x); S8 f8() { S8 x; @@ -157,8 +157,8 @@ S8 f8() { // Pass and return an object with a non-trivial copy-assignment operator and // a trivial copy constructor (passed directly, returned indirectly) -// CHECK: define {{.*}} void @"?f9@@YA?AUS9@@XZ"(%struct.S9* inreg noalias sret {{.*}}) -// CHECK: call void {{.*}}func9{{.*}}(%struct.S9* inreg sret {{.*}}, i64 {{.*}}) +// CHECK: define {{.*}} void @"?f9@@YA?AUS9@@XZ"(%struct.S9* inreg noalias sret align 4 {{.*}}) +// CHECK: call void {{.*}}func9{{.*}}(%struct.S9* inreg sret align 4 {{.*}}, i64 {{.*}}) struct S9 { S9& operator=(const S9&); int x; @@ -174,8 +174,8 @@ S9 f9() { // Pass and return an object with a base class (passed directly, returned // indirectly). 
-// CHECK: define dso_local void {{.*}}f10{{.*}}(%struct.S10* inreg noalias sret {{.*}}) -// CHECK: call void {{.*}}func10{{.*}}(%struct.S10* inreg sret {{.*}}, [2 x i64] {{.*}}) +// CHECK: define dso_local void {{.*}}f10{{.*}}(%struct.S10* inreg noalias sret align 4 {{.*}}) +// CHECK: call void {{.*}}func10{{.*}}(%struct.S10* inreg sret align 4 {{.*}}, [2 x i64] {{.*}}) struct S10 : public S1 { int x; }; @@ -189,8 +189,8 @@ S10 f10() { // Pass and return a non aggregate object exceeding > 128 bits (passed // indirectly, returned indirectly) -// CHECK: define dso_local void {{.*}}f11{{.*}}(%struct.S11* inreg noalias sret {{.*}}) -// CHECK: call void {{.*}}func11{{.*}}(%struct.S11* inreg sret {{.*}}, %struct.S11* {{.*}}) +// CHECK: define dso_local void {{.*}}f11{{.*}}(%struct.S11* inreg noalias sret align 8 {{.*}}) +// CHECK: call void {{.*}}func11{{.*}}(%struct.S11* inreg sret align 8 {{.*}}, %struct.S11* {{.*}}) struct S11 { virtual void f(); int a[5];
diff --git a/clang/test/CodeGen/arm64_32.c b/clang/test/CodeGen/arm64_32.c index 245dfefc99e3b..1fb121cfcfb14 100644 --- a/clang/test/CodeGen/arm64_32.c +++ b/clang/test/CodeGen/arm64_32.c @@ -27,4 +27,4 @@ long double LongDoubleVar = 0.0; typedef float __attribute__((ext_vector_type(16))) v16f32; v16f32 func(v16f32 in) { return in; } -// CHECK: define void @func(<16 x float>* noalias sret {{%.*}}, <16 x float> {{%.*}}) +// CHECK: define void @func(<16 x float>* noalias sret align 16 {{%.*}}, <16 x float> {{%.*}})
diff --git a/clang/test/CodeGen/arm64_vdupq_n_f64.c b/clang/test/CodeGen/arm64_vdupq_n_f64.c index 24c57c4f0de3a..5c6f61e7acf91 100644 --- a/clang/test/CodeGen/arm64_vdupq_n_f64.c +++ b/clang/test/CodeGen/arm64_vdupq_n_f64.c @@ -28,7 +28,9 @@ float32x4_t test_vdupq_n_f32(float32_t w) { // this was in , but had already been implemented, // test anyway // CHECK-LABEL: define <2 x double> @test_vdupq_lane_f64(<1 x double> %V) #0 { -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> %V, <1 x double> %V, <2 x i32> zeroinitializer +// CHECK: [[TMP0:%.*]] = bitcast <1 x double> %V to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x double> +// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x double> [[TMP1]], <1 x double> [[TMP1]], <2 x i32> zeroinitializer // CHECK: ret <2 x double> [[SHUFFLE]] float64x2_t test_vdupq_lane_f64(float64x1_t V) { return vdupq_lane_f64(V, 0);
diff --git a/clang/test/CodeGen/arm_neon_intrinsics.c b/clang/test/CodeGen/arm_neon_intrinsics.c index 9f1a64554155c..bff9f6a2e30fc 100644 --- a/clang/test/CodeGen/arm_neon_intrinsics.c +++ b/clang/test/CodeGen/arm_neon_intrinsics.c @@ -2419,15 +2419,19 @@ uint8x8_t test_vdup_lane_u8(uint8x8_t a) { } // CHECK-LABEL: @test_vdup_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK: ret <4 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <4 x i16> [[LANE]] uint16x4_t test_vdup_lane_u16(uint16x4_t a) { return vdup_lane_u16(a, 3); } // CHECK-LABEL: @test_vdup_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1> -// CHECK: ret <2 x i32> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1> +// CHECK: ret <2 x i32> [[LANE]] uint32x2_t test_vdup_lane_u32(uint32x2_t a) { return vdup_lane_u32(a, 1); }
@@ -2440,15 +2444,19 @@ int8x8_t test_vdup_lane_s8(int8x8_t a) { return vdup_lane_s8(a, 7); } // CHECK-LABEL: @test_vdup_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK: ret <4 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <4 x i16> [[LANE]] int16x4_t test_vdup_lane_s16(int16x4_t a) { return vdup_lane_s16(a, 3); } // CHECK-LABEL: @test_vdup_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <2 x i32> <i32 1, i32 1> -// CHECK: ret <2 x i32> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> <i32 1, i32 1> +// CHECK: ret <2 x i32> [[LANE]] int32x2_t test_vdup_lane_s32(int32x2_t a) { return vdup_lane_s32(a, 1); }
@@ -2461,15 +2469,19 @@ poly8x8_t test_vdup_lane_p8(poly8x8_t a) { } // CHECK-LABEL: @test_vdup_lane_p16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <4 x i32> <i32 3, i32 3, i32 3, i32 3> -// CHECK: ret <4 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> <i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <4 x i16> [[LANE]] poly16x4_t test_vdup_lane_p16(poly16x4_t a) { return vdup_lane_p16(a, 3); } // CHECK-LABEL: @test_vdup_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <2 x i32> <i32 1, i32 1> -// CHECK: ret <2 x float> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> <i32 1, i32 1> +// CHECK: ret <2 x float> [[LANE]] float32x2_t test_vdup_lane_f32(float32x2_t a) { return vdup_lane_f32(a, 1); }
@@ -2482,15 +2494,19 @@ uint8x16_t test_vdupq_lane_u8(uint8x8_t a) { } // CHECK-LABEL: @test_vdupq_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK: ret <8 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <8 x i16> [[LANE]] uint16x8_t test_vdupq_lane_u16(uint16x4_t a) { return vdupq_lane_u16(a, 3); } // CHECK-LABEL: @test_vdupq_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1> -// CHECK: ret <4 x i32> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x i32> [[LANE]] uint32x4_t test_vdupq_lane_u32(uint32x2_t a) { return vdupq_lane_u32(a, 1); }
@@ -2503,15 +2519,19 @@ int8x16_t test_vdupq_lane_s8(int8x8_t a) { } // CHECK-LABEL: @test_vdupq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK: ret <8 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <8 x i16> [[LANE]] int16x8_t test_vdupq_lane_s16(int16x4_t a) { return vdupq_lane_s16(a, 3); } // CHECK-LABEL: @test_vdupq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %a, <2 x i32> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1> -// CHECK: ret <4 x i32> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x i32> [[LANE]] int32x4_t test_vdupq_lane_s32(int32x2_t a) { return vdupq_lane_s32(a, 1); }
@@ -2524,43 +2544,55 @@ poly8x16_t test_vdupq_lane_p8(poly8x8_t a) { } // CHECK-LABEL: @test_vdupq_lane_p16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %a, <4 x i16> %a, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> -// CHECK: ret <8 x i16> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3> +// CHECK: ret <8 x i16> [[LANE]] poly16x8_t test_vdupq_lane_p16(poly16x4_t a) { return vdupq_lane_p16(a, 3); } // CHECK-LABEL: @test_vdupq_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %a, <2 x float> %a, <4 x i32> <i32 1, i32 1, i32 1, i32 1> -// CHECK: ret <4 x float> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> <i32 1, i32 1, i32 1, i32 1> +// CHECK: ret <4 x float> [[LANE]] float32x4_t test_vdupq_lane_f32(float32x2_t a) { return vdupq_lane_f32(a, 1); } // CHECK-LABEL: @test_vdup_lane_s64( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[LANE]] int64x1_t test_vdup_lane_s64(int64x1_t a) { return vdup_lane_s64(a, 0); } // CHECK-LABEL: @test_vdup_lane_u64( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <1 x i32> zeroinitializer -// CHECK: ret <1 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <1 x i32> zeroinitializer +// CHECK: ret <1 x i64> [[LANE]] uint64x1_t test_vdup_lane_u64(uint64x1_t a) { return vdup_lane_u64(a, 0); } // CHECK-LABEL: @test_vdupq_lane_s64( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[LANE]] int64x2_t test_vdupq_lane_s64(int64x1_t a) { return vdupq_lane_s64(a, 0); } // CHECK-LABEL: @test_vdupq_lane_u64( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <1 x i64> %a, <1 x i64> %a, <2 x i32> zeroinitializer -// CHECK: ret <2 x i64> [[SHUFFLE]] +// CHECK: 
[[TMP0:%.*]] = bitcast <1 x i64> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <1 x i64> +// CHECK: [[LANE:%.*]] = shufflevector <1 x i64> [[TMP1]], <1 x i64> [[TMP1]], <2 x i32> zeroinitializer +// CHECK: ret <2 x i64> [[LANE]] uint64x2_t test_vdupq_lane_u64(uint64x1_t a) { return vdupq_lane_u64(a, 0); } @@ -7077,44 +7109,52 @@ uint64x2_t test_vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { } // CHECK-LABEL: @test_vmlal_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlal_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmlal_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] int64x2_t test_vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlal_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmlal_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <4 x 
i32> [[ADD]] uint32x4_t test_vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlal_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmlal_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[ADD:%.*]] = add <2 x i64> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[ADD:%.*]] = add <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <2 x i64> [[ADD]] uint64x2_t test_vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlal_lane_u32(a, b, c, 1); @@ -7173,90 +7213,110 @@ uint64x2_t test_vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { } // CHECK-LABEL: @test_vmla_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i16> [[ADD]] int16x4_t test_vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmla_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmla_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <2 x i32> [[ADD]] int32x2_t test_vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmla_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmla_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <4 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <4 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i16> [[ADD]] uint16x4_t test_vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmla_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmla_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = mul 
<2 x i32> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <2 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <2 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <2 x i32> [[ADD]] uint32x2_t test_vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmla_lane_u32(a, b, c, 1); } // CHECK-LABEL: @test_vmla_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <2 x float> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <2 x float> [[A:%.*]], [[MUL]] // CHECK: ret <2 x float> [[ADD]] float32x2_t test_vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmla_lane_f32(a, b, c, 1); } // CHECK-LABEL: @test_vmlaq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <8 x i16> [[ADD]] int16x8_t test_vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { return vmlaq_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmlaq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i32> [[ADD]] int32x4_t test_vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { return vmlaq_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmlaq_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <8 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <8 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <8 x i16> [[ADD]] uint16x8_t test_vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) { return vmlaq_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmlaq_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 
x i32> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = add <4 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = add <4 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i32> [[ADD]] uint32x4_t test_vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) { return vmlaq_lane_u32(a, b, c, 1); } // CHECK-LABEL: @test_vmlaq_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[ADD:%.*]] = fadd <4 x float> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[ADD:%.*]] = fadd <4 x float> [[A:%.*]], [[MUL]] // CHECK: ret <4 x float> [[ADD]] float32x4_t test_vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) { return vmlaq_lane_f32(a, b, c, 1); @@ -7553,44 +7613,52 @@ uint64x2_t test_vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { } // CHECK-LABEL: @test_vmlsl_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] int32x4_t test_vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vmlsl_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmlsl_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] int64x2_t test_vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vmlsl_lane_s32(a, b, c, 1); } // 
CHECK-LABEL: @test_vmlsl_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) { return vmlsl_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmlsl_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[SUB:%.*]] = sub <2 x i64> %a, [[VMULL2_I]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[SUB:%.*]] = sub <2 x i64> [[A:%.*]], [[VMULL2_I]] // CHECK: ret <2 x i64> [[SUB]] uint64x2_t test_vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) { return vmlsl_lane_u32(a, b, c, 1); @@ -7649,90 +7717,110 @@ uint64x2_t test_vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) { } // CHECK-LABEL: @test_vmls_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i16> [[SUB]] int16x4_t test_vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t c) { return vmls_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmls_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <2 x i32> [[SUB]] 
int32x2_t test_vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t c) { return vmls_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmls_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <4 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <4 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i16> [[SUB]] uint16x4_t test_vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) { return vmls_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmls_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <2 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <2 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <2 x i32> [[SUB]] uint32x2_t test_vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) { return vmls_lane_u32(a, b, c, 1); } // CHECK-LABEL: @test_vmls_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <2 x float> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <2 x float> [[A:%.*]], [[MUL]] // CHECK: ret <2 x float> [[SUB]] float32x2_t test_vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t c) { return vmls_lane_f32(a, b, c, 1); } // CHECK-LABEL: @test_vmlsq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <8 x i16> [[SUB]] int16x8_t test_vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t c) { return vmlsq_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vmlsq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i32> [[SUB]] 
int32x4_t test_vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t c) { return vmlsq_lane_s32(a, b, c, 1); } // CHECK-LABEL: @test_vmlsq_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <8 x i16> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <8 x i16> [[A:%.*]], [[MUL]] // CHECK: ret <8 x i16> [[SUB]] uint16x8_t test_vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t c) { return vmlsq_lane_u16(a, b, c, 3); } // CHECK-LABEL: @test_vmlsq_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = sub <4 x i32> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = sub <4 x i32> [[A:%.*]], [[MUL]] // CHECK: ret <4 x i32> [[SUB]] uint32x4_t test_vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t c) { return vmlsq_lane_u32(a, b, c, 1); } // CHECK-LABEL: @test_vmlsq_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %c, <2 x float> %c, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %b, [[SHUFFLE]] -// CHECK: [[SUB:%.*]] = fsub <4 x float> %a, [[MUL]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[B:%.*]], [[LANE]] +// CHECK: [[SUB:%.*]] = fsub <4 x float> [[A:%.*]], [[MUL]] // CHECK: ret <4 x float> [[SUB]] float32x4_t test_vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t c) { return vmlsq_lane_f32(a, b, c, 1); @@ -8404,40 +8492,48 @@ poly16x8_t test_vmull_p8(poly8x8_t a, poly8x8_t b) { } // CHECK-LABEL: @test_vmull_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: ret <4 x i32> [[VMULL2_I]] int32x4_t test_vmull_lane_s16(int16x4_t a, int16x4_t b) { return vmull_lane_s16(a, b, 3); } // CHECK-LABEL: @test_vmull_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: 
[[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: ret <2 x i64> [[VMULL2_I]] int64x2_t test_vmull_lane_s32(int32x2_t a, int32x2_t b) { return vmull_lane_s32(a, b, 1); } // CHECK-LABEL: @test_vmull_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: ret <4 x i32> [[VMULL2_I]] uint32x4_t test_vmull_lane_u16(uint16x4_t a, uint16x4_t b) { return vmull_lane_u16(a, b, 3); } // CHECK-LABEL: @test_vmull_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VMULL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: ret <2 x i64> [[VMULL2_I]] uint64x2_t test_vmull_lane_u32(uint32x2_t a, uint32x2_t b) { return vmull_lane_u32(a, b, 1); @@ -8506,80 +8602,100 @@ poly8x16_t test_vmulq_p8(poly8x16_t a, poly8x16_t b) { } // CHECK-LABEL: @test_vmul_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK: ret <4 x i16> [[MUL]] int16x4_t test_vmul_lane_s16(int16x4_t a, int16x4_t b) { return vmul_lane_s16(a, b, 3); } // CHECK-LABEL: @test_vmul_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x 
i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK: ret <2 x i32> [[MUL]] int32x2_t test_vmul_lane_s32(int32x2_t a, int32x2_t b) { return vmul_lane_s32(a, b, 1); } // CHECK-LABEL: @test_vmul_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <2 x i32> -// CHECK: [[MUL:%.*]] = fmul <2 x float> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = fmul <2 x float> [[A:%.*]], [[LANE]] // CHECK: ret <2 x float> [[MUL]] float32x2_t test_vmul_lane_f32(float32x2_t a, float32x2_t b) { return vmul_lane_f32(a, b, 1); } // CHECK-LABEL: @test_vmul_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i16> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i16> [[A:%.*]], [[LANE]] // CHECK: ret <4 x i16> [[MUL]] uint16x4_t test_vmul_lane_u16(uint16x4_t a, uint16x4_t b) { return vmul_lane_u16(a, b, 3); } // CHECK-LABEL: @test_vmul_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[MUL:%.*]] = mul <2 x i32> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[MUL:%.*]] = mul <2 x i32> [[A:%.*]], [[LANE]] // CHECK: ret <2 x i32> [[MUL]] uint32x2_t test_vmul_lane_u32(uint32x2_t a, uint32x2_t b) { return vmul_lane_u32(a, b, 1); } // CHECK-LABEL: @test_vmulq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK: ret <8 x i16> [[MUL]] int16x8_t test_vmulq_lane_s16(int16x8_t a, int16x4_t b) { return vmulq_lane_s16(a, b, 3); } // CHECK-LABEL: @test_vmulq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK: ret <4 x i32> [[MUL]] int32x4_t test_vmulq_lane_s32(int32x4_t a, int32x2_t b) { return vmulq_lane_s32(a, b, 1); } // CHECK-LABEL: @test_vmulq_lane_f32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x float> %b, <2 x float> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = fmul <4 x float> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x float> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x float> +// CHECK: [[LANE:%.*]] = shufflevector <2 x 
float> [[TMP1]], <2 x float> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = fmul <4 x float> [[A:%.*]], [[LANE]] // CHECK: ret <4 x float> [[MUL]] float32x4_t test_vmulq_lane_f32(float32x4_t a, float32x2_t b) { return vmulq_lane_f32(a, b, 1); } // CHECK-LABEL: @test_vmulq_lane_u16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> -// CHECK: [[MUL:%.*]] = mul <8 x i16> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[MUL:%.*]] = mul <8 x i16> [[A:%.*]], [[LANE]] // CHECK: ret <8 x i16> [[MUL]] uint16x8_t test_vmulq_lane_u16(uint16x8_t a, uint16x4_t b) { return vmulq_lane_u16(a, b, 3); } // CHECK-LABEL: @test_vmulq_lane_u32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> -// CHECK: [[MUL:%.*]] = mul <4 x i32> %a, [[SHUFFLE]] +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[MUL:%.*]] = mul <4 x i32> [[A:%.*]], [[LANE]] // CHECK: ret <4 x i32> [[MUL]] uint32x4_t test_vmulq_lane_u32(uint32x4_t a, uint32x2_t b) { return vmulq_lane_u32(a, b, 1); @@ -9700,24 +9816,28 @@ int64x2_t test_vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) { } // CHECK-LABEL: @test_vqdmlal_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #8 // CHECK: ret <4 x i32> [[VQDMLAL_V3_I]] int32x4_t test_vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlal_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vqdmlal_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], 
<2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[VQDMLAL_V3_I:%.*]] = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #8 // CHECK: ret <2 x i64> [[VQDMLAL_V3_I]] int64x2_t test_vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlal_lane_s32(a, b, c, 1); @@ -9774,24 +9894,28 @@ int64x2_t test_vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) { } // CHECK-LABEL: @test_vqdmlsl_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %c, <4 x i16> %c, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %b, <4 x i16> [[SHUFFLE]]) -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> %a, <4 x i32> [[VQDMLAL2_I]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQDMLAL2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[B]], <4 x i16> [[LANE]]) #8 +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> [[A]], <4 x i32> [[VQDMLAL2_I]]) #8 // CHECK: ret <4 x i32> [[VQDMLSL_V3_I]] int32x4_t test_vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t c) { return vqdmlsl_lane_s16(a, b, c, 3); } // CHECK-LABEL: @test_vqdmlsl_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %c, <2 x i32> %c, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i64> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %b to <8 x i8> -// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %b, <2 x i32> [[SHUFFLE]]) -// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %a, <2 x i64> [[VQDMLAL2_I]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[C:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i64> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQDMLAL2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[B]], <2 x i32> [[LANE]]) #8 +// CHECK: [[VQDMLSL_V3_I:%.*]] = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> [[A]], <2 x i64> [[VQDMLAL2_I]]) #8 // CHECK: ret <2 x i64> [[VQDMLSL_V3_I]] int64x2_t test_vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t c) { return vqdmlsl_lane_s32(a, b, c, 1); @@ -9866,10 +9990,12 @@ int32x4_t test_vqdmulhq_s32(int32x4_t a, int32x4_t b) { } // CHECK-LABEL: @test_vqdmulh_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 
x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQDMULH_V2_I]] to <8 x i8> // CHECK: ret <4 x i16> [[VQDMULH_V2_I]] int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) { @@ -9877,10 +10003,12 @@ int16x4_t test_vqdmulh_lane_s16(int16x4_t a, int16x4_t b) { } // CHECK-LABEL: @test_vqdmulh_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: [[VQDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQDMULH_V2_I]] to <8 x i8> // CHECK: ret <2 x i32> [[VQDMULH_V2_I]] int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) { @@ -9888,10 +10016,12 @@ int32x2_t test_vqdmulh_lane_s32(int32x2_t a, int32x2_t b) { } // CHECK-LABEL: @test_vqdmulhq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]]) #8 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQDMULHQ_V2_I]] to <16 x i8> // CHECK: ret <8 x i16> [[VQDMULHQ_V2_I]] int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) { @@ -9899,10 +10029,12 @@ int16x8_t test_vqdmulhq_lane_s16(int16x8_t a, int16x4_t b) { } // CHECK-LABEL: @test_vqdmulhq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x 
i32> %a, <4 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[VQDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]]) #8 // CHECK: [[VQDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULHQ_V2_I]] to <16 x i8> // CHECK: ret <4 x i32> [[VQDMULHQ_V2_I]] int32x4_t test_vqdmulhq_lane_s32(int32x4_t a, int32x2_t b) { @@ -9988,10 +10120,12 @@ int64x2_t test_vqdmull_s32(int32x2_t a, int32x2_t b) { } // CHECK-LABEL: @test_vqdmull_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <4 x i32> [[VQDMULL_V2_I]] to <16 x i8> // CHECK: ret <4 x i32> [[VQDMULL_V2_I]] int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) { @@ -9999,10 +10133,12 @@ int32x4_t test_vqdmull_lane_s16(int16x4_t a, int16x4_t b) { } // CHECK-LABEL: @test_vqdmull_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQDMULL_V2_I:%.*]] = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: [[VQDMULL_V3_I:%.*]] = bitcast <2 x i64> [[VQDMULL_V2_I]] to <16 x i8> // CHECK: ret <2 x i64> [[VQDMULL_V2_I]] int64x2_t test_vqdmull_lane_s32(int32x2_t a, int32x2_t b) { @@ -10204,10 +10340,12 @@ int32x4_t test_vqrdmulhq_s32(int32x4_t a, int32x4_t b) { } // CHECK-LABEL: @test_vqrdmulh_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i16> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %a, <4 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <4 
x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i16> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i16> [[LANE]] to <8 x i8> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> [[A]], <4 x i16> [[LANE]]) #8 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <4 x i16> [[VQRDMULH_V2_I]] to <8 x i8> // CHECK: ret <4 x i16> [[VQRDMULH_V2_I]] int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) { @@ -10215,10 +10353,12 @@ int16x4_t test_vqrdmulh_lane_s16(int16x4_t a, int16x4_t b) { } // CHECK-LABEL: @test_vqrdmulh_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <2 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> %a to <8 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[SHUFFLE]] to <8 x i8> -// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %a, <2 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <2 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[A:%.*]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[VQRDMULH_V2_I:%.*]] = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> [[A]], <2 x i32> [[LANE]]) #8 // CHECK: [[VQRDMULH_V3_I:%.*]] = bitcast <2 x i32> [[VQRDMULH_V2_I]] to <8 x i8> // CHECK: ret <2 x i32> [[VQRDMULH_V2_I]] int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) { @@ -10226,10 +10366,12 @@ int32x2_t test_vqrdmulh_lane_s32(int32x2_t a, int32x2_t b) { } // CHECK-LABEL: @test_vqrdmulhq_lane_s16( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <4 x i16> %b, <4 x i16> %b, <8 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <8 x i16> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <8 x i16> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %a, <8 x i16> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <4 x i16> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x i16> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP1]], <8 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i16> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i16> [[LANE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> [[A]], <8 x i16> [[LANE]]) #8 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <8 x i16> [[VQRDMULHQ_V2_I]] to <16 x i8> // CHECK: ret <8 x i16> [[VQRDMULHQ_V2_I]] int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) { @@ -10237,10 +10379,12 @@ int16x8_t test_vqrdmulhq_lane_s16(int16x8_t a, int16x4_t b) { } // CHECK-LABEL: @test_vqrdmulhq_lane_s32( -// CHECK: [[SHUFFLE:%.*]] = shufflevector <2 x i32> %b, <2 x i32> %b, <4 x i32> -// CHECK: [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8> -// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[SHUFFLE]] to <16 x i8> -// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %a, <4 x i32> [[SHUFFLE]]) +// CHECK: [[TMP0:%.*]] = bitcast <2 x i32> [[B:%.*]] to <8 x i8> +// CHECK: [[TMP1:%.*]] = bitcast <8 x i8> [[TMP0]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP1]], <4 x i32> +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[VQRDMULHQ_V2_I:%.*]] = call <4 x i32> 
@llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> [[A]], <4 x i32> [[LANE]]) #8 // CHECK: [[VQRDMULHQ_V3_I:%.*]] = bitcast <4 x i32> [[VQRDMULHQ_V2_I]] to <16 x i8> // CHECK: ret <4 x i32> [[VQRDMULHQ_V2_I]] int32x4_t test_vqrdmulhq_lane_s32(int32x4_t a, int32x2_t b) { @@ -20079,7 +20223,7 @@ poly8x8_t test_vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c) { return vtbx4_p8(a, b, c); } -// CHECK: @test_vtrn_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrn_s8({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* // CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> @@ -20092,7 +20236,7 @@ int8x8x2_t test_vtrn_s8(int8x8_t a, int8x8_t b) { return vtrn_s8(a, b); } -// CHECK: @test_vtrn_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrn_s16({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> @@ -20107,7 +20251,7 @@ int16x4x2_t test_vtrn_s16(int16x4_t a, int16x4_t b) { return vtrn_s16(a, b); } -// CHECK: @test_vtrn_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrn_s32({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> @@ -20122,7 +20266,7 @@ int32x2x2_t test_vtrn_s32(int32x2_t a, int32x2_t b) { return vtrn_s32(a, b); } -// CHECK: @test_vtrn_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrn_u8({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* // CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> @@ -20135,7 +20279,7 @@ uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) { return vtrn_u8(a, b); } -// CHECK: @test_vtrn_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrn_u16({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> @@ -20150,7 +20294,7 @@ uint16x4x2_t test_vtrn_u16(uint16x4_t a, uint16x4_t b) { return vtrn_u16(a, b); } -// CHECK: @test_vtrn_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrn_u32({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> @@ -20165,7 +20309,7 @@ uint32x2x2_t test_vtrn_u32(uint32x2_t a, uint32x2_t b) { return vtrn_u32(a, b); } -// CHECK: @test_vtrn_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrn_f32({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> @@ -20180,7 +20324,7 @@ float32x2x2_t test_vtrn_f32(float32x2_t a, float32x2_t b) { return vtrn_f32(a, b); } -// CHECK: @test_vtrn_p8({{.*}} sret 
[[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrn_p8({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* // CHECK: [[VTRN_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> @@ -20193,7 +20337,7 @@ poly8x8x2_t test_vtrn_p8(poly8x8_t a, poly8x8_t b) { return vtrn_p8(a, b); } -// CHECK: @test_vtrn_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrn_p16({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> @@ -20208,7 +20352,7 @@ poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) { return vtrn_p16(a, b); } -// CHECK: @test_vtrnq_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrnq_s8({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* // CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -20221,7 +20365,7 @@ int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) { return vtrnq_s8(a, b); } -// CHECK: @test_vtrnq_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrnq_s16({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> @@ -20236,7 +20380,7 @@ int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) { return vtrnq_s16(a, b); } -// CHECK: @test_vtrnq_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrnq_s32({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> @@ -20251,7 +20395,7 @@ int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) { return vtrnq_s32(a, b); } -// CHECK: @test_vtrnq_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrnq_u8({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* // CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -20264,7 +20408,7 @@ uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) { return vtrnq_u8(a, b); } -// CHECK: @test_vtrnq_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrnq_u16({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> @@ -20279,7 +20423,7 @@ uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) { return vtrnq_u16(a, b); } -// CHECK: @test_vtrnq_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrnq_u32({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> @@ -20294,7 +20438,7 @@ uint32x4x2_t 
test_vtrnq_u32(uint32x4_t a, uint32x4_t b) { return vtrnq_u32(a, b); } -// CHECK: @test_vtrnq_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrnq_f32({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> @@ -20309,7 +20453,7 @@ float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) { return vtrnq_f32(a, b); } -// CHECK: @test_vtrnq_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrnq_p8({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* // CHECK: [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -20322,7 +20466,7 @@ poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) { return vtrnq_p8(a, b); } -// CHECK: @test_vtrnq_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vtrnq_p16({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> @@ -20501,7 +20645,7 @@ uint16x8_t test_vtstq_p16(poly16x8_t a, poly16x8_t b) { return vtstq_p16(a, b); } -// CHECK: @test_vuzp_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzp_s8({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* // CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> @@ -20514,7 +20658,7 @@ int8x8x2_t test_vuzp_s8(int8x8_t a, int8x8_t b) { return vuzp_s8(a, b); } -// CHECK: @test_vuzp_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzp_s16({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> @@ -20529,7 +20673,7 @@ int16x4x2_t test_vuzp_s16(int16x4_t a, int16x4_t b) { return vuzp_s16(a, b); } -// CHECK: @test_vuzp_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzp_s32({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> @@ -20544,7 +20688,7 @@ int32x2x2_t test_vuzp_s32(int32x2_t a, int32x2_t b) { return vuzp_s32(a, b); } -// CHECK: @test_vuzp_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzp_u8({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* // CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> @@ -20557,7 +20701,7 @@ uint8x8x2_t test_vuzp_u8(uint8x8_t a, uint8x8_t b) { return vuzp_u8(a, b); } -// CHECK: @test_vuzp_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzp_u16({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = 
bitcast <4 x i16> %b to <8 x i8> @@ -20572,7 +20716,7 @@ uint16x4x2_t test_vuzp_u16(uint16x4_t a, uint16x4_t b) { return vuzp_u16(a, b); } -// CHECK: @test_vuzp_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzp_u32({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> @@ -20587,7 +20731,7 @@ uint32x2x2_t test_vuzp_u32(uint32x2_t a, uint32x2_t b) { return vuzp_u32(a, b); } -// CHECK: @test_vuzp_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzp_f32({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> @@ -20602,7 +20746,7 @@ float32x2x2_t test_vuzp_f32(float32x2_t a, float32x2_t b) { return vuzp_f32(a, b); } -// CHECK: @test_vuzp_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzp_p8({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* // CHECK: [[VUZP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> @@ -20615,7 +20759,7 @@ poly8x8x2_t test_vuzp_p8(poly8x8_t a, poly8x8_t b) { return vuzp_p8(a, b); } -// CHECK: @test_vuzp_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzp_p16({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> @@ -20630,7 +20774,7 @@ poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) { return vuzp_p16(a, b); } -// CHECK: @test_vuzpq_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzpq_s8({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* // CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -20643,7 +20787,7 @@ int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) { return vuzpq_s8(a, b); } -// CHECK: @test_vuzpq_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzpq_s16({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> @@ -20658,7 +20802,7 @@ int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) { return vuzpq_s16(a, b); } -// CHECK: @test_vuzpq_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzpq_s32({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> @@ -20673,7 +20817,7 @@ int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) { return vuzpq_s32(a, b); } -// CHECK: @test_vuzpq_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzpq_u8({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = 
bitcast i8* [[TMP0]] to <16 x i8>* // CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -20686,7 +20830,7 @@ uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) { return vuzpq_u8(a, b); } -// CHECK: @test_vuzpq_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzpq_u16({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> @@ -20701,7 +20845,7 @@ uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) { return vuzpq_u16(a, b); } -// CHECK: @test_vuzpq_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzpq_u32({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> @@ -20716,7 +20860,7 @@ uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) { return vuzpq_u32(a, b); } -// CHECK: @test_vuzpq_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzpq_f32({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> @@ -20731,7 +20875,7 @@ float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) { return vuzpq_f32(a, b); } -// CHECK: @test_vuzpq_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzpq_p8({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* // CHECK: [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -20744,7 +20888,7 @@ poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) { return vuzpq_p8(a, b); } -// CHECK: @test_vuzpq_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vuzpq_p16({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> @@ -20759,7 +20903,7 @@ poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) { return vuzpq_p16(a, b); } -// CHECK: @test_vzip_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzip_s8({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* // CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> @@ -20772,7 +20916,7 @@ int8x8x2_t test_vzip_s8(int8x8_t a, int8x8_t b) { return vzip_s8(a, b); } -// CHECK: @test_vzip_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzip_s16({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> @@ -20787,7 +20931,7 @@ int16x4x2_t test_vzip_s16(int16x4_t a, int16x4_t b) { return vzip_s16(a, b); } -// CHECK: @test_vzip_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzip_s32({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // 
CHECK: [[TMP0:%.*]] = bitcast %struct.int32x2x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> @@ -20802,7 +20946,7 @@ int32x2x2_t test_vzip_s32(int32x2_t a, int32x2_t b) { return vzip_s32(a, b); } -// CHECK: @test_vzip_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzip_u8({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* // CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> @@ -20815,7 +20959,7 @@ uint8x8x2_t test_vzip_u8(uint8x8_t a, uint8x8_t b) { return vzip_u8(a, b); } -// CHECK: @test_vzip_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzip_u16({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> @@ -20830,7 +20974,7 @@ uint16x4x2_t test_vzip_u16(uint16x4_t a, uint16x4_t b) { return vzip_u16(a, b); } -// CHECK: @test_vzip_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzip_u32({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x2x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %b to <8 x i8> @@ -20845,7 +20989,7 @@ uint32x2x2_t test_vzip_u32(uint32x2_t a, uint32x2_t b) { return vzip_u32(a, b); } -// CHECK: @test_vzip_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzip_f32({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x2x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <2 x float> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <2 x float> %b to <8 x i8> @@ -20860,7 +21004,7 @@ float32x2x2_t test_vzip_f32(float32x2_t a, float32x2_t b) { return vzip_f32(a, b); } -// CHECK: @test_vzip_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzip_p8({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <8 x i8>* // CHECK: [[VZIP_I:%.*]] = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> @@ -20873,7 +21017,7 @@ poly8x8x2_t test_vzip_p8(poly8x8_t a, poly8x8_t b) { return vzip_p8(a, b); } -// CHECK: @test_vzip_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzip_p16({{.*}} sret align 8 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i16> %a to <8 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i16> %b to <8 x i8> @@ -20888,7 +21032,7 @@ poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) { return vzip_p16(a, b); } -// CHECK: @test_vzipq_s8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzipq_s8({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* // CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -20901,7 +21045,7 @@ int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) { return vzipq_s8(a, b); } -// CHECK: @test_vzipq_s16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: 
@test_vzipq_s16({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> @@ -20916,7 +21060,7 @@ int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) { return vzipq_s16(a, b); } -// CHECK: @test_vzipq_s32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzipq_s32({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> @@ -20931,7 +21075,7 @@ int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) { return vzipq_s32(a, b); } -// CHECK: @test_vzipq_u8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzipq_u8({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* // CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -20944,7 +21088,7 @@ uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) { return vzipq_u8(a, b); } -// CHECK: @test_vzipq_u16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzipq_u16({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> @@ -20959,7 +21103,7 @@ uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) { return vzipq_u16(a, b); } -// CHECK: @test_vzipq_u32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzipq_u32({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8> @@ -20974,7 +21118,7 @@ uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) { return vzipq_u32(a, b); } -// CHECK: @test_vzipq_f32({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzipq_f32({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8> @@ -20989,7 +21133,7 @@ float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) { return vzipq_f32(a, b); } -// CHECK: @test_vzipq_p8({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzipq_p8({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* // CHECK: [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> @@ -21002,7 +21146,7 @@ poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) { return vzipq_p8(a, b); } -// CHECK: @test_vzipq_p16({{.*}} sret [[AGG_RESULT:%[0-9a-zA-Z.]+]], +// CHECK: @test_vzipq_p16({{.*}} sret align 16 [[AGG_RESULT:%[0-9a-zA-Z.]+]], // CHECK: [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[AGG_RESULT]] to i8* // CHECK: [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8> // CHECK: [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8> diff --git a/clang/test/CodeGen/attr-noreturn.c b/clang/test/CodeGen/attr-noreturn.c new 
file mode 100644 index 0000000000000..5dca4fa1f520a --- /dev/null +++ b/clang/test/CodeGen/attr-noreturn.c @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -S -emit-llvm %s -o - | FileCheck %s + +typedef void (*fptrs_t[4])(void); +fptrs_t p __attribute__((noreturn)); + +void __attribute__((noreturn)) f() { + p[0](); +} +// CHECK: call void +// CHECK-NEXT: unreachable diff --git a/clang/test/CodeGen/blocks.c b/clang/test/CodeGen/blocks.c index fd348c98f65fe..3f1f2502652cd 100644 --- a/clang/test/CodeGen/blocks.c +++ b/clang/test/CodeGen/blocks.c @@ -18,7 +18,7 @@ struct s0 { int a[64]; }; -// CHECK: define internal void @__f2_block_invoke(%struct.s0* noalias sret {{%.*}}, i8* {{%.*}}, %struct.s0* byval(%struct.s0) align 4 {{.*}}) +// CHECK: define internal void @__f2_block_invoke(%struct.s0* noalias sret align 4 {{%.*}}, i8* {{%.*}}, %struct.s0* byval(%struct.s0) align 4 {{.*}}) struct s0 f2(struct s0 a0) { return ^(struct s0 a1){ return a1; }(a0); } diff --git a/clang/test/CodeGen/bounds-checking.c b/clang/test/CodeGen/bounds-checking.c index 2e6a08650dd97..15cef8c007a55 100644 --- a/clang/test/CodeGen/bounds-checking.c +++ b/clang/test/CodeGen/bounds-checking.c @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -fsanitize=local-bounds -emit-llvm -triple x86_64-apple-darwin10 %s -o - | FileCheck %s // RUN: %clang_cc1 -fsanitize=local-bounds -fexperimental-new-pass-manager -emit-llvm -triple x86_64-apple-darwin10 %s -o - | FileCheck %s -// RUN: %clang_cc1 -fsanitize=array-bounds -O -fsanitize-trap=array-bounds -emit-llvm -triple x86_64-apple-darwin10 -DNO_DYNAMIC %s -o - | FileCheck %s -// RUN: %clang_cc1 -fsanitize=array-bounds -O -fsanitize-trap=array-bounds -fexperimental-new-pass-manager -emit-llvm -triple x86_64-apple-darwin10 -DNO_DYNAMIC %s -o - | FileCheck %s +// RUN: %clang_cc1 -fsanitize=array-bounds -O -fsanitize-trap=array-bounds -emit-llvm -triple x86_64-apple-darwin10 -DNO_DYNAMIC %s -o - | FileCheck %s --check-prefixes=CHECK,NONLOCAL +// RUN: %clang_cc1 -fsanitize=array-bounds -O -fsanitize-trap=array-bounds -fexperimental-new-pass-manager -emit-llvm -triple x86_64-apple-darwin10 -DNO_DYNAMIC %s -o - | FileCheck %s --check-prefixes=CHECK,NONLOCAL // // REQUIRES: x86-registered-target @@ -31,3 +31,21 @@ void f3() { // CHECK: call {{.*}} @llvm.trap a[2] = 1; } + +union U { int a[0]; int b[1]; int c[2]; }; + +// CHECK-LABEL: define {{.*}} @f4 +int f4(union U *u, int i) { + // a and b are treated as flexible array members. + // CHECK-NOT: @llvm.trap + return u->a[i] + u->b[i]; + // CHECK: } +} + +// CHECK-LABEL: define {{.*}} @f5 +int f5(union U *u, int i) { + // c is not a flexible array member. 
+ // NONLOCAL: call {{.*}} @llvm.trap + return u->c[i]; + // CHECK: } +} diff --git a/clang/test/CodeGen/builtins-wasm.c b/clang/test/CodeGen/builtins-wasm.c index b27bf75248cae..8b497d95298cb 100644 --- a/clang/test/CodeGen/builtins-wasm.c +++ b/clang/test/CodeGen/builtins-wasm.c @@ -338,6 +338,30 @@ i8x16 sub_saturate_u_i8x16(i8x16 x, i8x16 y) { // WEBASSEMBLY-NEXT: ret } +i8x16 abs_i8x16(i8x16 v) { + return __builtin_wasm_abs_i8x16(v); + // WEBASSEMBLY: %neg = sub <16 x i8> zeroinitializer, %v + // WEBASSEMBLY: %abscond = icmp slt <16 x i8> %v, zeroinitializer + // WEBASSEMBLY: %abs = select <16 x i1> %abscond, <16 x i8> %neg, <16 x i8> %v + // WEBASSEMBLY: ret <16 x i8> %abs +} + +i16x8 abs_i16x8(i16x8 v) { + return __builtin_wasm_abs_i16x8(v); + // WEBASSEMBLY: %neg = sub <8 x i16> zeroinitializer, %v + // WEBASSEMBLY: %abscond = icmp slt <8 x i16> %v, zeroinitializer + // WEBASSEMBLY: %abs = select <8 x i1> %abscond, <8 x i16> %neg, <8 x i16> %v + // WEBASSEMBLY: ret <8 x i16> %abs +} + +i32x4 abs_i32x4(i32x4 v) { + return __builtin_wasm_abs_i32x4(v); + // WEBASSEMBLY: %neg = sub <4 x i32> zeroinitializer, %v + // WEBASSEMBLY: %abscond = icmp slt <4 x i32> %v, zeroinitializer + // WEBASSEMBLY: %abs = select <4 x i1> %abscond, <4 x i32> %neg, <4 x i32> %v + // WEBASSEMBLY: ret <4 x i32> %abs +} + i8x16 min_s_i8x16(i8x16 x, i8x16 y) { return __builtin_wasm_min_s_i8x16(x, y); // WEBASSEMBLY: %0 = icmp slt <16 x i8> %x, %y @@ -511,6 +535,24 @@ int all_true_i64x2(i64x2 x) { // WEBASSEMBLY: ret } +int bitmask_i8x16(i8x16 x) { + return __builtin_wasm_bitmask_i8x16(x); + // WEBASSEMBLY: call i32 @llvm.wasm.bitmask.v16i8(<16 x i8> %x) + // WEBASSEMBLY: ret +} + +int bitmask_i16x8(i16x8 x) { + return __builtin_wasm_bitmask_i16x8(x); + // WEBASSEMBLY: call i32 @llvm.wasm.bitmask.v8i16(<8 x i16> %x) + // WEBASSEMBLY: ret +} + +int bitmask_i32x4(i32x4 x) { + return __builtin_wasm_bitmask_i32x4(x); + // WEBASSEMBLY: call i32 @llvm.wasm.bitmask.v4i32(<4 x i32> %x) + // WEBASSEMBLY: ret +} + f32x4 abs_f32x4(f32x4 x) { return __builtin_wasm_abs_f32x4(x); // WEBASSEMBLY: call <4 x float> @llvm.fabs.v4f32(<4 x float> %x) diff --git a/clang/test/CodeGen/c11atomics-ios.c b/clang/test/CodeGen/c11atomics-ios.c index f48e10e4aa430..92d318dac1c35 100644 --- a/clang/test/CodeGen/c11atomics-ios.c +++ b/clang/test/CodeGen/c11atomics-ios.c @@ -203,7 +203,7 @@ void testPromotedStruct(_Atomic(PS) *fp) { } PS test_promoted_load(_Atomic(PS) *addr) { - // CHECK-LABEL: @test_promoted_load(%struct.PS* noalias sret %agg.result, { %struct.PS, [2 x i8] }* %addr) + // CHECK-LABEL: @test_promoted_load(%struct.PS* noalias sret align 2 %agg.result, { %struct.PS, [2 x i8] }* %addr) // CHECK: [[ADDR_ARG:%.*]] = alloca { %struct.PS, [2 x i8] }*, align 4 // CHECK: [[ATOMIC_RES:%.*]] = alloca { %struct.PS, [2 x i8] }, align 8 // CHECK: store { %struct.PS, [2 x i8] }* %addr, { %struct.PS, [2 x i8] }** [[ADDR_ARG]], align 4 @@ -245,7 +245,7 @@ void test_promoted_store(_Atomic(PS) *addr, PS *val) { } PS test_promoted_exchange(_Atomic(PS) *addr, PS *val) { - // CHECK-LABEL: @test_promoted_exchange(%struct.PS* noalias sret %agg.result, { %struct.PS, [2 x i8] }* %addr, %struct.PS* %val) + // CHECK-LABEL: @test_promoted_exchange(%struct.PS* noalias sret align 2 %agg.result, { %struct.PS, [2 x i8] }* %addr, %struct.PS* %val) // CHECK: [[ADDR_ARG:%.*]] = alloca { %struct.PS, [2 x i8] }*, align 4 // CHECK: [[VAL_ARG:%.*]] = alloca %struct.PS*, align 4 // CHECK: [[NONATOMIC_TMP:%.*]] = alloca %struct.PS, align 2 diff --git 
a/clang/test/CodeGen/c11atomics.c b/clang/test/CodeGen/c11atomics.c index 0a32ebcc724e9..8697b798566d3 100644 --- a/clang/test/CodeGen/c11atomics.c +++ b/clang/test/CodeGen/c11atomics.c @@ -368,7 +368,7 @@ void testPromotedStruct(_Atomic(PS) *fp) { } PS test_promoted_load(_Atomic(PS) *addr) { - // CHECK-LABEL: @test_promoted_load(%struct.PS* noalias sret %agg.result, { %struct.PS, [2 x i8] }* %addr) + // CHECK-LABEL: @test_promoted_load(%struct.PS* noalias sret align 2 %agg.result, { %struct.PS, [2 x i8] }* %addr) // CHECK: [[ADDR_ARG:%.*]] = alloca { %struct.PS, [2 x i8] }*, align 4 // CHECK: [[ATOMIC_RES:%.*]] = alloca { %struct.PS, [2 x i8] }, align 8 // CHECK: store { %struct.PS, [2 x i8] }* %addr, { %struct.PS, [2 x i8] }** [[ADDR_ARG]], align 4 @@ -411,7 +411,7 @@ void test_promoted_store(_Atomic(PS) *addr, PS *val) { } PS test_promoted_exchange(_Atomic(PS) *addr, PS *val) { - // CHECK-LABEL: @test_promoted_exchange(%struct.PS* noalias sret %agg.result, { %struct.PS, [2 x i8] }* %addr, %struct.PS* %val) + // CHECK-LABEL: @test_promoted_exchange(%struct.PS* noalias sret align 2 %agg.result, { %struct.PS, [2 x i8] }* %addr, %struct.PS* %val) // CHECK: [[ADDR_ARG:%.*]] = alloca { %struct.PS, [2 x i8] }*, align 4 // CHECK: [[VAL_ARG:%.*]] = alloca %struct.PS*, align 4 // CHECK: [[NONATOMIC_TMP:%.*]] = alloca %struct.PS, align 2 diff --git a/clang/test/CodeGen/debug-info-extern-call.c b/clang/test/CodeGen/debug-info-extern-call.c index da3764f7359ea..072e578b58986 100644 --- a/clang/test/CodeGen/debug-info-extern-call.c +++ b/clang/test/CodeGen/debug-info-extern-call.c @@ -1,7 +1,7 @@ // When entry values are emitted, expect a subprogram for extern decls so that // the dwarf generator can describe call site parameters at extern call sites. // -// RUN: %clang -Xclang -femit-debug-entry-values -g -O2 -target x86_64-none-linux-gnu -S -emit-llvm %s -o - \ +// RUN: %clang -g -O2 -target x86_64-none-linux-gnu -S -emit-llvm %s -o - \ // RUN: | FileCheck %s -check-prefix=DECLS-FOR-EXTERN // Similarly, when the debugger tuning is gdb, expect a subprogram for extern diff --git a/clang/test/CodeGen/debug-prefix-map.c b/clang/test/CodeGen/debug-prefix-map.c index 5366e19447ae2..354110d1b0da7 100644 --- a/clang/test/CodeGen/debug-prefix-map.c +++ b/clang/test/CodeGen/debug-prefix-map.c @@ -2,6 +2,7 @@ // RUN: %clang_cc1 -debug-info-kind=standalone -fdebug-prefix-map=%p=/UNLIKELY_PATH=empty %s -emit-llvm -o - | FileCheck %s -check-prefix CHECK-EVIL // RUN: %clang_cc1 -debug-info-kind=standalone -fdebug-prefix-map=%p=/UNLIKELY_PATH/empty %s -emit-llvm -o - -main-file-name debug-prefix-map.c | FileCheck %s // RUN: %clang_cc1 -debug-info-kind=standalone -fdebug-prefix-map=%p=/UNLIKELY_PATH/empty %s -emit-llvm -o - -fdebug-compilation-dir %p | FileCheck %s -check-prefix CHECK-COMPILATION-DIR +// RUN: %clang_cc1 -debug-info-kind=standalone -fdebug-prefix-map=%p=/UNLIKELY_PATH/empty %s -emit-llvm -o - -isysroot %p -debugger-tuning=lldb | FileCheck %s -check-prefix CHECK-SYSROOT // RUN: %clang -g -fdebug-prefix-map=%p=/UNLIKELY_PATH/empty -S -c %s -emit-llvm -o - | FileCheck %s // RUN: %clang -g -ffile-prefix-map=%p=/UNLIKELY_PATH/empty -S -c %s -emit-llvm -o - | FileCheck %s @@ -40,3 +41,4 @@ void test_rewrite_includes() { // CHECK-COMPILATION-DIR: !DIFile(filename: "{{.*}}", directory: "/UNLIKELY_PATH/empty") // CHECK-COMPILATION-DIR: !DIFile(filename: "{{.*}}Inputs/stdio.h", directory: "/UNLIKELY_PATH/empty") // CHECK-COMPILATION-DIR-NOT: !DIFile(filename: +// CHECK-SYSROOT: !DICompileUnit({{.*}}sysroot: 
"/UNLIKELY_PATH/empty" diff --git a/clang/test/CodeGen/lanai-arguments.c b/clang/test/CodeGen/lanai-arguments.c index 9ce4ed98a78ce..ef06b3221bc52 100644 --- a/clang/test/CodeGen/lanai-arguments.c +++ b/clang/test/CodeGen/lanai-arguments.c @@ -16,7 +16,7 @@ void f1(s1 i) {} typedef struct { int cc; } s2; -// CHECK: define void @f2(%struct.s2* noalias sret %agg.result) +// CHECK: define void @f2(%struct.s2* noalias sret align 4 %agg.result) s2 f2() { s2 foo; return foo; @@ -26,7 +26,7 @@ typedef struct { int cc; int dd; } s3; -// CHECK: define void @f3(%struct.s3* noalias sret %agg.result) +// CHECK: define void @f3(%struct.s3* noalias sret align 4 %agg.result) s3 f3() { s3 foo; return foo; diff --git a/clang/test/CodeGen/le32-arguments.c b/clang/test/CodeGen/le32-arguments.c index 9e6908d7fc41c..ad368e1a3941a 100644 --- a/clang/test/CodeGen/le32-arguments.c +++ b/clang/test/CodeGen/le32-arguments.c @@ -17,7 +17,7 @@ typedef struct { int cc; } s2; // Structs should be returned sret and not simplified by the frontend -// CHECK-LABEL: define void @f2(%struct.s2* noalias sret %agg.result) +// CHECK-LABEL: define void @f2(%struct.s2* noalias sret align 4 %agg.result) s2 f2() { s2 foo; return foo; diff --git a/clang/test/CodeGen/mcu-struct-return.c b/clang/test/CodeGen/mcu-struct-return.c index 353c963dadb01..93325254bc8db 100644 --- a/clang/test/CodeGen/mcu-struct-return.c +++ b/clang/test/CodeGen/mcu-struct-return.c @@ -42,7 +42,7 @@ struct S1 bar1() { return s1; } struct S2 bar2() { return s2; } struct S1 bar3(union U1 u) { return s1; } // CHECK: define void @foo1() -// CHECK: define void @foo2([[UNION2_TYPE]]* noalias sret %{{.+}}) +// CHECK: define void @foo2([[UNION2_TYPE]]* noalias sret align 4 %{{.+}}) // CHECK: define i32 @foo3() // CHECK: define void @bar1() // CHECK: define i32 @bar2() @@ -62,7 +62,7 @@ void run() { // CHECK: [[Y1:%.+]] = alloca [[STRUCT1_TYPE]] // CHECK: [[Y2:%.+]] = alloca [[STRUCT2_TYPE]] // CHECK: call void @foo1() - // CHECK: call void @foo2([[UNION2_TYPE]]* sret [[X2]]) + // CHECK: call void @foo2([[UNION2_TYPE]]* sret align 4 [[X2]]) // CHECK: {{.+}} = call i32 @foo3() // CHECK: call void @bar1() // CHECK: {{.+}} = call i32 @bar2() diff --git a/clang/test/CodeGen/mingw-long-double.c b/clang/test/CodeGen/mingw-long-double.c index 57e4adaa5fabe..08e3ac754d6b3 100644 --- a/clang/test/CodeGen/mingw-long-double.c +++ b/clang/test/CodeGen/mingw-long-double.c @@ -32,15 +32,15 @@ long double TestLD(long double x) { return x * x; } // GNU32: define dso_local x86_fp80 @TestLD(x86_fp80 %x) -// GNU64: define dso_local void @TestLD(x86_fp80* noalias sret %agg.result, x86_fp80* %0) +// GNU64: define dso_local void @TestLD(x86_fp80* noalias sret align 16 %agg.result, x86_fp80* %0) // MSC64: define dso_local double @TestLD(double %x) long double _Complex TestLDC(long double _Complex x) { return x * x; } -// GNU32: define dso_local void @TestLDC({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval({ x86_fp80, x86_fp80 }) align 4 %x) -// GNU64: define dso_local void @TestLDC({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* %x) -// MSC64: define dso_local void @TestLDC({ double, double }* noalias sret %agg.result, { double, double }* %x) +// GNU32: define dso_local void @TestLDC({ x86_fp80, x86_fp80 }* noalias sret align 4 %agg.result, { x86_fp80, x86_fp80 }* byval({ x86_fp80, x86_fp80 }) align 4 %x) +// GNU64: define dso_local void @TestLDC({ x86_fp80, x86_fp80 }* noalias sret align 16 %agg.result, { x86_fp80, x86_fp80 }* %x) 
+// MSC64: define dso_local void @TestLDC({ double, double }* noalias sret align 8 %agg.result, { double, double }* %x) // GNU32: declare dso_local void @__mulxc3 // GNU64: declare dso_local void @__mulxc3 diff --git a/clang/test/CodeGen/mips-zero-sized-struct.c b/clang/test/CodeGen/mips-zero-sized-struct.c index 08ebf9df3e93b..5f0e660cf395b 100644 --- a/clang/test/CodeGen/mips-zero-sized-struct.c +++ b/clang/test/CodeGen/mips-zero-sized-struct.c @@ -19,7 +19,7 @@ // RUN: %clang_cc1 -triple mipsisa64r6-unknown-linux-gnuabi64 -S -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s // RUN: %clang_cc1 -triple mipsisa64r6el-unknown-linux-gnuabi64 -S -emit-llvm -o - %s | FileCheck -check-prefix=N64 %s -// O32: define void @fn28(%struct.T2* noalias sret %agg.result, i8 signext %arg0) +// O32: define void @fn28(%struct.T2* noalias sret align 1 %agg.result, i8 signext %arg0) // N32: define void @fn28(i8 signext %arg0) // N64: define void @fn28(i8 signext %arg0) diff --git a/clang/test/CodeGen/mips64-padding-arg.c b/clang/test/CodeGen/mips64-padding-arg.c index a7c8f0ff6fdc0..d440743fd7238 100644 --- a/clang/test/CodeGen/mips64-padding-arg.c +++ b/clang/test/CodeGen/mips64-padding-arg.c @@ -33,9 +33,9 @@ void foo3(int a0, long double a1) { // Insert padding after hidden argument. // -// N64-LABEL: define void @foo5(%struct.S0* noalias sret %agg.result, i64 %0, fp128 %a0) -// N64: call void @foo6(%struct.S0* sret %agg.result, i32 signext 1, i32 signext 2, i64 undef, fp128 %a0) -// N64: declare void @foo6(%struct.S0* sret, i32 signext, i32 signext, i64, fp128) +// N64-LABEL: define void @foo5(%struct.S0* noalias sret align 16 %agg.result, i64 %0, fp128 %a0) +// N64: call void @foo6(%struct.S0* sret align 16 %agg.result, i32 signext 1, i32 signext 2, i64 undef, fp128 %a0) +// N64: declare void @foo6(%struct.S0* sret align 16, i32 signext, i32 signext, i64, fp128) extern S0 foo6(int, int, long double); diff --git a/clang/test/CodeGen/ms_abi.c b/clang/test/CodeGen/ms_abi.c index 75e1caf922df9..8c66c5dc43610 100644 --- a/clang/test/CodeGen/ms_abi.c +++ b/clang/test/CodeGen/ms_abi.c @@ -155,7 +155,7 @@ struct i128 { }; __attribute__((ms_abi)) struct i128 f7(struct i128 a) { - // WIN64: define dso_local void @f7(%struct.i128* noalias sret %agg.result, %struct.i128* %a) - // FREEBSD: define win64cc void @f7(%struct.i128* noalias sret %agg.result, %struct.i128* %a) + // WIN64: define dso_local void @f7(%struct.i128* noalias sret align 8 %agg.result, %struct.i128* %a) + // FREEBSD: define win64cc void @f7(%struct.i128* noalias sret align 8 %agg.result, %struct.i128* %a) return a; } diff --git a/clang/test/CodeGen/ppc64-align-struct.c b/clang/test/CodeGen/ppc64-align-struct.c index bcff4920d0c49..3435a6e429396 100644 --- a/clang/test/CodeGen/ppc64-align-struct.c +++ b/clang/test/CodeGen/ppc64-align-struct.c @@ -48,7 +48,7 @@ void test7 (int x, struct test7 y) { } -// CHECK: define void @test1va(%struct.test1* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...) +// CHECK: define void @test1va(%struct.test1* noalias sret align 4 %[[AGG_RESULT:.*]], i32 signext %x, ...) // CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap // CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[CUR]], i64 8 // CHECK: store i8* %[[NEXT]], i8** %ap @@ -66,7 +66,7 @@ struct test1 test1va (int x, ...) return y; } -// CHECK: define void @test2va(%struct.test2* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...) +// CHECK: define void @test2va(%struct.test2* noalias sret align 16 %[[AGG_RESULT:.*]], i32 signext %x, ...) 
// CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap // CHECK: %[[TMP0:[^ ]+]] = ptrtoint i8* %[[CUR]] to i64 // CHECK: %[[TMP1:[^ ]+]] = add i64 %[[TMP0]], 15 @@ -88,7 +88,7 @@ struct test2 test2va (int x, ...) return y; } -// CHECK: define void @test3va(%struct.test3* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...) +// CHECK: define void @test3va(%struct.test3* noalias sret align 32 %[[AGG_RESULT:.*]], i32 signext %x, ...) // CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap // CHECK: %[[TMP0:[^ ]+]] = ptrtoint i8* %[[CUR]] to i64 // CHECK: %[[TMP1:[^ ]+]] = add i64 %[[TMP0]], 15 @@ -110,7 +110,7 @@ struct test3 test3va (int x, ...) return y; } -// CHECK: define void @test4va(%struct.test4* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...) +// CHECK: define void @test4va(%struct.test4* noalias sret align 4 %[[AGG_RESULT:.*]], i32 signext %x, ...) // CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap // CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[CUR]], i64 16 // CHECK: store i8* %[[NEXT]], i8** %ap @@ -128,7 +128,7 @@ struct test4 test4va (int x, ...) return y; } -// CHECK: define void @testva_longdouble(%struct.test_longdouble* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...) +// CHECK: define void @testva_longdouble(%struct.test_longdouble* noalias sret align 16 %[[AGG_RESULT:.*]], i32 signext %x, ...) // CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap // CHECK: %[[NEXT:[^ ]+]] = getelementptr inbounds i8, i8* %[[CUR]], i64 16 // CHECK: store i8* %[[NEXT]], i8** %ap @@ -147,7 +147,7 @@ struct test_longdouble testva_longdouble (int x, ...) return y; } -// CHECK: define void @testva_vector(%struct.test_vector* noalias sret %[[AGG_RESULT:.*]], i32 signext %x, ...) +// CHECK: define void @testva_vector(%struct.test_vector* noalias sret align 16 %[[AGG_RESULT:.*]], i32 signext %x, ...) 
// CHECK: %[[CUR:[^ ]+]] = load i8*, i8** %ap // CHECK: %[[TMP0:[^ ]+]] = ptrtoint i8* %[[CUR]] to i64 // CHECK: %[[TMP1:[^ ]+]] = add i64 %[[TMP0]], 15 diff --git a/clang/test/CodeGen/ppc64-elf-abi.c b/clang/test/CodeGen/ppc64-elf-abi.c index 59112a0baf4a7..4270ba2c799b8 100644 --- a/clang/test/CodeGen/ppc64-elf-abi.c +++ b/clang/test/CodeGen/ppc64-elf-abi.c @@ -17,7 +17,7 @@ // RUN: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s \ // RUN: -target-abi elfv2 | FileCheck %s --check-prefix=CHECK-ELFv2 -// CHECK-ELFv1: define void @func_fab(%struct.fab* noalias sret %agg.result, i64 %x.coerce) +// CHECK-ELFv1: define void @func_fab(%struct.fab* noalias sret align 4 %agg.result, i64 %x.coerce) // CHECK-ELFv2: define [2 x float] @func_fab([2 x float] %x.coerce) struct fab { float a; float b; }; struct fab func_fab(struct fab x) { return x; } diff --git a/clang/test/CodeGen/ppc64-qpx-vector.c b/clang/test/CodeGen/ppc64-qpx-vector.c index e7c009328b232..0e55851b9f33e 100644 --- a/clang/test/CodeGen/ppc64-qpx-vector.c +++ b/clang/test/CodeGen/ppc64-qpx-vector.c @@ -24,6 +24,6 @@ v4df foo2(struct sdf a, v4df b, struct sdf2 c) { // QPX-LABEL: define <4 x double> @foo2(<4 x double> inreg %a.coerce, <4 x double> %b, [2 x i256] %c.coerce) // QPX: ret <4 x double> -// NORMAL-LABEL: define void @foo2(<4 x double>* noalias sret %agg.result, [2 x i128] %a.coerce, <4 x double>* %0, [4 x i128] %c.coerce) +// NORMAL-LABEL: define void @foo2(<4 x double>* noalias sret align 32 %agg.result, [2 x i128] %a.coerce, <4 x double>* %0, [4 x i128] %c.coerce) // NORMAL: ret void diff --git a/clang/test/CodeGen/ppc64-soft-float.c b/clang/test/CodeGen/ppc64-soft-float.c index 84ac2d55b636f..b033dea68fe20 100644 --- a/clang/test/CodeGen/ppc64-soft-float.c +++ b/clang/test/CodeGen/ppc64-soft-float.c @@ -30,53 +30,53 @@ struct fabc { float a; float b; float c; }; struct f2a2b { float a[2]; float b[2]; }; // CHECK-LE: define i32 @func_f1(float inreg %x.coerce) -// CHECK-BE: define void @func_f1(%struct.f1* noalias sret %agg.result, float inreg %x.coerce) +// CHECK-BE: define void @func_f1(%struct.f1* noalias sret align 4 %agg.result, float inreg %x.coerce) struct f1 func_f1(struct f1 x) { return x; } // CHECK-LE: define i64 @func_f2(i64 %x.coerce) -// CHECK-BE: define void @func_f2(%struct.f2* noalias sret %agg.result, i64 %x.coerce) +// CHECK-BE: define void @func_f2(%struct.f2* noalias sret align 4 %agg.result, i64 %x.coerce) struct f2 func_f2(struct f2 x) { return x; } // CHECK-LE: define { i64, i64 } @func_f3([2 x i64] %x.coerce) -// CHECK-BE: define void @func_f3(%struct.f3* noalias sret %agg.result, [2 x i64] %x.coerce) +// CHECK-BE: define void @func_f3(%struct.f3* noalias sret align 4 %agg.result, [2 x i64] %x.coerce) struct f3 func_f3(struct f3 x) { return x; } // CHECK-LE: define { i64, i64 } @func_f4([2 x i64] %x.coerce) -// CHECK-BE: define void @func_f4(%struct.f4* noalias sret %agg.result, [2 x i64] %x.coerce) +// CHECK-BE: define void @func_f4(%struct.f4* noalias sret align 4 %agg.result, [2 x i64] %x.coerce) struct f4 func_f4(struct f4 x) { return x; } -// CHECK: define void @func_f5(%struct.f5* noalias sret %agg.result, [3 x i64] %x.coerce) +// CHECK: define void @func_f5(%struct.f5* noalias sret align 4 %agg.result, [3 x i64] %x.coerce) struct f5 func_f5(struct f5 x) { return x; } -// CHECK: define void @func_f6(%struct.f6* noalias sret %agg.result, [3 x i64] %x.coerce) +// CHECK: define void @func_f6(%struct.f6* noalias sret align 4 %agg.result, [3 x i64] %x.coerce) struct f6 
func_f6(struct f6 x) { return x; } -// CHECK: define void @func_f7(%struct.f7* noalias sret %agg.result, [4 x i64] %x.coerce) +// CHECK: define void @func_f7(%struct.f7* noalias sret align 4 %agg.result, [4 x i64] %x.coerce) struct f7 func_f7(struct f7 x) { return x; } -// CHECK: define void @func_f8(%struct.f8* noalias sret %agg.result, [4 x i64] %x.coerce) +// CHECK: define void @func_f8(%struct.f8* noalias sret align 4 %agg.result, [4 x i64] %x.coerce) struct f8 func_f8(struct f8 x) { return x; } -// CHECK: define void @func_f9(%struct.f9* noalias sret %agg.result, [5 x i64] %x.coerce) +// CHECK: define void @func_f9(%struct.f9* noalias sret align 4 %agg.result, [5 x i64] %x.coerce) struct f9 func_f9(struct f9 x) { return x; } // CHECK-LE: define i64 @func_fab(i64 %x.coerce) -// CHECK-BE: define void @func_fab(%struct.fab* noalias sret %agg.result, i64 %x.coerce) +// CHECK-BE: define void @func_fab(%struct.fab* noalias sret align 4 %agg.result, i64 %x.coerce) struct fab func_fab(struct fab x) { return x; } // CHECK-LE: define { i64, i64 } @func_fabc([2 x i64] %x.coerce) -// CHECK-BE: define void @func_fabc(%struct.fabc* noalias sret %agg.result, [2 x i64] %x.coerce) +// CHECK-BE: define void @func_fabc(%struct.fabc* noalias sret align 4 %agg.result, [2 x i64] %x.coerce) struct fabc func_fabc(struct fabc x) { return x; } // CHECK-LE: define { i64, i64 } @func_f2a2b([2 x i64] %x.coerce) -// CHECK-BE: define void @func_f2a2b(%struct.f2a2b* noalias sret %agg.result, [2 x i64] %x.coerce) +// CHECK-BE: define void @func_f2a2b(%struct.f2a2b* noalias sret align 4 %agg.result, [2 x i64] %x.coerce) struct f2a2b func_f2a2b(struct f2a2b x) { return x; } // CHECK-LABEL: @call_f1 // CHECK-BE: %[[TMP0:[^ ]+]] = alloca %struct.f1, align 4 // CHECK: %[[TMP:[^ ]+]] = load float, float* getelementptr inbounds (%struct.f1, %struct.f1* @global_f1, i32 0, i32 0, i32 0), align 4 // CHECK-LE: call i32 @func_f1(float inreg %[[TMP]]) -// CHECK-BE: call void @func_f1(%struct.f1* sret %[[TMP0]], float inreg %[[TMP]]) +// CHECK-BE: call void @func_f1(%struct.f1* sret align 4 %[[TMP0]], float inreg %[[TMP]]) struct f1 global_f1; void call_f1(void) { global_f1 = func_f1(global_f1); } @@ -84,7 +84,7 @@ void call_f1(void) { global_f1 = func_f1(global_f1); } // CHECK-BE: %[[TMP0:[^ ]+]] = alloca %struct.f2, align 4 // CHECK: %[[TMP:[^ ]+]] = load i64, i64* bitcast (%struct.f2* @global_f2 to i64*), align 4 // CHECK-LE: call i64 @func_f2(i64 %[[TMP]]) -// CHECK-BE: call void @func_f2(%struct.f2* sret %[[TMP0]], i64 %[[TMP]]) +// CHECK-BE: call void @func_f2(%struct.f2* sret align 4 %[[TMP0]], i64 %[[TMP]]) struct f2 global_f2; void call_f2(void) { global_f2 = func_f2(global_f2); } @@ -95,7 +95,7 @@ void call_f2(void) { global_f2 = func_f2(global_f2); } // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f3* @global_f3 to i8*), i64 12, i1 false) // CHECK: %[[TMP3:[^ ]+]] = load [2 x i64], [2 x i64]* %[[TMP1]] // CHECK-LE: call { i64, i64 } @func_f3([2 x i64] %[[TMP3]]) -// CHECK-BE: call void @func_f3(%struct.f3* sret %[[TMP0]], [2 x i64] %[[TMP3]]) +// CHECK-BE: call void @func_f3(%struct.f3* sret align 4 %[[TMP0]], [2 x i64] %[[TMP3]]) struct f3 global_f3; void call_f3(void) { global_f3 = func_f3(global_f3); } @@ -103,7 +103,7 @@ void call_f3(void) { global_f3 = func_f3(global_f3); } // CHECK-BE: %[[TMP0:[^ ]+]] = alloca %struct.f4, align 4 // CHECK: %[[TMP:[^ ]+]] = load [2 x i64], [2 x i64]* bitcast (%struct.f4* @global_f4 to [2 x i64]*), align 4 // CHECK-LE: call { i64, 
i64 } @func_f4([2 x i64] %[[TMP]]) -// CHECK-BE: call void @func_f4(%struct.f4* sret %[[TMP0]], [2 x i64] %[[TMP]]) +// CHECK-BE: call void @func_f4(%struct.f4* sret align 4 %[[TMP0]], [2 x i64] %[[TMP]]) struct f4 global_f4; void call_f4(void) { global_f4 = func_f4(global_f4); } @@ -113,14 +113,14 @@ void call_f4(void) { global_f4 = func_f4(global_f4); } // CHECK: %[[TMP2:[^ ]+]] = bitcast [3 x i64]* %[[TMP1]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f5* @global_f5 to i8*), i64 20, i1 false) // CHECK: %[[TMP3:[^ ]+]] = load [3 x i64], [3 x i64]* %[[TMP1]] -// CHECK: call void @func_f5(%struct.f5* sret %[[TMP0]], [3 x i64] %[[TMP3]]) +// CHECK: call void @func_f5(%struct.f5* sret align 4 %[[TMP0]], [3 x i64] %[[TMP3]]) struct f5 global_f5; void call_f5(void) { global_f5 = func_f5(global_f5); } // CHECK-LABEL: @call_f6 // CHECK: %[[TMP0:[^ ]+]] = alloca %struct.f6, align 4 // CHECK: %[[TMP:[^ ]+]] = load [3 x i64], [3 x i64]* bitcast (%struct.f6* @global_f6 to [3 x i64]*), align 4 -// CHECK: call void @func_f6(%struct.f6* sret %[[TMP0]], [3 x i64] %[[TMP]]) +// CHECK: call void @func_f6(%struct.f6* sret align 4 %[[TMP0]], [3 x i64] %[[TMP]]) struct f6 global_f6; void call_f6(void) { global_f6 = func_f6(global_f6); } @@ -130,14 +130,14 @@ void call_f6(void) { global_f6 = func_f6(global_f6); } // CHECK: %[[TMP2:[^ ]+]] = bitcast [4 x i64]* %[[TMP1]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f7* @global_f7 to i8*), i64 28, i1 false) // CHECK: %[[TMP3:[^ ]+]] = load [4 x i64], [4 x i64]* %[[TMP1]], align 8 -// CHECK: call void @func_f7(%struct.f7* sret %[[TMP0]], [4 x i64] %[[TMP3]]) +// CHECK: call void @func_f7(%struct.f7* sret align 4 %[[TMP0]], [4 x i64] %[[TMP3]]) struct f7 global_f7; void call_f7(void) { global_f7 = func_f7(global_f7); } // CHECK-LABEL: @call_f8 // CHECK: %[[TMP0:[^ ]+]] = alloca %struct.f8, align 4 // CHECK: %[[TMP:[^ ]+]] = load [4 x i64], [4 x i64]* bitcast (%struct.f8* @global_f8 to [4 x i64]*), align 4 -// CHECK: call void @func_f8(%struct.f8* sret %[[TMP0]], [4 x i64] %[[TMP]]) +// CHECK: call void @func_f8(%struct.f8* sret align 4 %[[TMP0]], [4 x i64] %[[TMP]]) struct f8 global_f8; void call_f8(void) { global_f8 = func_f8(global_f8); } @@ -146,7 +146,7 @@ void call_f8(void) { global_f8 = func_f8(global_f8); } // CHECK: %[[TMP2:[^ ]+]] = bitcast [5 x i64]* %[[TMP1]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f9* @global_f9 to i8*), i64 36, i1 false) // CHECK: %[[TMP3:[^ ]+]] = load [5 x i64], [5 x i64]* %[[TMP1]] -// CHECK: call void @func_f9(%struct.f9* sret %{{[^ ]+}}, [5 x i64] %[[TMP3]]) +// CHECK: call void @func_f9(%struct.f9* sret align 4 %{{[^ ]+}}, [5 x i64] %[[TMP3]]) struct f9 global_f9; void call_f9(void) { global_f9 = func_f9(global_f9); } @@ -154,7 +154,7 @@ void call_f9(void) { global_f9 = func_f9(global_f9); } // CHECK: %[[TMP0:[^ ]+]] = alloca %struct.fab, align 4 // CHECK: %[[TMP:[^ ]+]] = load i64, i64* bitcast (%struct.fab* @global_fab to i64*), align 4 // CHECK-LE: %call = call i64 @func_fab(i64 %[[TMP]]) -// CHECK-BE: call void @func_fab(%struct.fab* sret %[[TMP0]], i64 %[[TMP]]) +// CHECK-BE: call void @func_fab(%struct.fab* sret align 4 %[[TMP0]], i64 %[[TMP]]) struct fab global_fab; void call_fab(void) { global_fab = func_fab(global_fab); } @@ -165,7 +165,7 @@ void call_fab(void) { global_fab = func_fab(global_fab); } // CHECK: call void 
@llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.fabc* @global_fabc to i8*), i64 12, i1 false) // CHECK: %[[TMP3:[^ ]+]] = load [2 x i64], [2 x i64]* %[[TMP0]], align 8 // CHECK-LE: %call = call { i64, i64 } @func_fabc([2 x i64] %[[TMP3]]) -// CHECK-BE: call void @func_fabc(%struct.fabc* sret %[[TMPX]], [2 x i64] %[[TMP3]]) +// CHECK-BE: call void @func_fabc(%struct.fabc* sret align 4 %[[TMPX]], [2 x i64] %[[TMP3]]) struct fabc global_fabc; void call_fabc(void) { global_fabc = func_fabc(global_fabc); } diff --git a/clang/test/CodeGen/ppc64-vector.c b/clang/test/CodeGen/ppc64-vector.c index 7ed0beade4cd0..7ea5b007d5bfc 100644 --- a/clang/test/CodeGen/ppc64-vector.c +++ b/clang/test/CodeGen/ppc64-vector.c @@ -39,13 +39,13 @@ v8i16 test_v8i16(v8i16 x) return x; } -// CHECK: define void @test_v16i16(<16 x i16>* noalias sret %agg.result, <16 x i16>* %0) +// CHECK: define void @test_v16i16(<16 x i16>* noalias sret align 32 %agg.result, <16 x i16>* %0) v16i16 test_v16i16(v16i16 x) { return x; } -// CHECK: define void @test_struct_v16i16(%struct.v16i16* noalias sret %agg.result, [2 x i128] %x.coerce) +// CHECK: define void @test_struct_v16i16(%struct.v16i16* noalias sret align 32 %agg.result, [2 x i128] %x.coerce) struct v16i16 test_struct_v16i16(struct v16i16 x) { return x; diff --git a/clang/test/CodeGen/ppc64le-aggregates.c b/clang/test/CodeGen/ppc64le-aggregates.c index e36faa2b80258..ea32d69b7cf95 100644 --- a/clang/test/CodeGen/ppc64le-aggregates.c +++ b/clang/test/CodeGen/ppc64le-aggregates.c @@ -41,7 +41,7 @@ struct f7 func_f7(struct f7 x) { return x; } // CHECK: define [8 x float] @func_f8([8 x float] %x.coerce) struct f8 func_f8(struct f8 x) { return x; } -// CHECK: define void @func_f9(%struct.f9* noalias sret %agg.result, [5 x i64] %x.coerce) +// CHECK: define void @func_f9(%struct.f9* noalias sret align 4 %agg.result, [5 x i64] %x.coerce) struct f9 func_f9(struct f9 x) { return x; } // CHECK: define [2 x float] @func_fab([2 x float] %x.coerce) @@ -106,7 +106,7 @@ void call_f8(void) { global_f8 = func_f8(global_f8); } // CHECK: %[[TMP2:[^ ]+]] = bitcast [5 x i64]* %[[TMP1]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %[[TMP2]], i8* align 4 bitcast (%struct.f9* @global_f9 to i8*), i64 36, i1 false) // CHECK: %[[TMP3:[^ ]+]] = load [5 x i64], [5 x i64]* %[[TMP1]] -// CHECK: call void @func_f9(%struct.f9* sret %{{[^ ]+}}, [5 x i64] %[[TMP3]]) +// CHECK: call void @func_f9(%struct.f9* sret align 4 %{{[^ ]+}}, [5 x i64] %[[TMP3]]) struct f9 global_f9; void call_f9(void) { global_f9 = func_f9(global_f9); } @@ -162,7 +162,7 @@ struct v7 func_v7(struct v7 x) { return x; } // CHECK: define [8 x <4 x i32>] @func_v8([8 x <4 x i32>] %x.coerce) struct v8 func_v8(struct v8 x) { return x; } -// CHECK: define void @func_v9(%struct.v9* noalias sret %agg.result, %struct.v9* byval(%struct.v9) align 16 %x) +// CHECK: define void @func_v9(%struct.v9* noalias sret align 16 %agg.result, %struct.v9* byval(%struct.v9) align 16 %x) struct v9 func_v9(struct v9 x) { return x; } // CHECK: define [2 x <4 x i32>] @func_vab([2 x <4 x i32>] %x.coerce) @@ -220,7 +220,7 @@ struct v8 global_v8; void call_v8(void) { global_v8 = func_v8(global_v8); } // CHECK-LABEL: @call_v9 -// CHECK: call void @func_v9(%struct.v9* sret %{{[^ ]+}}, %struct.v9* byval(%struct.v9) align 16 @global_v9) +// CHECK: call void @func_v9(%struct.v9* sret align 16 %{{[^ ]+}}, %struct.v9* byval(%struct.v9) align 16 @global_v9) struct v9 global_v9; void call_v9(void) { global_v9 = func_v9(global_v9); 
} @@ -279,7 +279,7 @@ struct v3f7 func_v3f7(struct v3f7 x) { return x; } // CHECK: define [8 x <4 x float>] @func_v3f8([8 x <4 x float>] %x.coerce) struct v3f8 func_v3f8(struct v3f8 x) { return x; } -// CHECK: define void @func_v3f9(%struct.v3f9* noalias sret %agg.result, %struct.v3f9* byval(%struct.v3f9) align 16 %x) +// CHECK: define void @func_v3f9(%struct.v3f9* noalias sret align 16 %agg.result, %struct.v3f9* byval(%struct.v3f9) align 16 %x) struct v3f9 func_v3f9(struct v3f9 x) { return x; } // CHECK: define [2 x <4 x float>] @func_v3fab([2 x <4 x float>] %x.coerce) @@ -337,7 +337,7 @@ struct v3f8 global_v3f8; void call_v3f8(void) { global_v3f8 = func_v3f8(global_v3f8); } // CHECK-LABEL: @call_v3f9 -// CHECK: call void @func_v3f9(%struct.v3f9* sret %{{[^ ]+}}, %struct.v3f9* byval(%struct.v3f9) align 16 @global_v3f9) +// CHECK: call void @func_v3f9(%struct.v3f9* sret align 16 %{{[^ ]+}}, %struct.v3f9* byval(%struct.v3f9) align 16 @global_v3f9) struct v3f9 global_v3f9; void call_v3f9(void) { global_v3f9 = func_v3f9(global_v3f9); } diff --git a/clang/test/CodeGen/ppc64le-f128Aggregates.c b/clang/test/CodeGen/ppc64le-f128Aggregates.c index 3b363bf0f2eac..acebea69b31dc 100644 --- a/clang/test/CodeGen/ppc64le-f128Aggregates.c +++ b/clang/test/CodeGen/ppc64le-f128Aggregates.c @@ -42,7 +42,7 @@ struct fp7 func_f7(struct fp7 x) { return x; } // CHECK: define [8 x fp128] @func_f8([8 x fp128] %x.coerce) struct fp8 func_f8(struct fp8 x) { return x; } -// CHECK: define void @func_f9(%struct.fp9* noalias sret %agg.result, %struct.fp9* byval(%struct.fp9) align 16 %x) +// CHECK: define void @func_f9(%struct.fp9* noalias sret align 16 %agg.result, %struct.fp9* byval(%struct.fp9) align 16 %x) struct fp9 func_f9(struct fp9 x) { return x; } // CHECK: define [2 x fp128] @func_fab([2 x fp128] %x.coerce) @@ -104,7 +104,7 @@ void call_fp8(void) { global_f8 = func_f8(global_f8); } // CHECK-LABEL: @call_fp9 // CHECK: %[[TMP1:[^ ]+]] = alloca %struct.fp9, align 16 -// CHECK: call void @func_f9(%struct.fp9* sret %[[TMP2:[^ ]+]], %struct.fp9* byval(%struct.fp9) align 16 @global_f9 +// CHECK: call void @func_f9(%struct.fp9* sret align 16 %[[TMP2:[^ ]+]], %struct.fp9* byval(%struct.fp9) align 16 @global_f9 // CHECK: %[[TMP3:[^ ]+]] = bitcast %struct.fp9* %[[TMP2]] to i8* // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 bitcast (%struct.fp9* @global_f9 to i8*), i8* align 16 %[[TMP3]], i64 144, i1 false // CHECK: ret void diff --git a/clang/test/CodeGen/regparm-struct.c b/clang/test/CodeGen/regparm-struct.c index 7f56ae094a69c..8c74c8b1f0586 100644 --- a/clang/test/CodeGen/regparm-struct.c +++ b/clang/test/CodeGen/regparm-struct.c @@ -159,7 +159,7 @@ void g16(void) { } __attribute__((regparm(3))) struct s12 f17(int a, int b, int c); -// CHECK: declare void @f17(%struct.s12* inreg sret, i32 inreg, i32 inreg, i32) +// CHECK: declare void @f17(%struct.s12* inreg sret align 4, i32 inreg, i32 inreg, i32) void g17(void) { f17(41, 42, 43); } diff --git a/clang/test/CodeGen/renderscript.c b/clang/test/CodeGen/renderscript.c index a85dc35c61496..fee97a154344e 100644 --- a/clang/test/CodeGen/renderscript.c +++ b/clang/test/CodeGen/renderscript.c @@ -83,15 +83,15 @@ void argLongInt(sLongInt s) {} // and coerced to [a x iNN] for 64-bit RenderScript // ============================================================================= -// CHECK-RS32: void @retShortCharShort(%struct.sShortCharShort* noalias sret %agg.result) +// CHECK-RS32: void @retShortCharShort(%struct.sShortCharShort* noalias sret align 2 %agg.result) // 
CHECK-RS64: [3 x i16] @retShortCharShort() sShortCharShort retShortCharShort() { sShortCharShort r; return r; } -// CHECK-RS32: void @retIntShortChar(%struct.sIntShortChar* noalias sret %agg.result) +// CHECK-RS32: void @retIntShortChar(%struct.sIntShortChar* noalias sret align 4 %agg.result) // CHECK-RS64: [2 x i32] @retIntShortChar() sIntShortChar retIntShortChar() { sIntShortChar r; return r; } -// CHECK-RS32: void @retLongInt(%struct.sLongInt* noalias sret %agg.result) +// CHECK-RS32: void @retLongInt(%struct.sLongInt* noalias sret align 8 %agg.result) // CHECK-RS64: [2 x i64] @retLongInt() sLongInt retLongInt() { sLongInt r; return r; } @@ -116,12 +116,12 @@ void argLong2Char(sLong2Char s) {} // 64-bit RenderScript // ============================================================================= -// CHECK-RS32: void @retInt5(%struct.sInt5* noalias sret %agg.result) -// CHECK-RS64: void @retInt5(%struct.sInt5* noalias sret %agg.result) +// CHECK-RS32: void @retInt5(%struct.sInt5* noalias sret align 4 %agg.result) +// CHECK-RS64: void @retInt5(%struct.sInt5* noalias sret align 4 %agg.result) sInt5 retInt5() { sInt5 r; return r;} -// CHECK-RS32: void @retLong2Char(%struct.sLong2Char* noalias sret %agg.result) -// CHECK-RS64: void @retLong2Char(%struct.sLong2Char* noalias sret %agg.result) +// CHECK-RS32: void @retLong2Char(%struct.sLong2Char* noalias sret align 8 %agg.result) +// CHECK-RS64: void @retLong2Char(%struct.sLong2Char* noalias sret align 8 %agg.result) sLong2Char retLong2Char() { sLong2Char r; return r;} // ============================================================================= @@ -135,6 +135,6 @@ typedef struct {long l1, l2, l3, l4, l5, l6, l7, l8, l9; } sLong9; // CHECK-RS64: void @argLong9(%struct.sLong9* %s) void argLong9(sLong9 s) {} -// CHECK-RS32: void @retLong9(%struct.sLong9* noalias sret %agg.result) -// CHECK-RS64: void @retLong9(%struct.sLong9* noalias sret %agg.result) +// CHECK-RS32: void @retLong9(%struct.sLong9* noalias sret align 8 %agg.result) +// CHECK-RS64: void @retLong9(%struct.sLong9* noalias sret align 8 %agg.result) sLong9 retLong9() { sLong9 r; return r; } diff --git a/clang/test/CodeGen/riscv-sdata-module-flag.c b/clang/test/CodeGen/riscv-sdata-module-flag.c new file mode 100644 index 0000000000000..4ad83b2c6d7cb --- /dev/null +++ b/clang/test/CodeGen/riscv-sdata-module-flag.c @@ -0,0 +1,48 @@ +// RUN: %clang -target riscv32-unknown-elf %s -S -emit-llvm -o - \ +// RUN: | FileCheck %s -check-prefix=RV32-DEFAULT +// RUN: %clang -target riscv32-unknown-elf %s -S -emit-llvm -G4 -o - \ +// RUN: | FileCheck %s -check-prefix=RV32-G4 +// RUN: %clang -target riscv32-unknown-elf %s -S -emit-llvm -msmall-data-limit=0 -o - \ +// RUN: | FileCheck %s -check-prefix=RV32-S0 +// RUN: %clang -target riscv32-unknown-elf %s -S -emit-llvm -msmall-data-limit=2 -G4 -o - \ +// RUN: | FileCheck %s -check-prefix=RV32-S2G4 +// RUN: %clang -target riscv32-unknown-elf %s -S -emit-llvm -msmall-data-threshold=16 -o - \ +// RUN: | FileCheck %s -check-prefix=RV32-T16 +// RUN: %clang -target riscv32-unknown-elf %s -S -emit-llvm -fpic -o - \ +// RUN: | FileCheck %s -check-prefix=RV32-PIC + +// RUN: %clang -target riscv64-unknown-elf %s -S -emit-llvm -o - \ +// RUN: | FileCheck %s -check-prefix=RV64-DEFAULT +// RUN: %clang -target riscv64-unknown-elf %s -S -emit-llvm -G4 -o - \ +// RUN: | FileCheck %s -check-prefix=RV64-G4 +// RUN: %clang -target riscv64-unknown-elf %s -S -emit-llvm -msmall-data-limit=0 -o - \ +// RUN: | FileCheck %s -check-prefix=RV64-S0 +// RUN: %clang -target 
riscv64-unknown-elf %s -S -emit-llvm -msmall-data-limit=2 -G4 -o - \ +// RUN: | FileCheck %s -check-prefix=RV64-S2G4 +// RUN: %clang -target riscv64-unknown-elf %s -S -emit-llvm -msmall-data-threshold=16 -o - \ +// RUN: | FileCheck %s -check-prefix=RV64-T16 +// RUN: %clang -target riscv64-unknown-elf %s -S -emit-llvm -fpic -o - \ +// RUN: | FileCheck %s -check-prefix=RV64-PIC +// RUN: %clang -target riscv64-unknown-elf %s -S -emit-llvm -mcmodel=large -o - \ +// RUN: | FileCheck %s -check-prefix=RV64-LARGE + +void test() {} + +// RV32-DEFAULT: !{i32 1, !"SmallDataLimit", i32 8} +// RV32-G4: !{i32 1, !"SmallDataLimit", i32 4} +// RV32-S0: !{i32 1, !"SmallDataLimit", i32 0} +// RV32-S2G4: !{i32 1, !"SmallDataLimit", i32 4} +// RV32-T16: !{i32 1, !"SmallDataLimit", i32 16} +// RV32-PIC: !{i32 1, !"SmallDataLimit", i32 0} + +// RV64-DEFAULT: !{i32 1, !"SmallDataLimit", i32 8} +// RV64-G4: !{i32 1, !"SmallDataLimit", i32 4} +// RV64-S0: !{i32 1, !"SmallDataLimit", i32 0} +// RV64-S2G4: !{i32 1, !"SmallDataLimit", i32 4} +// RV64-T16: !{i32 1, !"SmallDataLimit", i32 16} +// RV64-PIC: !{i32 1, !"SmallDataLimit", i32 0} +// RV64-LARGE: !{i32 1, !"SmallDataLimit", i32 0} + +// The value will be passed by module flag instead of target feature. +// RV32-S0-NOT: +small-data-limit= +// RV64-S0-NOT: +small-data-limit= diff --git a/clang/test/CodeGen/riscv32-ilp32-abi.c b/clang/test/CodeGen/riscv32-ilp32-abi.c index 59f0bb5683726..1b32024f51582 100644 --- a/clang/test/CodeGen/riscv32-ilp32-abi.c +++ b/clang/test/CodeGen/riscv32-ilp32-abi.c @@ -35,7 +35,7 @@ int f_scalar_stack_1(int32_t a, int64_t b, float c, double d, long double e, // the presence of large return values that consume a register due to the need // to pass a pointer. -// CHECK-LABEL: define void @f_scalar_stack_2(%struct.large* noalias sret %agg.result, float %a, i64 %b, double %c, fp128 %d, i8 zeroext %e, i8 %f, i8 %g) +// CHECK-LABEL: define void @f_scalar_stack_2(%struct.large* noalias sret align 4 %agg.result, float %a, i64 %b, double %c, fp128 %d, i8 zeroext %e, i8 %f, i8 %g) struct large f_scalar_stack_2(float a, int64_t b, double c, long double d, uint8_t e, int8_t f, uint8_t g) { return (struct large){a, e, f, g}; diff --git a/clang/test/CodeGen/riscv32-ilp32-ilp32f-abi.c b/clang/test/CodeGen/riscv32-ilp32-ilp32f-abi.c index 677040626f578..225b12358a0e3 100644 --- a/clang/test/CodeGen/riscv32-ilp32-ilp32f-abi.c +++ b/clang/test/CodeGen/riscv32-ilp32-ilp32f-abi.c @@ -37,7 +37,7 @@ int f_scalar_stack_1(int32_t a, int64_t b, int32_t c, double d, long double e, // the presence of large return values that consume a register due to the need // to pass a pointer. 
-// CHECK-LABEL: define void @f_scalar_stack_2(%struct.large* noalias sret %agg.result, i32 %a, i64 %b, double %c, fp128 %d, i8 zeroext %e, i8 %f, i8 %g) +// CHECK-LABEL: define void @f_scalar_stack_2(%struct.large* noalias sret align 4 %agg.result, i32 %a, i64 %b, double %c, fp128 %d, i8 zeroext %e, i8 %f, i8 %g) struct large f_scalar_stack_2(int32_t a, int64_t b, double c, long double d, uint8_t e, int8_t f, uint8_t g) { return (struct large){a, e, f, g}; diff --git a/clang/test/CodeGen/riscv32-ilp32-ilp32f-ilp32d-abi.c b/clang/test/CodeGen/riscv32-ilp32-ilp32f-ilp32d-abi.c index 86ad8fd370bca..740079d28d3be 100644 --- a/clang/test/CodeGen/riscv32-ilp32-ilp32f-ilp32d-abi.c +++ b/clang/test/CodeGen/riscv32-ilp32-ilp32f-ilp32d-abi.c @@ -177,7 +177,7 @@ void f_agg_large(struct large x) { // The address where the struct should be written to will be the first // argument -// CHECK-LABEL: define void @f_agg_large_ret(%struct.large* noalias sret %agg.result, i32 %i, i8 signext %j) +// CHECK-LABEL: define void @f_agg_large_ret(%struct.large* noalias sret align 4 %agg.result, i32 %i, i8 signext %j) struct large f_agg_large_ret(int32_t i, int8_t j) { return (struct large){1, 2, 3, 4}; } @@ -189,7 +189,7 @@ void f_vec_large_v16i8(v16i8 x) { x[0] = x[7]; } -// CHECK-LABEL: define void @f_vec_large_v16i8_ret(<16 x i8>* noalias sret %agg.result) +// CHECK-LABEL: define void @f_vec_large_v16i8_ret(<16 x i8>* noalias sret align 16 %agg.result) v16i8 f_vec_large_v16i8_ret() { return (v16i8){1, 2, 3, 4, 5, 6, 7, 8}; } @@ -207,7 +207,7 @@ int f_scalar_stack_1(struct tiny a, struct small b, struct small_aligned c, // the presence of large return values that consume a register due to the need // to pass a pointer. -// CHECK-LABEL: define void @f_scalar_stack_2(%struct.large* noalias sret %agg.result, i32 %a, i64 %b, i64 %c, fp128 %d, i8 zeroext %e, i8 %f, i8 %g) +// CHECK-LABEL: define void @f_scalar_stack_2(%struct.large* noalias sret align 4 %agg.result, i32 %a, i64 %b, i64 %c, fp128 %d, i8 zeroext %e, i8 %f, i8 %g) struct large f_scalar_stack_2(int32_t a, int64_t b, int64_t c, long double d, uint8_t e, int8_t f, uint8_t g) { return (struct large){a, e, f, g}; diff --git a/clang/test/CodeGen/riscv32-ilp32d-abi.c b/clang/test/CodeGen/riscv32-ilp32d-abi.c index b5b451cee1517..02398e66766f1 100644 --- a/clang/test/CodeGen/riscv32-ilp32d-abi.c +++ b/clang/test/CodeGen/riscv32-ilp32d-abi.c @@ -119,7 +119,7 @@ struct double_int32_s f_ret_double_int32_s() { // CHECK: define void @f_double_int64_s_arg(%struct.double_int64_s* %a) void f_double_int64_s_arg(struct double_int64_s a) {} -// CHECK: define void @f_ret_double_int64_s(%struct.double_int64_s* noalias sret %agg.result) +// CHECK: define void @f_ret_double_int64_s(%struct.double_int64_s* noalias sret align 8 %agg.result) struct double_int64_s f_ret_double_int64_s() { return (struct double_int64_s){1.0, 2}; } @@ -243,7 +243,7 @@ struct int_double_int_s { int a; double b; int c; }; // CHECK: define void @f_int_double_int_s_arg(%struct.int_double_int_s* %a) void f_int_double_int_s_arg(struct int_double_int_s a) {} -// CHECK: define void @f_ret_int_double_int_s(%struct.int_double_int_s* noalias sret %agg.result) +// CHECK: define void @f_ret_int_double_int_s(%struct.int_double_int_s* noalias sret align 8 %agg.result) struct int_double_int_s f_ret_int_double_int_s() { return (struct int_double_int_s){1, 2.0, 3}; } @@ -253,7 +253,7 @@ struct int64_double_s { int64_t a; double b; }; // CHECK: define void @f_int64_double_s_arg(%struct.int64_double_s* %a) void 
f_int64_double_s_arg(struct int64_double_s a) {} -// CHECK: define void @f_ret_int64_double_s(%struct.int64_double_s* noalias sret %agg.result) +// CHECK: define void @f_ret_int64_double_s(%struct.int64_double_s* noalias sret align 8 %agg.result) struct int64_double_s f_ret_int64_double_s() { return (struct int64_double_s){1, 2.0}; } @@ -263,7 +263,7 @@ struct char_char_double_s { char a; char b; double c; }; // CHECK-LABEL: define void @f_char_char_double_s_arg(%struct.char_char_double_s* %a) void f_char_char_double_s_arg(struct char_char_double_s a) {} -// CHECK: define void @f_ret_char_char_double_s(%struct.char_char_double_s* noalias sret %agg.result) +// CHECK: define void @f_ret_char_char_double_s(%struct.char_char_double_s* noalias sret align 8 %agg.result) struct char_char_double_s f_ret_char_char_double_s() { return (struct char_char_double_s){1, 2, 3.0}; } diff --git a/clang/test/CodeGen/riscv32-ilp32f-abi.c b/clang/test/CodeGen/riscv32-ilp32f-abi.c index 76092958aeddf..c8e6418b9daae 100644 --- a/clang/test/CodeGen/riscv32-ilp32f-abi.c +++ b/clang/test/CodeGen/riscv32-ilp32f-abi.c @@ -26,7 +26,7 @@ struct double_double_s { double d; double e; }; // CHECK: define void @f_double_double_s_arg(%struct.double_double_s* %a) void f_double_double_s_arg(struct double_double_s a) {} -// CHECK: define void @f_ret_double_double_s(%struct.double_double_s* noalias sret %agg.result) +// CHECK: define void @f_ret_double_double_s(%struct.double_double_s* noalias sret align 8 %agg.result) struct double_double_s f_ret_double_double_s() { return (struct double_double_s){1.0, 2.0}; } @@ -38,7 +38,7 @@ struct int_double_s { int a; double b; }; // CHECK: define void @f_int_double_s_arg(%struct.int_double_s* %a) void f_int_double_s_arg(struct int_double_s a) {} -// CHECK: define void @f_ret_int_double_s(%struct.int_double_s* noalias sret %agg.result) +// CHECK: define void @f_ret_int_double_s(%struct.int_double_s* noalias sret align 8 %agg.result) struct int_double_s f_ret_int_double_s() { return (struct int_double_s){1, 2.0}; } diff --git a/clang/test/CodeGen/riscv32-ilp32f-ilp32d-abi.c b/clang/test/CodeGen/riscv32-ilp32f-ilp32d-abi.c index e9705ca3d62b3..419bd87fdecfa 100644 --- a/clang/test/CodeGen/riscv32-ilp32f-ilp32d-abi.c +++ b/clang/test/CodeGen/riscv32-ilp32f-ilp32d-abi.c @@ -112,7 +112,7 @@ struct float_int32_s f_ret_float_int32_s() { // CHECK: define void @f_float_int64_s_arg(%struct.float_int64_s* %a) void f_float_int64_s_arg(struct float_int64_s a) {} -// CHECK: define void @f_ret_float_int64_s(%struct.float_int64_s* noalias sret %agg.result) +// CHECK: define void @f_ret_float_int64_s(%struct.float_int64_s* noalias sret align 8 %agg.result) struct float_int64_s f_ret_float_int64_s() { return (struct float_int64_s){1.0, 2}; } @@ -236,7 +236,7 @@ struct int_float_int_s { int a; float b; int c; }; // CHECK: define void @f_int_float_int_s_arg(%struct.int_float_int_s* %a) void f_int_float_int_s_arg(struct int_float_int_s a) {} -// CHECK: define void @f_ret_int_float_int_s(%struct.int_float_int_s* noalias sret %agg.result) +// CHECK: define void @f_ret_int_float_int_s(%struct.int_float_int_s* noalias sret align 4 %agg.result) struct int_float_int_s f_ret_int_float_int_s() { return (struct int_float_int_s){1, 2.0, 3}; } @@ -246,7 +246,7 @@ struct int64_float_s { int64_t a; float b; }; // CHECK: define void @f_int64_float_s_arg(%struct.int64_float_s* %a) void f_int64_float_s_arg(struct int64_float_s a) {} -// CHECK: define void @f_ret_int64_float_s(%struct.int64_float_s* noalias sret %agg.result) 
+// CHECK: define void @f_ret_int64_float_s(%struct.int64_float_s* noalias sret align 8 %agg.result) struct int64_float_s f_ret_int64_float_s() { return (struct int64_float_s){1, 2.0}; } diff --git a/clang/test/CodeGen/riscv64-lp64-abi.c b/clang/test/CodeGen/riscv64-lp64-abi.c index bae5470c377d9..8347056c54d35 100644 --- a/clang/test/CodeGen/riscv64-lp64-abi.c +++ b/clang/test/CodeGen/riscv64-lp64-abi.c @@ -25,7 +25,7 @@ int f_scalar_stack_1(int32_t a, __int128_t b, float c, long double d, v32i8 e, // the presence of large return values that consume a register due to the need // to pass a pointer. -// CHECK-LABEL: define void @f_scalar_stack_2(%struct.large* noalias sret %agg.result, double %a, i128 %b, fp128 %c, <32 x i8>* %0, i8 zeroext %e, i8 %f, i8 %g) +// CHECK-LABEL: define void @f_scalar_stack_2(%struct.large* noalias sret align 8 %agg.result, double %a, i128 %b, fp128 %c, <32 x i8>* %0, i8 zeroext %e, i8 %f, i8 %g) struct large f_scalar_stack_2(double a, __int128_t b, long double c, v32i8 d, uint8_t e, int8_t f, uint8_t g) { return (struct large){a, e, f, g}; diff --git a/clang/test/CodeGen/riscv64-lp64-lp64f-abi.c b/clang/test/CodeGen/riscv64-lp64-lp64f-abi.c index d9c909e88bd8f..489d0e83dcbc5 100644 --- a/clang/test/CodeGen/riscv64-lp64-lp64f-abi.c +++ b/clang/test/CodeGen/riscv64-lp64-lp64f-abi.c @@ -27,7 +27,7 @@ int f_scalar_stack_1(int32_t a, __int128_t b, double c, long double d, v32i8 e, // the presence of large return values that consume a register due to the need // to pass a pointer. -// CHECK-LABEL: define void @f_scalar_stack_2(%struct.large* noalias sret %agg.result, double %a, i128 %b, fp128 %c, <32 x i8>* %0, i8 zeroext %e, i8 %f, i8 %g) +// CHECK-LABEL: define void @f_scalar_stack_2(%struct.large* noalias sret align 8 %agg.result, double %a, i128 %b, fp128 %c, <32 x i8>* %0, i8 zeroext %e, i8 %f, i8 %g) struct large f_scalar_stack_2(double a, __int128_t b, long double c, v32i8 d, uint8_t e, int8_t f, uint8_t g) { return (struct large){a, e, f, g}; diff --git a/clang/test/CodeGen/riscv64-lp64-lp64f-lp64d-abi.c b/clang/test/CodeGen/riscv64-lp64-lp64f-lp64d-abi.c index f50a8ca905757..8e263aeba25c5 100644 --- a/clang/test/CodeGen/riscv64-lp64-lp64f-lp64d-abi.c +++ b/clang/test/CodeGen/riscv64-lp64-lp64f-lp64d-abi.c @@ -166,7 +166,7 @@ void f_agg_large(struct large x) { // The address where the struct should be written to will be the first // argument -// CHECK-LABEL: define void @f_agg_large_ret(%struct.large* noalias sret %agg.result, i32 signext %i, i8 signext %j) +// CHECK-LABEL: define void @f_agg_large_ret(%struct.large* noalias sret align 8 %agg.result, i32 signext %i, i8 signext %j) struct large f_agg_large_ret(int32_t i, int8_t j) { return (struct large){1, 2, 3, 4}; } @@ -178,7 +178,7 @@ void f_vec_large_v32i8(v32i8 x) { x[0] = x[7]; } -// CHECK-LABEL: define void @f_vec_large_v32i8_ret(<32 x i8>* noalias sret %agg.result) +// CHECK-LABEL: define void @f_vec_large_v32i8_ret(<32 x i8>* noalias sret align 32 %agg.result) v32i8 f_vec_large_v32i8_ret() { return (v32i8){1, 2, 3, 4, 5, 6, 7, 8}; } @@ -202,7 +202,7 @@ int f_scalar_stack_2(int32_t a, __int128_t b, int64_t c, long double d, v32i8 e, // the presence of large return values that consume a register due to the need // to pass a pointer. 
-// CHECK-LABEL: define void @f_scalar_stack_3(%struct.large* noalias sret %agg.result, i32 signext %a, i128 %b, fp128 %c, <32 x i8>* %0, i8 zeroext %e, i8 %f, i8 %g) +// CHECK-LABEL: define void @f_scalar_stack_3(%struct.large* noalias sret align 8 %agg.result, i32 signext %a, i128 %b, fp128 %c, <32 x i8>* %0, i8 zeroext %e, i8 %f, i8 %g) struct large f_scalar_stack_3(uint32_t a, __int128_t b, long double c, v32i8 d, uint8_t e, int8_t f, uint8_t g) { return (struct large){a, e, f, g}; diff --git a/clang/test/CodeGen/riscv64-lp64d-abi.c b/clang/test/CodeGen/riscv64-lp64d-abi.c index 83947def30851..ec47428e6ccab 100644 --- a/clang/test/CodeGen/riscv64-lp64d-abi.c +++ b/clang/test/CodeGen/riscv64-lp64d-abi.c @@ -243,7 +243,7 @@ struct int_double_int_s { int a; double b; int c; }; // CHECK: define void @f_int_double_int_s_arg(%struct.int_double_int_s* %a) void f_int_double_int_s_arg(struct int_double_int_s a) {} -// CHECK: define void @f_ret_int_double_int_s(%struct.int_double_int_s* noalias sret %agg.result) +// CHECK: define void @f_ret_int_double_int_s(%struct.int_double_int_s* noalias sret align 8 %agg.result) struct int_double_int_s f_ret_int_double_int_s() { return (struct int_double_int_s){1, 2.0, 3}; } diff --git a/clang/test/CodeGen/sparcv9-abi.c b/clang/test/CodeGen/sparcv9-abi.c index 5984fa558c83c..2d97001ab1ae4 100644 --- a/clang/test/CodeGen/sparcv9-abi.c +++ b/clang/test/CodeGen/sparcv9-abi.c @@ -53,7 +53,7 @@ struct large { int x; }; -// CHECK-LABEL: define void @f_large(%struct.large* noalias sret %agg.result, %struct.large* %x) +// CHECK-LABEL: define void @f_large(%struct.large* noalias sret align 8 %agg.result, %struct.large* %x) struct large f_large(struct large x) { x.a += *x.b; x.b = 0; diff --git a/clang/test/CodeGen/struct-passing.c b/clang/test/CodeGen/struct-passing.c index 80847b9fea64f..e3108b964bd26 100644 --- a/clang/test/CodeGen/struct-passing.c +++ b/clang/test/CodeGen/struct-passing.c @@ -18,8 +18,8 @@ void *ps[] = { f0, f1, f2, f3, f4, f5 }; // CHECK: declare i32 @f0() [[RN:#[0-9]+]] // CHECK: declare i32 @f1() [[RO:#[0-9]+]] -// CHECK: declare void @f2({{.*}} sret) -// CHECK: declare void @f3({{.*}} sret) +// CHECK: declare void @f2({{.*}} sret align 4) +// CHECK: declare void @f3({{.*}} sret align 4) // CHECK: declare void @f4({{.*}} byval({{.*}}) align 4) // CHECK: declare void @f5({{.*}} byval({{.*}}) align 4) diff --git a/clang/test/CodeGen/systemz-abi-vector.c b/clang/test/CodeGen/systemz-abi-vector.c index f2e6c13c718f5..896cc0994d6df 100644 --- a/clang/test/CodeGen/systemz-abi-vector.c +++ b/clang/test/CodeGen/systemz-abi-vector.c @@ -50,91 +50,91 @@ unsigned int align = __alignof__ (v16i8); // CHECK-VECTOR: @align = global i32 8 v1i8 pass_v1i8(v1i8 arg) { return arg; } -// CHECK-LABEL: define void @pass_v1i8(<1 x i8>* noalias sret %{{.*}}, <1 x i8>* %0) +// CHECK-LABEL: define void @pass_v1i8(<1 x i8>* noalias sret align 1 %{{.*}}, <1 x i8>* %0) // CHECK-VECTOR-LABEL: define <1 x i8> @pass_v1i8(<1 x i8> %{{.*}}) v2i8 pass_v2i8(v2i8 arg) { return arg; } -// CHECK-LABEL: define void @pass_v2i8(<2 x i8>* noalias sret %{{.*}}, <2 x i8>* %0) +// CHECK-LABEL: define void @pass_v2i8(<2 x i8>* noalias sret align 2 %{{.*}}, <2 x i8>* %0) // CHECK-VECTOR-LABEL: define <2 x i8> @pass_v2i8(<2 x i8> %{{.*}}) v4i8 pass_v4i8(v4i8 arg) { return arg; } -// CHECK-LABEL: define void @pass_v4i8(<4 x i8>* noalias sret %{{.*}}, <4 x i8>* %0) +// CHECK-LABEL: define void @pass_v4i8(<4 x i8>* noalias sret align 4 %{{.*}}, <4 x i8>* %0) // CHECK-VECTOR-LABEL: define <4 x 
i8> @pass_v4i8(<4 x i8> %{{.*}}) v8i8 pass_v8i8(v8i8 arg) { return arg; } -// CHECK-LABEL: define void @pass_v8i8(<8 x i8>* noalias sret %{{.*}}, <8 x i8>* %0) +// CHECK-LABEL: define void @pass_v8i8(<8 x i8>* noalias sret align 8 %{{.*}}, <8 x i8>* %0) // CHECK-VECTOR-LABEL: define <8 x i8> @pass_v8i8(<8 x i8> %{{.*}}) v16i8 pass_v16i8(v16i8 arg) { return arg; } -// CHECK-LABEL: define void @pass_v16i8(<16 x i8>* noalias sret %{{.*}}, <16 x i8>* %0) +// CHECK-LABEL: define void @pass_v16i8(<16 x i8>* noalias sret align 16 %{{.*}}, <16 x i8>* %0) // CHECK-VECTOR-LABEL: define <16 x i8> @pass_v16i8(<16 x i8> %{{.*}}) v32i8 pass_v32i8(v32i8 arg) { return arg; } -// CHECK-LABEL: define void @pass_v32i8(<32 x i8>* noalias sret %{{.*}}, <32 x i8>* %0) -// CHECK-VECTOR-LABEL: define void @pass_v32i8(<32 x i8>* noalias sret %{{.*}}, <32 x i8>* %0) +// CHECK-LABEL: define void @pass_v32i8(<32 x i8>* noalias sret align 32 %{{.*}}, <32 x i8>* %0) +// CHECK-VECTOR-LABEL: define void @pass_v32i8(<32 x i8>* noalias sret align 8 %{{.*}}, <32 x i8>* %0) v1i16 pass_v1i16(v1i16 arg) { return arg; } -// CHECK-LABEL: define void @pass_v1i16(<1 x i16>* noalias sret %{{.*}}, <1 x i16>* %0) +// CHECK-LABEL: define void @pass_v1i16(<1 x i16>* noalias sret align 2 %{{.*}}, <1 x i16>* %0) // CHECK-VECTOR-LABEL: define <1 x i16> @pass_v1i16(<1 x i16> %{{.*}}) v2i16 pass_v2i16(v2i16 arg) { return arg; } -// CHECK-LABEL: define void @pass_v2i16(<2 x i16>* noalias sret %{{.*}}, <2 x i16>* %0) +// CHECK-LABEL: define void @pass_v2i16(<2 x i16>* noalias sret align 4 %{{.*}}, <2 x i16>* %0) // CHECK-VECTOR-LABEL: define <2 x i16> @pass_v2i16(<2 x i16> %{{.*}}) v4i16 pass_v4i16(v4i16 arg) { return arg; } -// CHECK-LABEL: define void @pass_v4i16(<4 x i16>* noalias sret %{{.*}}, <4 x i16>* %0) +// CHECK-LABEL: define void @pass_v4i16(<4 x i16>* noalias sret align 8 %{{.*}}, <4 x i16>* %0) // CHECK-VECTOR-LABEL: define <4 x i16> @pass_v4i16(<4 x i16> %{{.*}}) v8i16 pass_v8i16(v8i16 arg) { return arg; } -// CHECK-LABEL: define void @pass_v8i16(<8 x i16>* noalias sret %{{.*}}, <8 x i16>* %0) +// CHECK-LABEL: define void @pass_v8i16(<8 x i16>* noalias sret align 16 %{{.*}}, <8 x i16>* %0) // CHECK-VECTOR-LABEL: define <8 x i16> @pass_v8i16(<8 x i16> %{{.*}}) v1i32 pass_v1i32(v1i32 arg) { return arg; } -// CHECK-LABEL: define void @pass_v1i32(<1 x i32>* noalias sret %{{.*}}, <1 x i32>* %0) +// CHECK-LABEL: define void @pass_v1i32(<1 x i32>* noalias sret align 4 %{{.*}}, <1 x i32>* %0) // CHECK-VECTOR-LABEL: define <1 x i32> @pass_v1i32(<1 x i32> %{{.*}}) v2i32 pass_v2i32(v2i32 arg) { return arg; } -// CHECK-LABEL: define void @pass_v2i32(<2 x i32>* noalias sret %{{.*}}, <2 x i32>* %0) +// CHECK-LABEL: define void @pass_v2i32(<2 x i32>* noalias sret align 8 %{{.*}}, <2 x i32>* %0) // CHECK-VECTOR-LABEL: define <2 x i32> @pass_v2i32(<2 x i32> %{{.*}}) v4i32 pass_v4i32(v4i32 arg) { return arg; } -// CHECK-LABEL: define void @pass_v4i32(<4 x i32>* noalias sret %{{.*}}, <4 x i32>* %0) +// CHECK-LABEL: define void @pass_v4i32(<4 x i32>* noalias sret align 16 %{{.*}}, <4 x i32>* %0) // CHECK-VECTOR-LABEL: define <4 x i32> @pass_v4i32(<4 x i32> %{{.*}}) v1i64 pass_v1i64(v1i64 arg) { return arg; } -// CHECK-LABEL: define void @pass_v1i64(<1 x i64>* noalias sret %{{.*}}, <1 x i64>* %0) +// CHECK-LABEL: define void @pass_v1i64(<1 x i64>* noalias sret align 8 %{{.*}}, <1 x i64>* %0) // CHECK-VECTOR-LABEL: define <1 x i64> @pass_v1i64(<1 x i64> %{{.*}}) v2i64 pass_v2i64(v2i64 arg) { return arg; } -// CHECK-LABEL: define void @pass_v2i64(<2 x 
i64>* noalias sret %{{.*}}, <2 x i64>* %0) +// CHECK-LABEL: define void @pass_v2i64(<2 x i64>* noalias sret align 16 %{{.*}}, <2 x i64>* %0) // CHECK-VECTOR-LABEL: define <2 x i64> @pass_v2i64(<2 x i64> %{{.*}}) v1i128 pass_v1i128(v1i128 arg) { return arg; } -// CHECK-LABEL: define void @pass_v1i128(<1 x i128>* noalias sret %{{.*}}, <1 x i128>* %0) +// CHECK-LABEL: define void @pass_v1i128(<1 x i128>* noalias sret align 16 %{{.*}}, <1 x i128>* %0) // CHECK-VECTOR-LABEL: define <1 x i128> @pass_v1i128(<1 x i128> %{{.*}}) v1f32 pass_v1f32(v1f32 arg) { return arg; } -// CHECK-LABEL: define void @pass_v1f32(<1 x float>* noalias sret %{{.*}}, <1 x float>* %0) +// CHECK-LABEL: define void @pass_v1f32(<1 x float>* noalias sret align 4 %{{.*}}, <1 x float>* %0) // CHECK-VECTOR-LABEL: define <1 x float> @pass_v1f32(<1 x float> %{{.*}}) v2f32 pass_v2f32(v2f32 arg) { return arg; } -// CHECK-LABEL: define void @pass_v2f32(<2 x float>* noalias sret %{{.*}}, <2 x float>* %0) +// CHECK-LABEL: define void @pass_v2f32(<2 x float>* noalias sret align 8 %{{.*}}, <2 x float>* %0) // CHECK-VECTOR-LABEL: define <2 x float> @pass_v2f32(<2 x float> %{{.*}}) v4f32 pass_v4f32(v4f32 arg) { return arg; } -// CHECK-LABEL: define void @pass_v4f32(<4 x float>* noalias sret %{{.*}}, <4 x float>* %0) +// CHECK-LABEL: define void @pass_v4f32(<4 x float>* noalias sret align 16 %{{.*}}, <4 x float>* %0) // CHECK-VECTOR-LABEL: define <4 x float> @pass_v4f32(<4 x float> %{{.*}}) v1f64 pass_v1f64(v1f64 arg) { return arg; } -// CHECK-LABEL: define void @pass_v1f64(<1 x double>* noalias sret %{{.*}}, <1 x double>* %0) +// CHECK-LABEL: define void @pass_v1f64(<1 x double>* noalias sret align 8 %{{.*}}, <1 x double>* %0) // CHECK-VECTOR-LABEL: define <1 x double> @pass_v1f64(<1 x double> %{{.*}}) v2f64 pass_v2f64(v2f64 arg) { return arg; } -// CHECK-LABEL: define void @pass_v2f64(<2 x double>* noalias sret %{{.*}}, <2 x double>* %0) +// CHECK-LABEL: define void @pass_v2f64(<2 x double>* noalias sret align 16 %{{.*}}, <2 x double>* %0) // CHECK-VECTOR-LABEL: define <2 x double> @pass_v2f64(<2 x double> %{{.*}}) v1f128 pass_v1f128(v1f128 arg) { return arg; } -// CHECK-LABEL: define void @pass_v1f128(<1 x fp128>* noalias sret %{{.*}}, <1 x fp128>* %0) +// CHECK-LABEL: define void @pass_v1f128(<1 x fp128>* noalias sret align 16 %{{.*}}, <1 x fp128>* %0) // CHECK-VECTOR-LABEL: define <1 x fp128> @pass_v1f128(<1 x fp128> %{{.*}}) @@ -142,62 +142,62 @@ v1f128 pass_v1f128(v1f128 arg) { return arg; } struct agg_v1i8 { v1i8 a; }; struct agg_v1i8 pass_agg_v1i8(struct agg_v1i8 arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_v1i8(%struct.agg_v1i8* noalias sret %{{.*}}, i8 %{{.*}}) -// CHECK-VECTOR-LABEL: define void @pass_agg_v1i8(%struct.agg_v1i8* noalias sret %{{.*}}, <1 x i8> %{{.*}}) +// CHECK-LABEL: define void @pass_agg_v1i8(%struct.agg_v1i8* noalias sret align 1 %{{.*}}, i8 %{{.*}}) +// CHECK-VECTOR-LABEL: define void @pass_agg_v1i8(%struct.agg_v1i8* noalias sret align 1 %{{.*}}, <1 x i8> %{{.*}}) struct agg_v2i8 { v2i8 a; }; struct agg_v2i8 pass_agg_v2i8(struct agg_v2i8 arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_v2i8(%struct.agg_v2i8* noalias sret %{{.*}}, i16 %{{.*}}) -// CHECK-VECTOR-LABEL: define void @pass_agg_v2i8(%struct.agg_v2i8* noalias sret %{{.*}}, <2 x i8> %{{.*}}) +// CHECK-LABEL: define void @pass_agg_v2i8(%struct.agg_v2i8* noalias sret align 2 %{{.*}}, i16 %{{.*}}) +// CHECK-VECTOR-LABEL: define void @pass_agg_v2i8(%struct.agg_v2i8* noalias sret align 2 %{{.*}}, <2 x i8> %{{.*}}) struct 
agg_v4i8 { v4i8 a; }; struct agg_v4i8 pass_agg_v4i8(struct agg_v4i8 arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_v4i8(%struct.agg_v4i8* noalias sret %{{.*}}, i32 %{{.*}}) -// CHECK-VECTOR-LABEL: define void @pass_agg_v4i8(%struct.agg_v4i8* noalias sret %{{.*}}, <4 x i8> %{{.*}}) +// CHECK-LABEL: define void @pass_agg_v4i8(%struct.agg_v4i8* noalias sret align 4 %{{.*}}, i32 %{{.*}}) +// CHECK-VECTOR-LABEL: define void @pass_agg_v4i8(%struct.agg_v4i8* noalias sret align 4 %{{.*}}, <4 x i8> %{{.*}}) struct agg_v8i8 { v8i8 a; }; struct agg_v8i8 pass_agg_v8i8(struct agg_v8i8 arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_v8i8(%struct.agg_v8i8* noalias sret %{{.*}}, i64 %{{.*}}) -// CHECK-VECTOR-LABEL: define void @pass_agg_v8i8(%struct.agg_v8i8* noalias sret %{{.*}}, <8 x i8> %{{.*}}) +// CHECK-LABEL: define void @pass_agg_v8i8(%struct.agg_v8i8* noalias sret align 8 %{{.*}}, i64 %{{.*}}) +// CHECK-VECTOR-LABEL: define void @pass_agg_v8i8(%struct.agg_v8i8* noalias sret align 8 %{{.*}}, <8 x i8> %{{.*}}) struct agg_v16i8 { v16i8 a; }; struct agg_v16i8 pass_agg_v16i8(struct agg_v16i8 arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_v16i8(%struct.agg_v16i8* noalias sret %{{.*}}, %struct.agg_v16i8* %{{.*}}) -// CHECK-VECTOR-LABEL: define void @pass_agg_v16i8(%struct.agg_v16i8* noalias sret %{{.*}}, <16 x i8> %{{.*}}) +// CHECK-LABEL: define void @pass_agg_v16i8(%struct.agg_v16i8* noalias sret align 16 %{{.*}}, %struct.agg_v16i8* %{{.*}}) +// CHECK-VECTOR-LABEL: define void @pass_agg_v16i8(%struct.agg_v16i8* noalias sret align 8 %{{.*}}, <16 x i8> %{{.*}}) struct agg_v32i8 { v32i8 a; }; struct agg_v32i8 pass_agg_v32i8(struct agg_v32i8 arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_v32i8(%struct.agg_v32i8* noalias sret %{{.*}}, %struct.agg_v32i8* %{{.*}}) -// CHECK-VECTOR-LABEL: define void @pass_agg_v32i8(%struct.agg_v32i8* noalias sret %{{.*}}, %struct.agg_v32i8* %{{.*}}) +// CHECK-LABEL: define void @pass_agg_v32i8(%struct.agg_v32i8* noalias sret align 32 %{{.*}}, %struct.agg_v32i8* %{{.*}}) +// CHECK-VECTOR-LABEL: define void @pass_agg_v32i8(%struct.agg_v32i8* noalias sret align 8 %{{.*}}, %struct.agg_v32i8* %{{.*}}) // Verify that the following are *not* vector-like aggregate types struct agg_novector1 { v4i8 a; v4i8 b; }; struct agg_novector1 pass_agg_novector1(struct agg_novector1 arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_novector1(%struct.agg_novector1* noalias sret %{{.*}}, i64 %{{.*}}) -// CHECK-VECTOR-LABEL: define void @pass_agg_novector1(%struct.agg_novector1* noalias sret %{{.*}}, i64 %{{.*}}) +// CHECK-LABEL: define void @pass_agg_novector1(%struct.agg_novector1* noalias sret align 4 %{{.*}}, i64 %{{.*}}) +// CHECK-VECTOR-LABEL: define void @pass_agg_novector1(%struct.agg_novector1* noalias sret align 4 %{{.*}}, i64 %{{.*}}) struct agg_novector2 { v4i8 a; float b; }; struct agg_novector2 pass_agg_novector2(struct agg_novector2 arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_novector2(%struct.agg_novector2* noalias sret %{{.*}}, i64 %{{.*}}) -// CHECK-VECTOR-LABEL: define void @pass_agg_novector2(%struct.agg_novector2* noalias sret %{{.*}}, i64 %{{.*}}) +// CHECK-LABEL: define void @pass_agg_novector2(%struct.agg_novector2* noalias sret align 4 %{{.*}}, i64 %{{.*}}) +// CHECK-VECTOR-LABEL: define void @pass_agg_novector2(%struct.agg_novector2* noalias sret align 4 %{{.*}}, i64 %{{.*}}) struct agg_novector3 { v4i8 a; int : 0; }; struct agg_novector3 pass_agg_novector3(struct agg_novector3 arg) { return arg; 
} -// CHECK-LABEL: define void @pass_agg_novector3(%struct.agg_novector3* noalias sret %{{.*}}, i32 %{{.*}}) -// CHECK-VECTOR-LABEL: define void @pass_agg_novector3(%struct.agg_novector3* noalias sret %{{.*}}, i32 %{{.*}}) +// CHECK-LABEL: define void @pass_agg_novector3(%struct.agg_novector3* noalias sret align 4 %{{.*}}, i32 %{{.*}}) +// CHECK-VECTOR-LABEL: define void @pass_agg_novector3(%struct.agg_novector3* noalias sret align 4 %{{.*}}, i32 %{{.*}}) struct agg_novector4 { v4i8 a __attribute__((aligned (8))); }; struct agg_novector4 pass_agg_novector4(struct agg_novector4 arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_novector4(%struct.agg_novector4* noalias sret %{{.*}}, i64 %{{.*}}) -// CHECK-VECTOR-LABEL: define void @pass_agg_novector4(%struct.agg_novector4* noalias sret %{{.*}}, i64 %{{.*}}) +// CHECK-LABEL: define void @pass_agg_novector4(%struct.agg_novector4* noalias sret align 8 %{{.*}}, i64 %{{.*}}) +// CHECK-VECTOR-LABEL: define void @pass_agg_novector4(%struct.agg_novector4* noalias sret align 8 %{{.*}}, i64 %{{.*}}) // Accessing variable argument lists v1i8 va_v1i8(__builtin_va_list l) { return __builtin_va_arg(l, v1i8); } -// CHECK-LABEL: define void @va_v1i8(<1 x i8>* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-LABEL: define void @va_v1i8(<1 x i8>* noalias sret align 1 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -229,7 +229,7 @@ v1i8 va_v1i8(__builtin_va_list l) { return __builtin_va_arg(l, v1i8); } // CHECK-VECTOR: ret <1 x i8> [[RET]] v2i8 va_v2i8(__builtin_va_list l) { return __builtin_va_arg(l, v2i8); } -// CHECK-LABEL: define void @va_v2i8(<2 x i8>* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-LABEL: define void @va_v2i8(<2 x i8>* noalias sret align 2 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -261,7 +261,7 @@ v2i8 va_v2i8(__builtin_va_list l) { return __builtin_va_arg(l, v2i8); } // CHECK-VECTOR: ret <2 x i8> [[RET]] v4i8 va_v4i8(__builtin_va_list l) { return __builtin_va_arg(l, v4i8); } -// CHECK-LABEL: define void @va_v4i8(<4 x i8>* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-LABEL: define void @va_v4i8(<4 x i8>* noalias sret align 4 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -293,7 +293,7 @@ v4i8 va_v4i8(__builtin_va_list l) { return __builtin_va_arg(l, v4i8); } // CHECK-VECTOR: ret <4 x i8> [[RET]] v8i8 va_v8i8(__builtin_va_list l) { return __builtin_va_arg(l, v8i8); } -// CHECK-LABEL: define void @va_v8i8(<8 x i8>* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-LABEL: define void @va_v8i8(<8 x i8>* noalias sret align 8 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: 
[[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -325,7 +325,7 @@ v8i8 va_v8i8(__builtin_va_list l) { return __builtin_va_arg(l, v8i8); } // CHECK-VECTOR: ret <8 x i8> [[RET]] v16i8 va_v16i8(__builtin_va_list l) { return __builtin_va_arg(l, v16i8); } -// CHECK-LABEL: define void @va_v16i8(<16 x i8>* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-LABEL: define void @va_v16i8(<16 x i8>* noalias sret align 16 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -357,7 +357,7 @@ v16i8 va_v16i8(__builtin_va_list l) { return __builtin_va_arg(l, v16i8); } // CHECK-VECTOR: ret <16 x i8> [[RET]] v32i8 va_v32i8(__builtin_va_list l) { return __builtin_va_arg(l, v32i8); } -// CHECK-LABEL: define void @va_v32i8(<32 x i8>* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-LABEL: define void @va_v32i8(<32 x i8>* noalias sret align 32 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -379,7 +379,7 @@ v32i8 va_v32i8(__builtin_va_list l) { return __builtin_va_arg(l, v32i8); } // CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi <32 x i8>** [ [[REG_ADDR]], %{{.*}} ], [ [[MEM_ADDR]], %{{.*}} ] // CHECK: [[INDIRECT_ARG:%[^ ]+]] = load <32 x i8>*, <32 x i8>** [[VA_ARG_ADDR]] // CHECK: ret void -// CHECK-VECTOR-LABEL: define void @va_v32i8(<32 x i8>* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-VECTOR-LABEL: define void @va_v32i8(<32 x i8>* noalias sret align 8 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK-VECTOR: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK-VECTOR: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK-VECTOR: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -403,7 +403,7 @@ v32i8 va_v32i8(__builtin_va_list l) { return __builtin_va_arg(l, v32i8); } // CHECK-VECTOR: ret void struct agg_v1i8 va_agg_v1i8(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_v1i8); } -// CHECK-LABEL: define void @va_agg_v1i8(%struct.agg_v1i8* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-LABEL: define void @va_agg_v1i8(%struct.agg_v1i8* noalias sret align 1 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -424,7 +424,7 @@ struct agg_v1i8 va_agg_v1i8(__builtin_va_list l) { return __builtin_va_arg(l, st // CHECK: store i8* [[OVERFLOW_ARG_AREA2]], i8** [[OVERFLOW_ARG_AREA_PTR]] // CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi %struct.agg_v1i8* [ [[REG_ADDR]], %{{.*}} ], [ [[MEM_ADDR]], %{{.*}} ] // CHECK: ret void -// CHECK-VECTOR-LABEL: define void @va_agg_v1i8(%struct.agg_v1i8* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-VECTOR-LABEL: define void @va_agg_v1i8(%struct.agg_v1i8* noalias sret align 1 %{{.*}}, 
%struct.__va_list_tag* %{{.*}}) // CHECK-VECTOR: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 2 // CHECK-VECTOR: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load i8*, i8** [[OVERFLOW_ARG_AREA_PTR]] // CHECK-VECTOR: [[MEM_ADDR:%[^ ]+]] = bitcast i8* [[OVERFLOW_ARG_AREA]] to %struct.agg_v1i8* @@ -433,7 +433,7 @@ struct agg_v1i8 va_agg_v1i8(__builtin_va_list l) { return __builtin_va_arg(l, st // CHECK-VECTOR: ret void struct agg_v2i8 va_agg_v2i8(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_v2i8); } -// CHECK-LABEL: define void @va_agg_v2i8(%struct.agg_v2i8* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-LABEL: define void @va_agg_v2i8(%struct.agg_v2i8* noalias sret align 2 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -454,7 +454,7 @@ struct agg_v2i8 va_agg_v2i8(__builtin_va_list l) { return __builtin_va_arg(l, st // CHECK: store i8* [[OVERFLOW_ARG_AREA2]], i8** [[OVERFLOW_ARG_AREA_PTR]] // CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi %struct.agg_v2i8* [ [[REG_ADDR]], %{{.*}} ], [ [[MEM_ADDR]], %{{.*}} ] // CHECK: ret void -// CHECK-VECTOR-LABEL: define void @va_agg_v2i8(%struct.agg_v2i8* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-VECTOR-LABEL: define void @va_agg_v2i8(%struct.agg_v2i8* noalias sret align 2 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK-VECTOR: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 2 // CHECK-VECTOR: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load i8*, i8** [[OVERFLOW_ARG_AREA_PTR]] // CHECK-VECTOR: [[MEM_ADDR:%[^ ]+]] = bitcast i8* [[OVERFLOW_ARG_AREA]] to %struct.agg_v2i8* @@ -463,7 +463,7 @@ struct agg_v2i8 va_agg_v2i8(__builtin_va_list l) { return __builtin_va_arg(l, st // CHECK-VECTOR: ret void struct agg_v4i8 va_agg_v4i8(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_v4i8); } -// CHECK-LABEL: define void @va_agg_v4i8(%struct.agg_v4i8* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-LABEL: define void @va_agg_v4i8(%struct.agg_v4i8* noalias sret align 4 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -484,7 +484,7 @@ struct agg_v4i8 va_agg_v4i8(__builtin_va_list l) { return __builtin_va_arg(l, st // CHECK: store i8* [[OVERFLOW_ARG_AREA2]], i8** [[OVERFLOW_ARG_AREA_PTR]] // CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi %struct.agg_v4i8* [ [[REG_ADDR]], %{{.*}} ], [ [[MEM_ADDR]], %{{.*}} ] // CHECK: ret void -// CHECK-VECTOR-LABEL: define void @va_agg_v4i8(%struct.agg_v4i8* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-VECTOR-LABEL: define void @va_agg_v4i8(%struct.agg_v4i8* noalias sret align 4 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK-VECTOR: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 2 // CHECK-VECTOR: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load i8*, i8** [[OVERFLOW_ARG_AREA_PTR]] // CHECK-VECTOR: [[MEM_ADDR:%[^ ]+]] = bitcast i8* [[OVERFLOW_ARG_AREA]] to 
%struct.agg_v4i8* @@ -493,7 +493,7 @@ struct agg_v4i8 va_agg_v4i8(__builtin_va_list l) { return __builtin_va_arg(l, st // CHECK-VECTOR: ret void struct agg_v8i8 va_agg_v8i8(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_v8i8); } -// CHECK-LABEL: define void @va_agg_v8i8(%struct.agg_v8i8* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-LABEL: define void @va_agg_v8i8(%struct.agg_v8i8* noalias sret align 8 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -514,7 +514,7 @@ struct agg_v8i8 va_agg_v8i8(__builtin_va_list l) { return __builtin_va_arg(l, st // CHECK: store i8* [[OVERFLOW_ARG_AREA2]], i8** [[OVERFLOW_ARG_AREA_PTR]] // CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi %struct.agg_v8i8* [ [[REG_ADDR]], %{{.*}} ], [ [[MEM_ADDR]], %{{.*}} ] // CHECK: ret void -// CHECK-VECTOR-LABEL: define void @va_agg_v8i8(%struct.agg_v8i8* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-VECTOR-LABEL: define void @va_agg_v8i8(%struct.agg_v8i8* noalias sret align 8 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK-VECTOR: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 2 // CHECK-VECTOR: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load i8*, i8** [[OVERFLOW_ARG_AREA_PTR]] // CHECK-VECTOR: [[MEM_ADDR:%[^ ]+]] = bitcast i8* [[OVERFLOW_ARG_AREA]] to %struct.agg_v8i8* @@ -523,7 +523,7 @@ struct agg_v8i8 va_agg_v8i8(__builtin_va_list l) { return __builtin_va_arg(l, st // CHECK-VECTOR: ret void struct agg_v16i8 va_agg_v16i8(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_v16i8); } -// CHECK-LABEL: define void @va_agg_v16i8(%struct.agg_v16i8* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-LABEL: define void @va_agg_v16i8(%struct.agg_v16i8* noalias sret align 16 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -545,7 +545,7 @@ struct agg_v16i8 va_agg_v16i8(__builtin_va_list l) { return __builtin_va_arg(l, // CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi %struct.agg_v16i8** [ [[REG_ADDR]], %{{.*}} ], [ [[MEM_ADDR]], %{{.*}} ] // CHECK: [[INDIRECT_ARG:%[^ ]+]] = load %struct.agg_v16i8*, %struct.agg_v16i8** [[VA_ARG_ADDR]] // CHECK: ret void -// CHECK-VECTOR-LABEL: define void @va_agg_v16i8(%struct.agg_v16i8* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-VECTOR-LABEL: define void @va_agg_v16i8(%struct.agg_v16i8* noalias sret align 8 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK-VECTOR: [[OVERFLOW_ARG_AREA_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 2 // CHECK-VECTOR: [[OVERFLOW_ARG_AREA:%[^ ]+]] = load i8*, i8** [[OVERFLOW_ARG_AREA_PTR]] // CHECK-VECTOR: [[MEM_ADDR:%[^ ]+]] = bitcast i8* [[OVERFLOW_ARG_AREA]] to %struct.agg_v16i8* @@ -554,7 +554,7 @@ struct agg_v16i8 va_agg_v16i8(__builtin_va_list l) { return __builtin_va_arg(l, // CHECK-VECTOR: ret void struct agg_v32i8 va_agg_v32i8(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_v32i8); } -// CHECK-LABEL: define void @va_agg_v32i8(%struct.agg_v32i8* noalias 
sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-LABEL: define void @va_agg_v32i8(%struct.agg_v32i8* noalias sret align 32 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -576,7 +576,7 @@ struct agg_v32i8 va_agg_v32i8(__builtin_va_list l) { return __builtin_va_arg(l, // CHECK: [[VA_ARG_ADDR:%[^ ]+]] = phi %struct.agg_v32i8** [ [[REG_ADDR]], %{{.*}} ], [ [[MEM_ADDR]], %{{.*}} ] // CHECK: [[INDIRECT_ARG:%[^ ]+]] = load %struct.agg_v32i8*, %struct.agg_v32i8** [[VA_ARG_ADDR]] // CHECK: ret void -// CHECK-VECTOR-LABEL: define void @va_agg_v32i8(%struct.agg_v32i8* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-VECTOR-LABEL: define void @va_agg_v32i8(%struct.agg_v32i8* noalias sret align 8 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK-VECTOR: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK-VECTOR: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK-VECTOR: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 diff --git a/clang/test/CodeGen/systemz-abi.c b/clang/test/CodeGen/systemz-abi.c index c46b9ec19626f..35adbbe301c47 100644 --- a/clang/test/CodeGen/systemz-abi.c +++ b/clang/test/CodeGen/systemz-abi.c @@ -36,7 +36,7 @@ long long pass_longlong(long long arg) { return arg; } // CHECK-LABEL: define i64 @pass_longlong(i64 %{{.*}}) __int128 pass_int128(__int128 arg) { return arg; } -// CHECK-LABEL: define void @pass_int128(i128* noalias sret %{{.*}}, i128* %0) +// CHECK-LABEL: define void @pass_int128(i128* noalias sret align 16 %{{.*}}, i128* %0) float pass_float(float arg) { return arg; } // CHECK-LABEL: define float @pass_float(float %{{.*}}) @@ -45,114 +45,114 @@ double pass_double(double arg) { return arg; } // CHECK-LABEL: define double @pass_double(double %{{.*}}) long double pass_longdouble(long double arg) { return arg; } -// CHECK-LABEL: define void @pass_longdouble(fp128* noalias sret %{{.*}}, fp128* %0) +// CHECK-LABEL: define void @pass_longdouble(fp128* noalias sret align 8 %{{.*}}, fp128* %0) // Complex types _Complex char pass_complex_char(_Complex char arg) { return arg; } -// CHECK-LABEL: define void @pass_complex_char({ i8, i8 }* noalias sret %{{.*}}, { i8, i8 }* %{{.*}}arg) +// CHECK-LABEL: define void @pass_complex_char({ i8, i8 }* noalias sret align 1 %{{.*}}, { i8, i8 }* %{{.*}}arg) _Complex short pass_complex_short(_Complex short arg) { return arg; } -// CHECK-LABEL: define void @pass_complex_short({ i16, i16 }* noalias sret %{{.*}}, { i16, i16 }* %{{.*}}arg) +// CHECK-LABEL: define void @pass_complex_short({ i16, i16 }* noalias sret align 2 %{{.*}}, { i16, i16 }* %{{.*}}arg) _Complex int pass_complex_int(_Complex int arg) { return arg; } -// CHECK-LABEL: define void @pass_complex_int({ i32, i32 }* noalias sret %{{.*}}, { i32, i32 }* %{{.*}}arg) +// CHECK-LABEL: define void @pass_complex_int({ i32, i32 }* noalias sret align 4 %{{.*}}, { i32, i32 }* %{{.*}}arg) _Complex long pass_complex_long(_Complex long arg) { return arg; } -// CHECK-LABEL: define void @pass_complex_long({ i64, i64 }* noalias sret %{{.*}}, { i64, i64 }* %{{.*}}arg) +// CHECK-LABEL: define void @pass_complex_long({ i64, i64 }* noalias sret align 8 %{{.*}}, { i64, i64 }* %{{.*}}arg) _Complex long long pass_complex_longlong(_Complex 
long long arg) { return arg; } -// CHECK-LABEL: define void @pass_complex_longlong({ i64, i64 }* noalias sret %{{.*}}, { i64, i64 }* %{{.*}}arg) +// CHECK-LABEL: define void @pass_complex_longlong({ i64, i64 }* noalias sret align 8 %{{.*}}, { i64, i64 }* %{{.*}}arg) _Complex float pass_complex_float(_Complex float arg) { return arg; } -// CHECK-LABEL: define void @pass_complex_float({ float, float }* noalias sret %{{.*}}, { float, float }* %{{.*}}arg) +// CHECK-LABEL: define void @pass_complex_float({ float, float }* noalias sret align 4 %{{.*}}, { float, float }* %{{.*}}arg) _Complex double pass_complex_double(_Complex double arg) { return arg; } -// CHECK-LABEL: define void @pass_complex_double({ double, double }* noalias sret %{{.*}}, { double, double }* %{{.*}}arg) +// CHECK-LABEL: define void @pass_complex_double({ double, double }* noalias sret align 8 %{{.*}}, { double, double }* %{{.*}}arg) _Complex long double pass_complex_longdouble(_Complex long double arg) { return arg; } -// CHECK-LABEL: define void @pass_complex_longdouble({ fp128, fp128 }* noalias sret %{{.*}}, { fp128, fp128 }* %{{.*}}arg) +// CHECK-LABEL: define void @pass_complex_longdouble({ fp128, fp128 }* noalias sret align 8 %{{.*}}, { fp128, fp128 }* %{{.*}}arg) // Aggregate types struct agg_1byte { char a[1]; }; struct agg_1byte pass_agg_1byte(struct agg_1byte arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_1byte(%struct.agg_1byte* noalias sret %{{.*}}, i8 %{{.*}}) +// CHECK-LABEL: define void @pass_agg_1byte(%struct.agg_1byte* noalias sret align 1 %{{.*}}, i8 %{{.*}}) struct agg_2byte { char a[2]; }; struct agg_2byte pass_agg_2byte(struct agg_2byte arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_2byte(%struct.agg_2byte* noalias sret %{{.*}}, i16 %{{.*}}) +// CHECK-LABEL: define void @pass_agg_2byte(%struct.agg_2byte* noalias sret align 1 %{{.*}}, i16 %{{.*}}) struct agg_3byte { char a[3]; }; struct agg_3byte pass_agg_3byte(struct agg_3byte arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_3byte(%struct.agg_3byte* noalias sret %{{.*}}, %struct.agg_3byte* %{{.*}}) +// CHECK-LABEL: define void @pass_agg_3byte(%struct.agg_3byte* noalias sret align 1 %{{.*}}, %struct.agg_3byte* %{{.*}}) struct agg_4byte { char a[4]; }; struct agg_4byte pass_agg_4byte(struct agg_4byte arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_4byte(%struct.agg_4byte* noalias sret %{{.*}}, i32 %{{.*}}) +// CHECK-LABEL: define void @pass_agg_4byte(%struct.agg_4byte* noalias sret align 1 %{{.*}}, i32 %{{.*}}) struct agg_5byte { char a[5]; }; struct agg_5byte pass_agg_5byte(struct agg_5byte arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_5byte(%struct.agg_5byte* noalias sret %{{.*}}, %struct.agg_5byte* %{{.*}}) +// CHECK-LABEL: define void @pass_agg_5byte(%struct.agg_5byte* noalias sret align 1 %{{.*}}, %struct.agg_5byte* %{{.*}}) struct agg_6byte { char a[6]; }; struct agg_6byte pass_agg_6byte(struct agg_6byte arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_6byte(%struct.agg_6byte* noalias sret %{{.*}}, %struct.agg_6byte* %{{.*}}) +// CHECK-LABEL: define void @pass_agg_6byte(%struct.agg_6byte* noalias sret align 1 %{{.*}}, %struct.agg_6byte* %{{.*}}) struct agg_7byte { char a[7]; }; struct agg_7byte pass_agg_7byte(struct agg_7byte arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_7byte(%struct.agg_7byte* noalias sret %{{.*}}, %struct.agg_7byte* %{{.*}}) +// CHECK-LABEL: define void @pass_agg_7byte(%struct.agg_7byte* noalias sret align 1 %{{.*}}, %struct.agg_7byte* %{{.*}}) 
struct agg_8byte { char a[8]; }; struct agg_8byte pass_agg_8byte(struct agg_8byte arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_8byte(%struct.agg_8byte* noalias sret %{{.*}}, i64 %{{.*}}) +// CHECK-LABEL: define void @pass_agg_8byte(%struct.agg_8byte* noalias sret align 1 %{{.*}}, i64 %{{.*}}) struct agg_16byte { char a[16]; }; struct agg_16byte pass_agg_16byte(struct agg_16byte arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_16byte(%struct.agg_16byte* noalias sret %{{.*}}, %struct.agg_16byte* %{{.*}}) +// CHECK-LABEL: define void @pass_agg_16byte(%struct.agg_16byte* noalias sret align 1 %{{.*}}, %struct.agg_16byte* %{{.*}}) // Float-like aggregate types struct agg_float { float a; }; struct agg_float pass_agg_float(struct agg_float arg) { return arg; } -// HARD-FLOAT-LABEL: define void @pass_agg_float(%struct.agg_float* noalias sret %{{.*}}, float %{{.*}}) -// SOFT-FLOAT-LABEL: define void @pass_agg_float(%struct.agg_float* noalias sret %{{.*}}, i32 %{{.*}}) +// HARD-FLOAT-LABEL: define void @pass_agg_float(%struct.agg_float* noalias sret align 4 %{{.*}}, float %{{.*}}) +// SOFT-FLOAT-LABEL: define void @pass_agg_float(%struct.agg_float* noalias sret align 4 %{{.*}}, i32 %{{.*}}) struct agg_double { double a; }; struct agg_double pass_agg_double(struct agg_double arg) { return arg; } -// HARD-FLOAT-LABEL: define void @pass_agg_double(%struct.agg_double* noalias sret %{{.*}}, double %{{.*}}) -// SOFT-FLOAT-LABEL: define void @pass_agg_double(%struct.agg_double* noalias sret %{{.*}}, i64 %{{.*}}) +// HARD-FLOAT-LABEL: define void @pass_agg_double(%struct.agg_double* noalias sret align 8 %{{.*}}, double %{{.*}}) +// SOFT-FLOAT-LABEL: define void @pass_agg_double(%struct.agg_double* noalias sret align 8 %{{.*}}, i64 %{{.*}}) struct agg_longdouble { long double a; }; struct agg_longdouble pass_agg_longdouble(struct agg_longdouble arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_longdouble(%struct.agg_longdouble* noalias sret %{{.*}}, %struct.agg_longdouble* %{{.*}}) +// CHECK-LABEL: define void @pass_agg_longdouble(%struct.agg_longdouble* noalias sret align 8 %{{.*}}, %struct.agg_longdouble* %{{.*}}) struct agg_float_a8 { float a __attribute__((aligned (8))); }; struct agg_float_a8 pass_agg_float_a8(struct agg_float_a8 arg) { return arg; } -// HARD-FLOAT-LABEL: define void @pass_agg_float_a8(%struct.agg_float_a8* noalias sret %{{.*}}, double %{{.*}}) -// SOFT-FLOAT-LABEL: define void @pass_agg_float_a8(%struct.agg_float_a8* noalias sret %{{.*}}, i64 %{{.*}}) +// HARD-FLOAT-LABEL: define void @pass_agg_float_a8(%struct.agg_float_a8* noalias sret align 8 %{{.*}}, double %{{.*}}) +// SOFT-FLOAT-LABEL: define void @pass_agg_float_a8(%struct.agg_float_a8* noalias sret align 8 %{{.*}}, i64 %{{.*}}) struct agg_float_a16 { float a __attribute__((aligned (16))); }; struct agg_float_a16 pass_agg_float_a16(struct agg_float_a16 arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_float_a16(%struct.agg_float_a16* noalias sret %{{.*}}, %struct.agg_float_a16* %{{.*}}) +// CHECK-LABEL: define void @pass_agg_float_a16(%struct.agg_float_a16* noalias sret align 16 %{{.*}}, %struct.agg_float_a16* %{{.*}}) // Verify that the following are *not* float-like aggregate types struct agg_nofloat1 { float a; float b; }; struct agg_nofloat1 pass_agg_nofloat1(struct agg_nofloat1 arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_nofloat1(%struct.agg_nofloat1* noalias sret %{{.*}}, i64 %{{.*}}) +// CHECK-LABEL: define void @pass_agg_nofloat1(%struct.agg_nofloat1* noalias 
sret align 4 %{{.*}}, i64 %{{.*}}) struct agg_nofloat2 { float a; int b; }; struct agg_nofloat2 pass_agg_nofloat2(struct agg_nofloat2 arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_nofloat2(%struct.agg_nofloat2* noalias sret %{{.*}}, i64 %{{.*}}) +// CHECK-LABEL: define void @pass_agg_nofloat2(%struct.agg_nofloat2* noalias sret align 4 %{{.*}}, i64 %{{.*}}) struct agg_nofloat3 { float a; int : 0; }; struct agg_nofloat3 pass_agg_nofloat3(struct agg_nofloat3 arg) { return arg; } -// CHECK-LABEL: define void @pass_agg_nofloat3(%struct.agg_nofloat3* noalias sret %{{.*}}, i32 %{{.*}}) +// CHECK-LABEL: define void @pass_agg_nofloat3(%struct.agg_nofloat3* noalias sret align 4 %{{.*}}, i32 %{{.*}}) // Accessing variable argument lists @@ -257,7 +257,7 @@ double va_double(__builtin_va_list l) { return __builtin_va_arg(l, double); } // CHECK: ret double [[RET]] long double va_longdouble(__builtin_va_list l) { return __builtin_va_arg(l, long double); } -// CHECK-LABEL: define void @va_longdouble(fp128* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}}) +// CHECK-LABEL: define void @va_longdouble(fp128* noalias sret align 8 %{{.*}}, %struct.__va_list_tag* %{{.*}}) // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -283,7 +283,7 @@ long double va_longdouble(__builtin_va_list l) { return __builtin_va_arg(l, long // CHECK: ret void _Complex char va_complex_char(__builtin_va_list l) { return __builtin_va_arg(l, _Complex char); } -// CHECK-LABEL: define void @va_complex_char({ i8, i8 }* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}} +// CHECK-LABEL: define void @va_complex_char({ i8, i8 }* noalias sret align 1 %{{.*}}, %struct.__va_list_tag* %{{.*}} // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -307,7 +307,7 @@ _Complex char va_complex_char(__builtin_va_list l) { return __builtin_va_arg(l, // CHECK: ret void struct agg_1byte va_agg_1byte(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_1byte); } -// CHECK-LABEL: define void @va_agg_1byte(%struct.agg_1byte* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}} +// CHECK-LABEL: define void @va_agg_1byte(%struct.agg_1byte* noalias sret align 1 %{{.*}}, %struct.__va_list_tag* %{{.*}} // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -330,7 +330,7 @@ struct agg_1byte va_agg_1byte(__builtin_va_list l) { return __builtin_va_arg(l, // CHECK: ret void struct agg_2byte va_agg_2byte(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_2byte); } -// CHECK-LABEL: define void @va_agg_2byte(%struct.agg_2byte* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}} +// CHECK-LABEL: define void @va_agg_2byte(%struct.agg_2byte* noalias sret align 1 %{{.*}}, %struct.__va_list_tag* %{{.*}} // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ 
]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -353,7 +353,7 @@ struct agg_2byte va_agg_2byte(__builtin_va_list l) { return __builtin_va_arg(l, // CHECK: ret void struct agg_3byte va_agg_3byte(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_3byte); } -// CHECK-LABEL: define void @va_agg_3byte(%struct.agg_3byte* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}} +// CHECK-LABEL: define void @va_agg_3byte(%struct.agg_3byte* noalias sret align 1 %{{.*}}, %struct.__va_list_tag* %{{.*}} // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -377,7 +377,7 @@ struct agg_3byte va_agg_3byte(__builtin_va_list l) { return __builtin_va_arg(l, // CHECK: ret void struct agg_4byte va_agg_4byte(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_4byte); } -// CHECK-LABEL: define void @va_agg_4byte(%struct.agg_4byte* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}} +// CHECK-LABEL: define void @va_agg_4byte(%struct.agg_4byte* noalias sret align 1 %{{.*}}, %struct.__va_list_tag* %{{.*}} // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -400,7 +400,7 @@ struct agg_4byte va_agg_4byte(__builtin_va_list l) { return __builtin_va_arg(l, // CHECK: ret void struct agg_8byte va_agg_8byte(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_8byte); } -// CHECK-LABEL: define void @va_agg_8byte(%struct.agg_8byte* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}} +// CHECK-LABEL: define void @va_agg_8byte(%struct.agg_8byte* noalias sret align 1 %{{.*}}, %struct.__va_list_tag* %{{.*}} // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -423,7 +423,7 @@ struct agg_8byte va_agg_8byte(__builtin_va_list l) { return __builtin_va_arg(l, // CHECK: ret void struct agg_float va_agg_float(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_float); } -// CHECK-LABEL: define void @va_agg_float(%struct.agg_float* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}} +// CHECK-LABEL: define void @va_agg_float(%struct.agg_float* noalias sret align 4 %{{.*}}, %struct.__va_list_tag* %{{.*}} // HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 1 // SOFT-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] @@ -449,7 +449,7 @@ struct agg_float va_agg_float(__builtin_va_list l) { return __builtin_va_arg(l, // CHECK: ret void struct agg_double va_agg_double(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_double); } -// CHECK-LABEL: define void @va_agg_double(%struct.agg_double* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}} +// CHECK-LABEL: define void @va_agg_double(%struct.agg_double* noalias sret align 8 %{{.*}}, %struct.__va_list_tag* %{{.*}} // HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, 
%struct.__va_list_tag* %{{.*}}, i32 0, i32 1 // SOFT-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] @@ -475,7 +475,7 @@ struct agg_double va_agg_double(__builtin_va_list l) { return __builtin_va_arg(l // CHECK: ret void struct agg_longdouble va_agg_longdouble(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_longdouble); } -// CHECK-LABEL: define void @va_agg_longdouble(%struct.agg_longdouble* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}} +// CHECK-LABEL: define void @va_agg_longdouble(%struct.agg_longdouble* noalias sret align 8 %{{.*}}, %struct.__va_list_tag* %{{.*}} // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -499,7 +499,7 @@ struct agg_longdouble va_agg_longdouble(__builtin_va_list l) { return __builtin_ // CHECK: ret void struct agg_float_a8 va_agg_float_a8(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_float_a8); } -// CHECK-LABEL: define void @va_agg_float_a8(%struct.agg_float_a8* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}} +// CHECK-LABEL: define void @va_agg_float_a8(%struct.agg_float_a8* noalias sret align 8 %{{.*}}, %struct.__va_list_tag* %{{.*}} // HARD-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 1 // SOFT-FLOAT: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] @@ -525,7 +525,7 @@ struct agg_float_a8 va_agg_float_a8(__builtin_va_list l) { return __builtin_va_a // CHECK: ret void struct agg_float_a16 va_agg_float_a16(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_float_a16); } -// CHECK-LABEL: define void @va_agg_float_a16(%struct.agg_float_a16* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}} +// CHECK-LABEL: define void @va_agg_float_a16(%struct.agg_float_a16* noalias sret align 16 %{{.*}}, %struct.__va_list_tag* %{{.*}} // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -549,7 +549,7 @@ struct agg_float_a16 va_agg_float_a16(__builtin_va_list l) { return __builtin_va // CHECK: ret void struct agg_nofloat1 va_agg_nofloat1(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_nofloat1); } -// CHECK-LABEL: define void @va_agg_nofloat1(%struct.agg_nofloat1* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}} +// CHECK-LABEL: define void @va_agg_nofloat1(%struct.agg_nofloat1* noalias sret align 4 %{{.*}}, %struct.__va_list_tag* %{{.*}} // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -572,7 +572,7 @@ struct agg_nofloat1 va_agg_nofloat1(__builtin_va_list l) { return __builtin_va_a // CHECK: ret void struct agg_nofloat2 va_agg_nofloat2(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_nofloat2); } -// CHECK-LABEL: define 
void @va_agg_nofloat2(%struct.agg_nofloat2* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}} +// CHECK-LABEL: define void @va_agg_nofloat2(%struct.agg_nofloat2* noalias sret align 4 %{{.*}}, %struct.__va_list_tag* %{{.*}} // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 @@ -595,7 +595,7 @@ struct agg_nofloat2 va_agg_nofloat2(__builtin_va_list l) { return __builtin_va_a // CHECK: ret void struct agg_nofloat3 va_agg_nofloat3(__builtin_va_list l) { return __builtin_va_arg(l, struct agg_nofloat3); } -// CHECK-LABEL: define void @va_agg_nofloat3(%struct.agg_nofloat3* noalias sret %{{.*}}, %struct.__va_list_tag* %{{.*}} +// CHECK-LABEL: define void @va_agg_nofloat3(%struct.agg_nofloat3* noalias sret align 4 %{{.*}}, %struct.__va_list_tag* %{{.*}} // CHECK: [[REG_COUNT_PTR:%[^ ]+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %{{.*}}, i32 0, i32 0 // CHECK: [[REG_COUNT:%[^ ]+]] = load i64, i64* [[REG_COUNT_PTR]] // CHECK: [[FITS_IN_REGS:%[^ ]+]] = icmp ult i64 [[REG_COUNT]], 5 diff --git a/clang/test/CodeGen/systemz-abi.cpp b/clang/test/CodeGen/systemz-abi.cpp index d3088f5299c35..cb381e88dd8f1 100644 --- a/clang/test/CodeGen/systemz-abi.cpp +++ b/clang/test/CodeGen/systemz-abi.cpp @@ -7,5 +7,5 @@ struct agg_float_cpp { float a; int : 0; }; struct agg_float_cpp pass_agg_float_cpp(struct agg_float_cpp arg) { return arg; } -// CHECK-LABEL: define void @_Z18pass_agg_float_cpp13agg_float_cpp(%struct.agg_float_cpp* noalias sret %{{.*}}, float %{{.*}}) -// SOFT-FLOAT: define void @_Z18pass_agg_float_cpp13agg_float_cpp(%struct.agg_float_cpp* noalias sret %{{.*}}, i32 %{{.*}}) +// CHECK-LABEL: define void @_Z18pass_agg_float_cpp13agg_float_cpp(%struct.agg_float_cpp* noalias sret align 4 %{{.*}}, float %{{.*}}) +// SOFT-FLOAT: define void @_Z18pass_agg_float_cpp13agg_float_cpp(%struct.agg_float_cpp* noalias sret align 4 %{{.*}}, i32 %{{.*}}) diff --git a/clang/test/CodeGen/systemz-inline-asm.c b/clang/test/CodeGen/systemz-inline-asm.c index 7c273dac579e8..2dc5023c55cb0 100644 --- a/clang/test/CodeGen/systemz-inline-asm.c +++ b/clang/test/CodeGen/systemz-inline-asm.c @@ -123,7 +123,7 @@ double test_f64(double f, double g) { long double test_f128(long double f, long double g) { asm("axbr %0, %2" : "=f" (f) : "0" (f), "f" (g)); return f; -// CHECK: define void @test_f128(fp128* noalias nocapture sret [[DEST:%.*]], fp128* nocapture readonly %0, fp128* nocapture readonly %1) +// CHECK: define void @test_f128(fp128* noalias nocapture sret align 8 [[DEST:%.*]], fp128* nocapture readonly %0, fp128* nocapture readonly %1) // CHECK: %f = load fp128, fp128* %0 // CHECK: %g = load fp128, fp128* %1 // CHECK: [[RESULT:%.*]] = tail call fp128 asm "axbr $0, $2", "=f,0,f"(fp128 %f, fp128 %g) diff --git a/clang/test/CodeGen/vectorcall.c b/clang/test/CodeGen/vectorcall.c index 77db600a7f89c..5e052990498f7 100644 --- a/clang/test/CodeGen/vectorcall.c +++ b/clang/test/CodeGen/vectorcall.c @@ -86,8 +86,8 @@ struct HVA4 __vectorcall hva6(struct HVA4 a, struct HVA4 b) { return b;} // X64: define dso_local x86_vectorcallcc %struct.HVA4 @"\01hva6@@128"(%struct.HVA4 inreg %a.coerce, %struct.HVA4* %b) struct HVA5 __vectorcall hva7() {struct HVA5 a = {}; return a;} -// X32: define dso_local x86_vectorcallcc void @"\01hva7@@0"(%struct.HVA5* inreg noalias sret %agg.result) -// X64: define dso_local 
x86_vectorcallcc void @"\01hva7@@0"(%struct.HVA5* noalias sret %agg.result) +// X32: define dso_local x86_vectorcallcc void @"\01hva7@@0"(%struct.HVA5* inreg noalias sret align 16 %agg.result) +// X64: define dso_local x86_vectorcallcc void @"\01hva7@@0"(%struct.HVA5* noalias sret align 16 %agg.result) v4f32 __vectorcall hva8(v4f32 a, v4f32 b, v4f32 c, v4f32 d, int e, v4f32 f) {return f;} // X32: define dso_local x86_vectorcallcc <4 x float> @"\01hva8@@84"(<4 x float> inreg %a, <4 x float> inreg %b, <4 x float> inreg %c, <4 x float> inreg %d, i32 inreg %e, <4 x float> inreg %f) diff --git a/clang/test/CodeGen/wasm-arguments.c b/clang/test/CodeGen/wasm-arguments.c index 25978d8a0990f..2f9d7e4b3ecae 100644 --- a/clang/test/CodeGen/wasm-arguments.c +++ b/clang/test/CodeGen/wasm-arguments.c @@ -25,9 +25,9 @@ typedef struct { void struct_arg(s1 i) {} // Structs should be returned sret and not simplified by the frontend. -// WEBASSEMBLY32: define void @struct_ret(%struct.s1* noalias sret %agg.result) +// WEBASSEMBLY32: define void @struct_ret(%struct.s1* noalias sret align 4 %agg.result) // WEBASSEMBLY32: ret void -// WEBASSEMBLY64: define void @struct_ret(%struct.s1* noalias sret %agg.result) +// WEBASSEMBLY64: define void @struct_ret(%struct.s1* noalias sret align 4 %agg.result) // WEBASSEMBLY64: ret void // Except with the experimental multivalue ABI, which returns structs by value @@ -103,9 +103,9 @@ union simple_union { void union_arg(union simple_union s) {} // Unions should be returned sret and not simplified by the frontend. -// WEBASSEMBLY32: define void @union_ret(%union.simple_union* noalias sret %agg.result) +// WEBASSEMBLY32: define void @union_ret(%union.simple_union* noalias sret align 4 %agg.result) // WEBASSEMBLY32: ret void -// WEBASSEMBLY64: define void @union_ret(%union.simple_union* noalias sret %agg.result) +// WEBASSEMBLY64: define void @union_ret(%union.simple_union* noalias sret align 4 %agg.result) // WEBASSEMBLY64: ret void // The experimental multivalue ABI returns them by value, though. @@ -129,8 +129,8 @@ typedef struct { void bitfield_arg(bitfield1 bf1) {} // And returned via sret pointers. -// WEBASSEMBLY32: define void @bitfield_ret(%struct.bitfield1* noalias sret %agg.result) -// WEBASSEMBLY64: define void @bitfield_ret(%struct.bitfield1* noalias sret %agg.result) +// WEBASSEMBLY32: define void @bitfield_ret(%struct.bitfield1* noalias sret align 4 %agg.result) +// WEBASSEMBLY64: define void @bitfield_ret(%struct.bitfield1* noalias sret align 4 %agg.result) // Except, of course, in the experimental multivalue ABI // EXPERIMENTAL-MV: define %struct.bitfield1 @bitfield_ret() diff --git a/clang/test/CodeGen/wasm-varargs.c b/clang/test/CodeGen/wasm-varargs.c index 23506875ac9d7..ba1f2d632b4ec 100644 --- a/clang/test/CodeGen/wasm-varargs.c +++ b/clang/test/CodeGen/wasm-varargs.c @@ -80,7 +80,7 @@ struct S test_struct(char *fmt, ...) { return v; } -// CHECK: define void @test_struct([[STRUCT_S:%[^,=]+]]*{{.*}} noalias sret [[AGG_RESULT:%.*]], i8*{{.*}} %fmt, ...) {{.*}} { +// CHECK: define void @test_struct([[STRUCT_S:%[^,=]+]]*{{.*}} noalias sret align 4 [[AGG_RESULT:%.*]], i8*{{.*}} %fmt, ...) {{.*}} { // CHECK: [[FMT_ADDR:%[^,=]+]] = alloca i8*, align 4 // CHECK-NEXT: [[VA:%[^,=]+]] = alloca i8*, align 4 // CHECK-NEXT: store i8* %fmt, i8** [[FMT_ADDR]], align 4 @@ -112,7 +112,7 @@ struct S test_empty_struct(char *fmt, ...) { return v; } -// CHECK: define void @test_empty_struct([[STRUCT_S:%[^,=]+]]*{{.*}} noalias sret [[AGG_RESULT:%.*]], i8*{{.*}} %fmt, ...) 
{{.*}} { +// CHECK: define void @test_empty_struct([[STRUCT_S:%[^,=]+]]*{{.*}} noalias sret align 4 [[AGG_RESULT:%.*]], i8*{{.*}} %fmt, ...) {{.*}} { // CHECK: [[FMT_ADDR:%[^,=]+]] = alloca i8*, align 4 // CHECK-NEXT: [[VA:%[^,=]+]] = alloca i8*, align 4 // CHECK-NEXT: [[U:%[^,=]+]] = alloca [[STRUCT_Z:%[^,=]+]], align 1 diff --git a/clang/test/CodeGen/windows-struct-abi.c b/clang/test/CodeGen/windows-struct-abi.c index 5ffc4fad64730..9fa175f136587 100644 --- a/clang/test/CodeGen/windows-struct-abi.c +++ b/clang/test/CodeGen/windows-struct-abi.c @@ -34,7 +34,7 @@ struct f4 { struct f4 return_f4(void) { while (1); } -// CHECK: define dso_local void @return_f4(%struct.f4* noalias sret %agg.result) +// CHECK: define dso_local void @return_f4(%struct.f4* noalias sret align 4 %agg.result) void receive_f4(struct f4 a0) { } diff --git a/clang/test/CodeGen/x86_32-arguments-darwin.c b/clang/test/CodeGen/x86_32-arguments-darwin.c index 71b8a2b9fc848..c88c1b8603b67 100644 --- a/clang/test/CodeGen/x86_32-arguments-darwin.c +++ b/clang/test/CodeGen/x86_32-arguments-darwin.c @@ -71,7 +71,7 @@ struct s10 { // Small vectors and 1 x {i64,double} are returned in registers // CHECK: i32 @f11() -// CHECK: void @f12(<2 x i32>* noalias sret %agg.result) +// CHECK: void @f12(<2 x i32>* noalias sret align 8 %agg.result) // CHECK: i64 @f13() // CHECK: i64 @f14() // CHECK: <2 x i64> @f15() @@ -93,11 +93,11 @@ T16 f16(void) { while (1) {} } // 128-bits). // CHECK: i32 @f17() -// CHECK: void @f18(%{{.*}}* noalias sret %agg.result) -// CHECK: void @f19(%{{.*}}* noalias sret %agg.result) -// CHECK: void @f20(%{{.*}}* noalias sret %agg.result) -// CHECK: void @f21(%{{.*}}* noalias sret %agg.result) -// CHECK: void @f22(%{{.*}}* noalias sret %agg.result) +// CHECK: void @f18(%{{.*}}* noalias sret align 8 %agg.result) +// CHECK: void @f19(%{{.*}}* noalias sret align 8 %agg.result) +// CHECK: void @f20(%{{.*}}* noalias sret align 8 %agg.result) +// CHECK: void @f21(%{{.*}}* noalias sret align 16 %agg.result) +// CHECK: void @f22(%{{.*}}* noalias sret align 16 %agg.result) struct { T11 a; } f17(void) { while (1) {} } struct { T12 a; } f18(void) { while (1) {} } struct { T13 a; } f19(void) { while (1) {} } @@ -116,11 +116,11 @@ struct { struct {} a; struct { float a[1]; } b; } f25(void) { while (1) {} } // Small structures are handled recursively // CHECK: i32 @f26() -// CHECK: void @f27(%struct.s27* noalias sret %agg.result) +// CHECK: void @f27(%struct.s27* noalias sret align 1 %agg.result) struct s26 { struct { char a, b; } a; struct { char a, b; } b; } f26(void) { while (1) {} } struct s27 { struct { char a, b, c; } a; struct { char a; } b; } f27(void) { while (1) {} } -// CHECK: void @f28(%struct.s28* noalias sret %agg.result) +// CHECK: void @f28(%struct.s28* noalias sret align 4 %agg.result) struct s28 { int a; int b[]; } f28(void) { while (1) {} } // CHECK-LABEL: define i16 @f29() @@ -150,7 +150,7 @@ struct s36 { struct { int : 0; } a[2][10]; char b; char c; } f36(void) { while ( // CHECK-LABEL: define float @f37() struct s37 { float c[1][1]; } f37(void) { while (1) {} } -// CHECK-LABEL: define void @f38(%struct.s38* noalias sret %agg.result) +// CHECK-LABEL: define void @f38(%struct.s38* noalias sret align 2 %agg.result) struct s38 { char a[3]; short b; } f38(void) { while (1) {} } // CHECK-LABEL: define void @f39(%struct.s39* byval(%struct.s39) align 16 %x) diff --git a/clang/test/CodeGen/x86_32-arguments-iamcu.c b/clang/test/CodeGen/x86_32-arguments-iamcu.c index e391c711ea101..a134f5d84a77b 100644 --- 
a/clang/test/CodeGen/x86_32-arguments-iamcu.c +++ b/clang/test/CodeGen/x86_32-arguments-iamcu.c @@ -58,7 +58,7 @@ st4_t retSmallStruct(st4_t r) { return r; } // CHECK-LABEL: define i64 @retPaddedStruct(i32 %r.coerce0, i32 %r.coerce1) st5_t retPaddedStruct(st5_t r) { return r; } -// CHECK-LABEL: define void @retLargeStruct(%struct.st12_t* noalias sret %agg.result, i32 %i1, %struct.st12_t* byval(%struct.st12_t) align 4 %r) +// CHECK-LABEL: define void @retLargeStruct(%struct.st12_t* noalias sret align 4 %agg.result, i32 %i1, %struct.st12_t* byval(%struct.st12_t) align 4 %r) st12_t retLargeStruct(int i1, st12_t r) { return r; } // CHECK-LABEL: define i32 @varArgs(i32 %i1, ...) diff --git a/clang/test/CodeGen/x86_64-arguments-nacl.c b/clang/test/CodeGen/x86_64-arguments-nacl.c index ea4483422dfe2..e7287a90765bd 100644 --- a/clang/test/CodeGen/x86_64-arguments-nacl.c +++ b/clang/test/CodeGen/x86_64-arguments-nacl.c @@ -61,7 +61,7 @@ void f12_1(struct s12 a0) {} // Check that sret parameter is accounted for when checking available integer // registers. -// CHECK: define void @f13(%struct.s13_0* noalias sret %agg.result, i32 %a, i32 %b, i32 %c, i32 %d, {{.*}}* byval({{.*}}) align 8 %e, i32 %f) +// CHECK: define void @f13(%struct.s13_0* noalias sret align 8 %agg.result, i32 %a, i32 %b, i32 %c, i32 %d, {{.*}}* byval({{.*}}) align 8 %e, i32 %f) struct s13_0 { long long f0[3]; }; struct s13_1 { long long f0[2]; }; diff --git a/clang/test/CodeGen/x86_64-arguments-win32.c b/clang/test/CodeGen/x86_64-arguments-win32.c index b43107c65ef64..4f7c4ded4b167 100644 --- a/clang/test/CodeGen/x86_64-arguments-win32.c +++ b/clang/test/CodeGen/x86_64-arguments-win32.c @@ -27,5 +27,5 @@ void f6(_Complex double a) {} // CHECK-LABEL: define dso_local i64 @f7() _Complex float f7() { return 1.0; } -// CHECK-LABEL: define dso_local void @f8({ double, double }* noalias sret %agg.result) +// CHECK-LABEL: define dso_local void @f8({ double, double }* noalias sret align 8 %agg.result) _Complex double f8() { return 1.0; } diff --git a/clang/test/CodeGen/x86_64-arguments.c b/clang/test/CodeGen/x86_64-arguments.c index 107571d8140bb..273b2706f10a9 100644 --- a/clang/test/CodeGen/x86_64-arguments.c +++ b/clang/test/CodeGen/x86_64-arguments.c @@ -47,7 +47,7 @@ void f7(e7 a0) { // Test merging/passing of upper eightbyte with X87 class. // -// CHECK-LABEL: define void @f8_1(%union.u8* noalias sret %agg.result) +// CHECK-LABEL: define void @f8_1(%union.u8* noalias sret align 16 %agg.result) // CHECK-LABEL: define void @f8_2(%union.u8* byval(%union.u8) align 16 %a0) union u8 { long double a; @@ -63,7 +63,7 @@ struct s9 { int a; int b; int : 0; } f9(void) { while (1) {} } struct s10 { int a; int b; int : 0; }; void f10(struct s10 a0) {} -// CHECK-LABEL: define void @f11(%union.anon* noalias sret %agg.result) +// CHECK-LABEL: define void @f11(%union.anon* noalias sret align 16 %agg.result) union { long double a; float b; } f11() { while (1) {} } // CHECK-LABEL: define i32 @f12_0() @@ -74,7 +74,7 @@ void f12_1(struct s12 a0) {} // Check that sret parameter is accounted for when checking available integer // registers. 
-// CHECK: define void @f13(%struct.s13_0* noalias sret %agg.result, i32 %a, i32 %b, i32 %c, i32 %d, {{.*}}* byval({{.*}}) align 8 %e, i32 %f) +// CHECK: define void @f13(%struct.s13_0* noalias sret align 8 %agg.result, i32 %a, i32 %b, i32 %c, i32 %d, {{.*}}* byval({{.*}}) align 8 %e, i32 %f) struct s13_0 { long long f0[3]; }; struct s13_1 { long long f0[2]; }; diff --git a/clang/test/CodeGenCXX/arm-cc.cpp b/clang/test/CodeGenCXX/arm-cc.cpp index 6027746b9ae80..e738cd31fb544 100644 --- a/clang/test/CodeGenCXX/arm-cc.cpp +++ b/clang/test/CodeGenCXX/arm-cc.cpp @@ -16,5 +16,5 @@ void baz() { zed(a); } -// CHECK: declare void @_Z3fooPv(%class.SMLoc* sret, i8*) +// CHECK: declare void @_Z3fooPv(%class.SMLoc* sret align 4, i8*) // CHECK: declare void @_Z3zed5SMLoc(%class.SMLoc*) diff --git a/clang/test/CodeGenCXX/auto-var-init.cpp b/clang/test/CodeGenCXX/auto-var-init.cpp index 9cd71bdfd1a7d..9399ec4eca089 100644 --- a/clang/test/CodeGenCXX/auto-var-init.cpp +++ b/clang/test/CodeGenCXX/auto-var-init.cpp @@ -1610,5 +1610,24 @@ TEST_CUSTOM(doublevec32, double __attribute__((vector_size(32))), { 3.141592653 // CHECK-NEXT: store <4 x double> , <4 x double>* %custom, align [[ALIGN]] // CHECK-NEXT: call void @{{.*}}used{{.*}}%custom) +// TODO: This vector has tail padding +TEST_UNINIT(doublevec24, double __attribute__((vector_size(24)))); +// CHECK-LABEL: @test_doublevec24_uninit() +// CHECK: %uninit = alloca <3 x double>, align +// CHECK-NEXT: call void @{{.*}}used{{.*}}%uninit) +// PATTERN-LABEL: @test_doublevec24_uninit() +// PATTERN: store <3 x double> , <3 x double>* %uninit, align 32 +// ZERO-LABEL: @test_doublevec24_uninit() +// ZERO: store <3 x double> zeroinitializer, <3 x double>* %uninit, align 32 + +// TODO: This vector has tail padding +TEST_UNINIT(longdoublevec32, long double __attribute__((vector_size(sizeof(long double)*2)))); +// CHECK-LABEL: @test_longdoublevec32_uninit() +// CHECK: %uninit = alloca <2 x x86_fp80>, align +// CHECK-NEXT: call void @{{.*}}used{{.*}}%uninit) +// PATTERN-LABEL: @test_longdoublevec32_uninit() +// PATTERN: store <2 x x86_fp80> , <2 x x86_fp80>* %uninit, align 32 +// ZERO-LABEL: @test_longdoublevec32_uninit() +// ZERO: store <2 x x86_fp80> zeroinitializer, <2 x x86_fp80>* %uninit, align 32 } // extern "C" diff --git a/clang/test/CodeGenCXX/builtin-source-location.cpp b/clang/test/CodeGenCXX/builtin-source-location.cpp index f8bfd7d940b91..cdc896209c85b 100644 --- a/clang/test/CodeGenCXX/builtin-source-location.cpp +++ b/clang/test/CodeGenCXX/builtin-source-location.cpp @@ -65,7 +65,7 @@ SL const_init_global = SL::current(); // // CHECK-GLOBAL-TWO: define internal void @__cxx_global_var_init() // CHECK-GLOBAL-TWO-NOT: ret -// CHECK-GLOBAL-TWO: call void @_ZN15source_location11bad_currentEjjPKcS1_(%struct.source_location* sret @runtime_init_global, +// CHECK-GLOBAL-TWO: call void @_ZN15source_location11bad_currentEjjPKcS1_(%struct.source_location* sret align 8 @runtime_init_global, // CHECK-GLOBAL-TWO-SAME: i32 1100, i32 {{[0-9]+}}, {{[^@]*}}@[[FILE]], {{[^@]*}}@[[FUNC]], #line 1100 "test_runtime_init.cpp" SL runtime_init_global = SL::bad_current(); @@ -77,7 +77,7 @@ extern "C" void test_function() { // CHECK-LOCAL-ONE-DAG: @[[FILE:.*]] = {{.*}}c"test_current.cpp\00" // CHECK-LOCAL-ONE-DAG: @[[FUNC:.*]] = {{.*}}c"test_function\00" // -// CHECK-LOCAL-ONE: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret %local, +// CHECK-LOCAL-ONE: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret align 8 %local, // 
CHECK-LOCAL-ONE-SAME: i32 2100, i32 {{[0-9]+}}, // CHECK-LOCAL-ONE-SAME: {{[^@]*}}@[[FILE]], {{[^@]*}}@[[FUNC]], #line 2100 "test_current.cpp" @@ -102,7 +102,7 @@ struct TestInit { // CHECK-CTOR-GLOBAL: define internal void @__cxx_global_var_init.{{[0-9]+}}() // CHECK-CTOR-GLOBAL-NOT: ret // -// CHECK-CTOR-GLOBAL: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret %[[TMP_ONE:[^,]*]], +// CHECK-CTOR-GLOBAL: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret align 8 %[[TMP_ONE:[^,]*]], // CHECK-CTOR-GLOBAL-SAME: i32 3400, i32 {{[0-9]+}}, {{[^@]*}}@[[FILE]], {{[^@]*}}@[[FUNC]], // CHECK-CTOR-GLOBAL-NEXT: call void @_ZN8TestInitC1E15source_location(%struct.TestInit* @GlobalInitVal, %struct.source_location* {{.*}}%[[TMP_ONE]]) #line 3400 "GlobalInitVal.cpp" @@ -117,7 +117,7 @@ extern "C" void test_init_function() { // CHECK-CTOR-LOCAL: define void @test_init_function() // CHECK-CTOR-LOCAL-NOT: ret // -// CHECK-CTOR-LOCAL: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret %[[TMP:[^,]*]], +// CHECK-CTOR-LOCAL: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret align 8 %[[TMP:[^,]*]], // CHECK-CTOR-LOCAL-SAME: i32 3500, i32 {{[0-9]+}}, {{[^@]*}}@[[FILE]], {{[^@]*}}@[[FUNC]], // CHECK-CTOR-LOCAL-NEXT: call void @_ZN8TestInitC1E15source_location(%struct.TestInit* %init_local, %struct.source_location* {{.*}}%[[TMP]]) #line 3500 "LocalInitVal.cpp" @@ -153,7 +153,7 @@ extern "C" void test_init_function_constexpr() { // CHECK-CONSTEXPR-LOCAL-DAG: @[[FILE:.*]] = {{.*}}c"ConstexprLocal.cpp\00" // // CHECK-CONSTEXPR-LOCAL: define void @test_init_function_constexpr() -// CHECK-CONSTEXPR-LOCAL: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret %[[TMP:[^,]*]], +// CHECK-CONSTEXPR-LOCAL: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret align 8 %[[TMP:[^,]*]], // CHECK-CONSTEXPR-LOCAL-SAME: i32 4600, i32 {{[0-9]+}}, {{[^@]*}}@[[FILE]], {{[^@]*}}@[[FUNC]] // CHECK-CONSTEXPR-LOCAL: call void @_ZN17TestInitConstexprC1E15source_location(%struct.TestInitConstexpr* %local_val, {{.*}}%[[TMP]]) #line 4600 "ConstexprLocal.cpp" @@ -189,7 +189,7 @@ extern "C" void test_agg_init() { // // CHECK-AGG-BRACE: define void @test_agg_init() // CHECK-AGG-BRACE: %[[I2:.*]] = getelementptr inbounds %struct.TestInitAgg, %struct.TestInitAgg* %local_brace_init, i32 0, i32 1 -// CHECK-AGG-BRACE-NEXT: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret %[[I2]], +// CHECK-AGG-BRACE-NEXT: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret align 8 %[[I2]], // CHECK-AGG-BRACE-SAME: i32 5700, i32 {{[0-9]+}}, {{[^@]*}}@[[FILE]], {{[^@]*}}@[[FUNC]] #line 5600 "BraceInitStart.cpp" TestInitAgg local_brace_init{ @@ -203,7 +203,7 @@ extern "C" void test_agg_init() { // // CHECK-AGG-EQUAL: define void @test_agg_init() // CHECK-AGG-EQUAL: %[[I2:.*]] = getelementptr inbounds %struct.TestInitAgg, %struct.TestInitAgg* %local_equal_init, i32 0, i32 1 -// CHECK-AGG-EQUAL-NEXT: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret %[[I2]], +// CHECK-AGG-EQUAL-NEXT: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret align 8 %[[I2]], // CHECK-AGG-EQUAL-SAME: i32 5900, i32 {{[0-9]+}}, {{[^@]*}}@[[FILE]], {{[^@]*}}@[[FUNC]] #line 5800 "EqualInitStart.cpp" TestInitAgg local_equal_init = @@ -220,11 +220,11 @@ extern "C" void test_agg_init() { // CHECK-AGG-LIST: define void 
@test_agg_init() // // CHECK-AGG-LIST: %[[I1:.*]] = getelementptr inbounds %struct.TestInitAgg, %struct.TestInitAgg* %local_list_init, i32 0, i32 0 -// CHECK-AGG-LIST-NEXT: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret %[[I1]], +// CHECK-AGG-LIST-NEXT: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret align 8 %[[I1]], // CHECK-AGG-LIST-SAME: i32 6100, i32 {{[0-9]+}}, {{[^@]*}}@[[FILE_ELEM]], {{[^@]*}}@[[FUNC]] // // CHECK-AGG-LIST: %[[I2:.*]] = getelementptr inbounds %struct.TestInitAgg, %struct.TestInitAgg* %local_list_init, i32 0, i32 1 -// CHECK-AGG-LIST-NEXT: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret %[[I2]], +// CHECK-AGG-LIST-NEXT: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret align 8 %[[I2]], // CHECK-AGG-LIST-SAME: i32 6200, i32 {{[0-9]+}}, {{[^@]*}}@[[FILE_DEFAULT]], {{[^@]*}}@[[FUNC]] #line 6000 "InitListStart.cpp" TestInitAgg local_list_init = @@ -258,7 +258,7 @@ void test_template() { // CHECK-TEMPL-NEXT: entry: // CHECK-TEMPL-NOT: ret // -// CHECK-TEMPL: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret %[[TMP:[^,]*]], +// CHECK-TEMPL: call void @_ZN15source_location7currentEjjPKcS1_(%struct.source_location* sret align 8 %[[TMP:[^,]*]], // CHECK-TEMPL-SAME: i32 7300, i32 {{[0-9]+}}, {{[^@]*}}@[[FILE]], {{[^@]*}}@[[FUNC]] #line 7300 "local_templ.cpp" TestTemplate local_templ; diff --git a/clang/test/CodeGenCXX/call-with-static-chain.cpp b/clang/test/CodeGenCXX/call-with-static-chain.cpp index ac1149b52eddf..17e676433e1a4 100644 --- a/clang/test/CodeGenCXX/call-with-static-chain.cpp +++ b/clang/test/CodeGenCXX/call-with-static-chain.cpp @@ -25,8 +25,8 @@ void test() { // CHECK64: call i32 bitcast (i32 (i64, i64, i64, i64, i64, i64, %struct.A*)* @f1 to i32 (i8*, i64, i64, i64, i64, i64, i64, %struct.A*)*)(i8* nest bitcast (i32 (i64, i64, i64, i64, i64, i64, %struct.A*)* @f1 to i8*) __builtin_call_with_static_chain(f1(a, a, a, a), f1); - // CHECK32: call void bitcast (void (%struct.B*)* @f2 to void (%struct.B*, i8*)*)(%struct.B* sret %{{[0-9a-z]+}}, i8* nest bitcast (void (%struct.B*)* @f2 to i8*)) - // CHECK64: call void bitcast (void (%struct.B*)* @f2 to void (%struct.B*, i8*)*)(%struct.B* sret %{{[0-9a-z]+}}, i8* nest bitcast (void (%struct.B*)* @f2 to i8*)) + // CHECK32: call void bitcast (void (%struct.B*)* @f2 to void (%struct.B*, i8*)*)(%struct.B* sret align 4 %{{[0-9a-z]+}}, i8* nest bitcast (void (%struct.B*)* @f2 to i8*)) + // CHECK64: call void bitcast (void (%struct.B*)* @f2 to void (%struct.B*, i8*)*)(%struct.B* sret align 8 %{{[0-9a-z]+}}, i8* nest bitcast (void (%struct.B*)* @f2 to i8*)) __builtin_call_with_static_chain(f2(), f2); // CHECK32: call i64 bitcast (i64 ()* @f3 to i64 (i8*)*)(i8* nest bitcast (i64 ()* @f3 to i8*)) diff --git a/clang/test/CodeGenCXX/conditional-gnu-ext.cpp b/clang/test/CodeGenCXX/conditional-gnu-ext.cpp index 613dd65ee7c96..ec6d097994183 100644 --- a/clang/test/CodeGenCXX/conditional-gnu-ext.cpp +++ b/clang/test/CodeGenCXX/conditional-gnu-ext.cpp @@ -94,7 +94,7 @@ namespace test3 { B test1() { // CHECK-LABEL: define void @_ZN5test35test1Ev( // CHECK: [[TEMP:%.*]] = alloca [[B]], - // CHECK: call void @_ZN5test312test1_helperEv([[B]]* sret [[TEMP]]) + // CHECK: call void @_ZN5test312test1_helperEv([[B]]* sret align 1 [[TEMP]]) // CHECK-NEXT: [[BOOL:%.*]] = call zeroext i1 @_ZN5test31BcvbEv([[B]]* [[TEMP]]) // CHECK-NEXT: br i1 [[BOOL]] // CHECK: call void @_ZN5test31BC1ERKS0_([[B]]* 
[[RESULT:%.*]], [[B]]* dereferenceable({{[0-9]+}}) [[TEMP]]) @@ -115,7 +115,7 @@ namespace test3 { // CHECK-NEXT: [[T0:%.*]] = load [[B]]*, [[B]]** [[X]] // CHECK-NEXT: [[BOOL:%.*]] = call zeroext i1 @_ZN5test31BcvbEv([[B]]* [[T0]]) // CHECK-NEXT: br i1 [[BOOL]] - // CHECK: call void @_ZN5test31BcvNS_1AEEv([[A:%.*]]* sret [[RESULT:%.*]], [[B]]* [[T0]]) + // CHECK: call void @_ZN5test31BcvNS_1AEEv([[A:%.*]]* sret align 1 [[RESULT:%.*]], [[B]]* [[T0]]) // CHECK-NEXT: br label // CHECK: call void @_ZN5test31AC1Ev([[A]]* [[RESULT]]) // CHECK-NEXT: br label @@ -126,10 +126,10 @@ namespace test3 { A test3() { // CHECK-LABEL: define void @_ZN5test35test3Ev( // CHECK: [[TEMP:%.*]] = alloca [[B]], - // CHECK: call void @_ZN5test312test3_helperEv([[B]]* sret [[TEMP]]) + // CHECK: call void @_ZN5test312test3_helperEv([[B]]* sret align 1 [[TEMP]]) // CHECK-NEXT: [[BOOL:%.*]] = call zeroext i1 @_ZN5test31BcvbEv([[B]]* [[TEMP]]) // CHECK-NEXT: br i1 [[BOOL]] - // CHECK: call void @_ZN5test31BcvNS_1AEEv([[A]]* sret [[RESULT:%.*]], [[B]]* [[TEMP]]) + // CHECK: call void @_ZN5test31BcvNS_1AEEv([[A]]* sret align 1 [[RESULT:%.*]], [[B]]* [[TEMP]]) // CHECK-NEXT: br label // CHECK: call void @_ZN5test31AC1Ev([[A]]* [[RESULT]]) // CHECK-NEXT: br label diff --git a/clang/test/CodeGenCXX/cxx1z-copy-omission.cpp b/clang/test/CodeGenCXX/cxx1z-copy-omission.cpp index b33a21808175a..dd821949772a7 100644 --- a/clang/test/CodeGenCXX/cxx1z-copy-omission.cpp +++ b/clang/test/CodeGenCXX/cxx1z-copy-omission.cpp @@ -19,7 +19,7 @@ void g() { // CHECK: %[[A:.*]] = alloca // CHECK-NOT: alloca // CHECK-NOT: call - // CHECK: call {{.*}} @_Z1fv({{.*}}* sret %[[A]]) + // CHECK: call {{.*}} @_Z1fv({{.*}}* sret align 4 %[[A]]) A a = A( A{ f() } ); // CHECK-NOT: call @@ -40,7 +40,7 @@ void h() { // CHECK-NOT: alloca // CHECK-NOT: call - // CHECK: call {{.*}} @_Z1fv({{.*}}* sret %[[A]]) + // CHECK: call {{.*}} @_Z1fv({{.*}}* sret align 4 %[[A]]) // CHECK-NOT: call // CHECK: call {{.*}} @_Z1f1A({{.*}}* %[[A]]) f(f()); diff --git a/clang/test/CodeGenCXX/cxx1z-lambda-star-this.cpp b/clang/test/CodeGenCXX/cxx1z-lambda-star-this.cpp index 114791c6558b3..fc13c197076f4 100644 --- a/clang/test/CodeGenCXX/cxx1z-lambda-star-this.cpp +++ b/clang/test/CodeGenCXX/cxx1z-lambda-star-this.cpp @@ -10,7 +10,7 @@ namespace ns1 { int X = A{}.foo()(); } //end ns1 -//CHECK: @"?foo@A@@QAE?A?@@XZ"(%struct.A* %this, %class.anon* noalias sret %[[A_LAMBDA_RETVAL:.*]]) +//CHECK: @"?foo@A@@QAE?A?@@XZ"(%struct.A* %this, %class.anon* noalias sret align 8 %[[A_LAMBDA_RETVAL:.*]]) // get the first object with the closure type, which is of type 'struct.A' //CHECK: %[[I0:.+]] = getelementptr inbounds %[[A_LAMBDA]], %[[A_LAMBDA]]* %[[A_LAMBDA_RETVAL]], i32 0, i32 0 //CHECK: %[[I1:.+]] = bitcast %struct.A* %[[I0]] to i8* @@ -26,6 +26,6 @@ struct B { namespace ns2 { int X = B{}.bar()(); } -//CHECK: @"?bar@B@@QAE?A?@@XZ"(%struct.B* %this, %class.anon.0* noalias sret %agg.result) +//CHECK: @"?bar@B@@QAE?A?@@XZ"(%struct.B* %this, %class.anon.0* noalias sret align 4 %agg.result) //CHECK: %[[I20:.+]] = getelementptr inbounds %class.anon.0, %class.anon.0* %agg.result, i32 0, i32 0 //CHECK: store %struct.B* %this1, %struct.B** %[[I20]], align 4 diff --git a/clang/test/CodeGenCXX/dbg-info-all-calls-described.cpp b/clang/test/CodeGenCXX/dbg-info-all-calls-described.cpp index 667c2469b55ea..e64e07cdb7485 100644 --- a/clang/test/CodeGenCXX/dbg-info-all-calls-described.cpp +++ b/clang/test/CodeGenCXX/dbg-info-all-calls-described.cpp @@ -15,22 +15,27 @@ // RUN: | FileCheck %s 
-check-prefix=HAS-ATTR \ // RUN: -implicit-check-not=DISubprogram -implicit-check-not=DIFlagAllCallsDescribed -// Supported: DWARF4 + GDB tuning by using '-femit-debug-entry-values' -// RUN: %clang_cc1 -femit-debug-entry-values -emit-llvm -triple x86_64-linux-gnu \ +// Note: DIFlagAllCallsDescribed may have been enabled prematurely when tuning +// for GDB under -gdwarf-4 in https://reviews.llvm.org/D69743. It's possible +// this should have been 'Unsupported' until entry values emission was enabled +// by default. +// +// Supported: DWARF4 + GDB tuning +// RUN: %clang_cc1 -emit-llvm -triple x86_64-linux-gnu \ // RUN: %s -o - -O1 -disable-llvm-passes -debugger-tuning=gdb \ // RUN: -debug-info-kind=standalone -dwarf-version=4 \ // RUN: | FileCheck %s -check-prefix=HAS-ATTR \ // RUN: -implicit-check-not=DIFlagAllCallsDescribed -// Supported: DWARF4 + LLDB tuning by using '-femit-debug-entry-values' -// RUN: %clang_cc1 -femit-debug-entry-values -emit-llvm -triple x86_64-linux-gnu \ +// Supported: DWARF4 + LLDB, -O1 +// RUN: %clang_cc1 -emit-llvm -triple x86_64-linux-gnu \ // RUN: %s -o - -O1 -disable-llvm-passes -debugger-tuning=lldb \ // RUN: -debug-info-kind=standalone -dwarf-version=4 \ // RUN: | FileCheck %s -check-prefix=HAS-ATTR \ // RUN: -implicit-check-not=DIFlagAllCallsDescribed -// Unsupported: -O0 + '-femit-debug-entry-values' -// RUN: %clang_cc1 -femit-debug-entry-values -emit-llvm -triple x86_64-linux-gnu \ +// Unsupported: -O0 +// RUN: %clang_cc1 -emit-llvm -triple x86_64-linux-gnu \ // RUN: %s -o - -O0 -disable-llvm-passes -debugger-tuning=gdb \ // RUN: -debug-info-kind=standalone -dwarf-version=4 \ // RUN: | FileCheck %s -check-prefix=NO-ATTR diff --git a/clang/test/CodeGenCXX/exceptions.cpp b/clang/test/CodeGenCXX/exceptions.cpp index 90db4023a9f4e..97bc94d78c1ac 100644 --- a/clang/test/CodeGenCXX/exceptions.cpp +++ b/clang/test/CodeGenCXX/exceptions.cpp @@ -146,12 +146,12 @@ namespace test1 { // CHECK: [[NEW:%.*]] = call noalias nonnull i8* @_Znwm(i64 8) // CHECK-NEXT: store i1 true, i1* [[ACTIVE]] // CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[NEW]] to [[A]]* - // CHECK-NEXT: invoke void @_ZN5test15makeBEv([[B:%.*]]* sret [[T0:%.*]]) + // CHECK-NEXT: invoke void @_ZN5test15makeBEv([[B:%.*]]* sret align 4 [[T0:%.*]]) // CHECK: [[T1:%.*]] = invoke i32 @_ZN5test11BcviEv([[B]]* [[T0]]) // CHECK: invoke void @_ZN5test11AC1Ei([[A]]* [[CAST]], i32 [[T1]]) // CHECK: store i1 false, i1* [[ACTIVE]] // CHECK-NEXT: store [[A]]* [[CAST]], [[A]]** [[X]], align 8 - // CHECK: invoke void @_ZN5test15makeBEv([[B:%.*]]* sret [[T2:%.*]]) + // CHECK: invoke void @_ZN5test15makeBEv([[B:%.*]]* sret align 4 [[T2:%.*]]) // CHECK: [[RET:%.*]] = load [[A]]*, [[A]]** [[X]], align 8 // CHECK98: invoke void @_ZN5test11BD1Ev([[B]]* [[T2]]) @@ -239,7 +239,7 @@ namespace test3 { // CHECK-NEXT: store i8* [[FOO]], i8** [[SAVED1]] // CHECK-NEXT: store i1 true, i1* [[CLEANUPACTIVE]] // CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[NEW]] to [[A]]* - // CHECK-NEXT: invoke void @_ZN5test35makeAEv([[A]]* sret [[CAST]]) + // CHECK-NEXT: invoke void @_ZN5test35makeAEv([[A]]* sret align 8 [[CAST]]) // CHECK: br label // -> cond.end new(foo(),10.0) A(makeA()) : diff --git a/clang/test/CodeGenCXX/homogeneous-aggregates.cpp b/clang/test/CodeGenCXX/homogeneous-aggregates.cpp index 05fb7f1d20a4b..51a4549d38d76 100644 --- a/clang/test/CodeGenCXX/homogeneous-aggregates.cpp +++ b/clang/test/CodeGenCXX/homogeneous-aggregates.cpp @@ -38,10 +38,10 @@ struct I2 : Base2 {}; struct I3 : Base2 {}; struct D5 : I1, I2, I3 {}; // homogeneous 
aggregate -// PPC: define void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, [3 x i64] %x.coerce) -// ARM32: define arm_aapcs_vfpcc void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, [3 x i64] %x.coerce) -// ARM64: define void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, %struct.D1* %x) -// X64: define dso_local x86_vectorcallcc void @"\01_Z7func_D12D1@@24"(%struct.D1* noalias sret %agg.result, %struct.D1* %x) +// PPC: define void @_Z7func_D12D1(%struct.D1* noalias sret align 8 %agg.result, [3 x i64] %x.coerce) +// ARM32: define arm_aapcs_vfpcc void @_Z7func_D12D1(%struct.D1* noalias sret align 8 %agg.result, [3 x i64] %x.coerce) +// ARM64: define void @_Z7func_D12D1(%struct.D1* noalias sret align 8 %agg.result, %struct.D1* %x) +// X64: define dso_local x86_vectorcallcc void @"\01_Z7func_D12D1@@24"(%struct.D1* noalias sret align 8 %agg.result, %struct.D1* %x) D1 CC func_D1(D1 x) { return x; } // PPC: define [3 x double] @_Z7func_D22D2([3 x double] %x.coerce) @@ -50,9 +50,9 @@ D1 CC func_D1(D1 x) { return x; } // X64: define dso_local x86_vectorcallcc %struct.D2 @"\01_Z7func_D22D2@@24"(%struct.D2 inreg %x.coerce) D2 CC func_D2(D2 x) { return x; } -// PPC: define void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, [4 x i64] %x.coerce) -// ARM32: define arm_aapcs_vfpcc void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, [4 x i64] %x.coerce) -// ARM64: define void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, %struct.D3* %x) +// PPC: define void @_Z7func_D32D3(%struct.D3* noalias sret align 8 %agg.result, [4 x i64] %x.coerce) +// ARM32: define arm_aapcs_vfpcc void @_Z7func_D32D3(%struct.D3* noalias sret align 8 %agg.result, [4 x i64] %x.coerce) +// ARM64: define void @_Z7func_D32D3(%struct.D3* noalias sret align 8 %agg.result, %struct.D3* %x) D3 CC func_D3(D3 x) { return x; } // PPC: define [4 x double] @_Z7func_D42D4([4 x double] %x.coerce) diff --git a/clang/test/CodeGenCXX/lambda-expressions.cpp b/clang/test/CodeGenCXX/lambda-expressions.cpp index 566132ad64e30..c75f84f038715 100644 --- a/clang/test/CodeGenCXX/lambda-expressions.cpp +++ b/clang/test/CodeGenCXX/lambda-expressions.cpp @@ -194,8 +194,8 @@ namespace pr28595 { // CHECK-NEXT: call i32 @"_ZZ1fvENK3$_6clEii" // CHECK-NEXT: ret i32 -// CHECK-LABEL: define internal void @"_ZZ1hvEN4$_118__invokeEv"(%struct.A* noalias sret %agg.result) {{.*}} { -// CHECK: call void @"_ZZ1hvENK4$_11clEv"(%struct.A* sret %agg.result, +// CHECK-LABEL: define internal void @"_ZZ1hvEN4$_118__invokeEv"(%struct.A* noalias sret align 1 %agg.result) {{.*}} { +// CHECK: call void @"_ZZ1hvENK4$_11clEv"(%struct.A* sret align 1 %agg.result, // CHECK-NEXT: ret void struct A { ~A(); }; void h() { diff --git a/clang/test/CodeGenCXX/microsoft-abi-byval-sret.cpp b/clang/test/CodeGenCXX/microsoft-abi-byval-sret.cpp index 2c940d22010bd..a92049c3a7996 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-byval-sret.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-byval-sret.cpp @@ -49,7 +49,7 @@ A B::qux(A x) { } // CHECK-LABEL: define dso_local x86_fastcallcc void @"?qux@B@@QAI?AUA@@U2@@Z" -// CHECK: (%struct.B* inreg %this, %struct.A* inreg noalias sret %agg.result, <{ %struct.A }>* inalloca %0) +// CHECK: (%struct.B* inreg %this, %struct.A* inreg noalias sret align 4 %agg.result, <{ %struct.A }>* inalloca %0) // CHECK: ret void int main() { @@ -67,4 +67,4 @@ int main() { // CHECK: call x86_stdcallcc %struct.A* @"?baz@B@@QAG?AUA@@U2@@Z" // CHECK: (<{ %struct.B*, %struct.A*, %struct.A }>* inalloca %{{[^,]*}}) // CHECK: call x86_fastcallcc 
void @"?qux@B@@QAI?AUA@@U2@@Z" -// CHECK: (%struct.B* inreg %{{[^,]*}}, %struct.A* inreg sret %{{.*}}, <{ %struct.A }>* inalloca %{{[^,]*}}) +// CHECK: (%struct.B* inreg %{{[^,]*}}, %struct.A* inreg sret align 4 %{{.*}}, <{ %struct.A }>* inalloca %{{[^,]*}}) diff --git a/clang/test/CodeGenCXX/microsoft-abi-byval-thunks.cpp b/clang/test/CodeGenCXX/microsoft-abi-byval-thunks.cpp index ed4e1fbb36fdb..0ca68cccb7906 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-byval-thunks.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-byval-thunks.cpp @@ -86,10 +86,10 @@ C::C() {} // force emission // CHECK32-NEXT: ret %"struct.sret_thunk::Agg"* %[[rv]] // CHECK64-LABEL: define linkonce_odr dso_local void @"?foo@C@sret_thunk@@W7EAA?AUAgg@2@U32@@Z" -// CHECK64: (%"struct.sret_thunk::C"* %this, %"struct.sret_thunk::Agg"* noalias sret %agg.result, %"struct.sret_thunk::Agg"* %x) +// CHECK64: (%"struct.sret_thunk::C"* %this, %"struct.sret_thunk::Agg"* noalias sret align 4 %agg.result, %"struct.sret_thunk::Agg"* %x) // CHECK64: getelementptr i8, i8* %{{.*}}, i32 -8 // CHECK64: call void @"?foo@C@sret_thunk@@UEAA?AUAgg@2@U32@@Z" -// CHECK64: (%"struct.sret_thunk::C"* %{{.*}}, %"struct.sret_thunk::Agg"* sret %agg.result, %"struct.sret_thunk::Agg"* %x) +// CHECK64: (%"struct.sret_thunk::C"* %{{.*}}, %"struct.sret_thunk::Agg"* sret align 4 %agg.result, %"struct.sret_thunk::Agg"* %x) // CHECK64-NOT: call // CHECK64: ret void } diff --git a/clang/test/CodeGenCXX/microsoft-abi-cdecl-method-sret.cpp b/clang/test/CodeGenCXX/microsoft-abi-cdecl-method-sret.cpp index 5a8bdf78100f4..534aa7f804695 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-cdecl-method-sret.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-cdecl-method-sret.cpp @@ -19,9 +19,9 @@ S C::variadic_sret(const char *f, ...) { return S(); } S C::cdecl_sret() { return S(); } S C::byval_and_sret(S a) { return S(); } -// CHECK: define dso_local void @"?variadic_sret@C@@QAA?AUS@@PBDZZ"(%struct.C* %this, %struct.S* noalias sret %agg.result, i8* %f, ...) -// CHECK: define dso_local void @"?cdecl_sret@C@@QAA?AUS@@XZ"(%struct.C* %this, %struct.S* noalias sret %agg.result) -// CHECK: define dso_local void @"?byval_and_sret@C@@QAA?AUS@@U2@@Z"(%struct.C* %this, %struct.S* noalias sret %agg.result, %struct.S* byval(%struct.S) align 4 %a) +// CHECK: define dso_local void @"?variadic_sret@C@@QAA?AUS@@PBDZZ"(%struct.C* %this, %struct.S* noalias sret align 4 %agg.result, i8* %f, ...) 
+// CHECK: define dso_local void @"?cdecl_sret@C@@QAA?AUS@@XZ"(%struct.C* %this, %struct.S* noalias sret align 4 %agg.result) +// CHECK: define dso_local void @"?byval_and_sret@C@@QAA?AUS@@U2@@Z"(%struct.C* %this, %struct.S* noalias sret align 4 %agg.result, %struct.S* byval(%struct.S) align 4 %a) int main() { C c; @@ -41,4 +41,4 @@ struct A { S A::f(int x) { return S(); } -// CHECK-LABEL: define dso_local x86_fastcallcc void @"?f@A@@QAI?AUS@@H@Z"(%struct.A* inreg %this, %struct.S* inreg noalias sret %agg.result, i32 %x) +// CHECK-LABEL: define dso_local x86_fastcallcc void @"?f@A@@QAI?AUS@@H@Z"(%struct.A* inreg %this, %struct.S* inreg noalias sret align 4 %agg.result, i32 %x) diff --git a/clang/test/CodeGenCXX/microsoft-abi-eh-cleanups.cpp b/clang/test/CodeGenCXX/microsoft-abi-eh-cleanups.cpp index 7e8619b8b0ecf..60fa5c7991119 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-eh-cleanups.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-eh-cleanups.cpp @@ -18,9 +18,9 @@ void HasEHCleanup() { // WIN32-LABEL: define dso_local void @"?HasEHCleanup@@YAXXZ"() {{.*}} { // WIN32: %[[base:.*]] = call i8* @llvm.stacksave() // If this call throws, we have to restore the stack. -// WIN32: call void @"?getA@@YA?AUA@@XZ"(%struct.A* sret %{{.*}}) +// WIN32: call void @"?getA@@YA?AUA@@XZ"(%struct.A* sret align 4 %{{.*}}) // If this call throws, we have to cleanup the first temporary. -// WIN32: invoke void @"?getA@@YA?AUA@@XZ"(%struct.A* sret %{{.*}}) +// WIN32: invoke void @"?getA@@YA?AUA@@XZ"(%struct.A* sret align 4 %{{.*}}) // If this call throws, we have to cleanup the stacksave. // WIN32: call i32 @"?TakesTwo@@YAHUA@@0@Z" // WIN32: call void @llvm.stackrestore diff --git a/clang/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp b/clang/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp index 9fb9f39cb0832..8c8d4b7383d63 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-sret-and-byval.cpp @@ -84,45 +84,45 @@ void call_bools_and_chars() { // Returning structs that fit into a register. Small small_return() { return Small(); } -// LINUX-LABEL: define void @_Z12small_returnv(%struct.Small* noalias sret %agg.result) +// LINUX-LABEL: define void @_Z12small_returnv(%struct.Small* noalias sret align 4 %agg.result) // WIN32: define dso_local i32 @"?small_return@@YA?AUSmall@@XZ"() // WIN64: define dso_local i32 @"?small_return@@YA?AUSmall@@XZ"() Medium medium_return() { return Medium(); } -// LINUX-LABEL: define void @_Z13medium_returnv(%struct.Medium* noalias sret %agg.result) +// LINUX-LABEL: define void @_Z13medium_returnv(%struct.Medium* noalias sret align 4 %agg.result) // WIN32: define dso_local i64 @"?medium_return@@YA?AUMedium@@XZ"() // WIN64: define dso_local i64 @"?medium_return@@YA?AUMedium@@XZ"() // Returning structs that fit into a register but are not POD. 
SmallCpp11NotCpp03Pod small_non_pod_return() { return SmallCpp11NotCpp03Pod(); } -// LINUX-LABEL: define void @_Z20small_non_pod_returnv(%struct.SmallCpp11NotCpp03Pod* noalias sret %agg.result) -// WIN32: define dso_local void @"?small_non_pod_return@@YA?AUSmallCpp11NotCpp03Pod@@XZ"(%struct.SmallCpp11NotCpp03Pod* noalias sret %agg.result) -// WIN64: define dso_local void @"?small_non_pod_return@@YA?AUSmallCpp11NotCpp03Pod@@XZ"(%struct.SmallCpp11NotCpp03Pod* noalias sret %agg.result) +// LINUX-LABEL: define void @_Z20small_non_pod_returnv(%struct.SmallCpp11NotCpp03Pod* noalias sret align 4 %agg.result) +// WIN32: define dso_local void @"?small_non_pod_return@@YA?AUSmallCpp11NotCpp03Pod@@XZ"(%struct.SmallCpp11NotCpp03Pod* noalias sret align 4 %agg.result) +// WIN64: define dso_local void @"?small_non_pod_return@@YA?AUSmallCpp11NotCpp03Pod@@XZ"(%struct.SmallCpp11NotCpp03Pod* noalias sret align 4 %agg.result) SmallWithCtor small_with_ctor_return() { return SmallWithCtor(); } -// LINUX-LABEL: define void @_Z22small_with_ctor_returnv(%struct.SmallWithCtor* noalias sret %agg.result) -// WIN32: define dso_local void @"?small_with_ctor_return@@YA?AUSmallWithCtor@@XZ"(%struct.SmallWithCtor* noalias sret %agg.result) -// WIN64: define dso_local void @"?small_with_ctor_return@@YA?AUSmallWithCtor@@XZ"(%struct.SmallWithCtor* noalias sret %agg.result) +// LINUX-LABEL: define void @_Z22small_with_ctor_returnv(%struct.SmallWithCtor* noalias sret align 4 %agg.result) +// WIN32: define dso_local void @"?small_with_ctor_return@@YA?AUSmallWithCtor@@XZ"(%struct.SmallWithCtor* noalias sret align 4 %agg.result) +// WIN64: define dso_local void @"?small_with_ctor_return@@YA?AUSmallWithCtor@@XZ"(%struct.SmallWithCtor* noalias sret align 4 %agg.result) // FIXME: The 'sret' mark here doesn't seem to be enough to convince LLVM to // preserve the hidden sret pointer in R0 across the function. 
-// WOA: define dso_local arm_aapcs_vfpcc void @"?small_with_ctor_return@@YA?AUSmallWithCtor@@XZ"(%struct.SmallWithCtor* noalias sret %agg.result) +// WOA: define dso_local arm_aapcs_vfpcc void @"?small_with_ctor_return@@YA?AUSmallWithCtor@@XZ"(%struct.SmallWithCtor* noalias sret align 4 %agg.result) SmallWithVftable small_with_vftable_return() { return SmallWithVftable(); } -// LINUX-LABEL: define void @_Z25small_with_vftable_returnv(%struct.SmallWithVftable* noalias sret %agg.result) -// WIN32: define dso_local void @"?small_with_vftable_return@@YA?AUSmallWithVftable@@XZ"(%struct.SmallWithVftable* noalias sret %agg.result) -// WIN64: define dso_local void @"?small_with_vftable_return@@YA?AUSmallWithVftable@@XZ"(%struct.SmallWithVftable* noalias sret %agg.result) +// LINUX-LABEL: define void @_Z25small_with_vftable_returnv(%struct.SmallWithVftable* noalias sret align 4 %agg.result) +// WIN32: define dso_local void @"?small_with_vftable_return@@YA?AUSmallWithVftable@@XZ"(%struct.SmallWithVftable* noalias sret align 4 %agg.result) +// WIN64: define dso_local void @"?small_with_vftable_return@@YA?AUSmallWithVftable@@XZ"(%struct.SmallWithVftable* noalias sret align 8 %agg.result) MediumWithCopyCtor medium_with_copy_ctor_return() { return MediumWithCopyCtor(); } -// LINUX-LABEL: define void @_Z28medium_with_copy_ctor_returnv(%struct.MediumWithCopyCtor* noalias sret %agg.result) -// WIN32: define dso_local void @"?medium_with_copy_ctor_return@@YA?AUMediumWithCopyCtor@@XZ"(%struct.MediumWithCopyCtor* noalias sret %agg.result) -// WIN64: define dso_local void @"?medium_with_copy_ctor_return@@YA?AUMediumWithCopyCtor@@XZ"(%struct.MediumWithCopyCtor* noalias sret %agg.result) -// WOA: define dso_local arm_aapcs_vfpcc void @"?medium_with_copy_ctor_return@@YA?AUMediumWithCopyCtor@@XZ"(%struct.MediumWithCopyCtor* noalias sret %agg.result) +// LINUX-LABEL: define void @_Z28medium_with_copy_ctor_returnv(%struct.MediumWithCopyCtor* noalias sret align 4 %agg.result) +// WIN32: define dso_local void @"?medium_with_copy_ctor_return@@YA?AUMediumWithCopyCtor@@XZ"(%struct.MediumWithCopyCtor* noalias sret align 4 %agg.result) +// WIN64: define dso_local void @"?medium_with_copy_ctor_return@@YA?AUMediumWithCopyCtor@@XZ"(%struct.MediumWithCopyCtor* noalias sret align 4 %agg.result) +// WOA: define dso_local arm_aapcs_vfpcc void @"?medium_with_copy_ctor_return@@YA?AUMediumWithCopyCtor@@XZ"(%struct.MediumWithCopyCtor* noalias sret align 4 %agg.result) // Returning a large struct that doesn't fit into a register. Big big_return() { return Big(); } -// LINUX-LABEL: define void @_Z10big_returnv(%struct.Big* noalias sret %agg.result) -// WIN32: define dso_local void @"?big_return@@YA?AUBig@@XZ"(%struct.Big* noalias sret %agg.result) -// WIN64: define dso_local void @"?big_return@@YA?AUBig@@XZ"(%struct.Big* noalias sret %agg.result) +// LINUX-LABEL: define void @_Z10big_returnv(%struct.Big* noalias sret align 4 %agg.result) +// WIN32: define dso_local void @"?big_return@@YA?AUBig@@XZ"(%struct.Big* noalias sret align 4 %agg.result) +// WIN64: define dso_local void @"?big_return@@YA?AUBig@@XZ"(%struct.Big* noalias sret align 4 %agg.result) void small_arg(Small s) {} @@ -181,7 +181,7 @@ void small_arg_with_dtor(SmallWithDtor s) {} // Test that the eligible non-aggregate is passed directly, but returned // indirectly on ARM64 Windows. 
-// WOA64: define dso_local void @"?small_arg_with_private_member@@YA?AUSmallWithPrivate@@U1@@Z"(%struct.SmallWithPrivate* inreg noalias sret %agg.result, i64 %s.coerce) {{.*}} { +// WOA64: define dso_local void @"?small_arg_with_private_member@@YA?AUSmallWithPrivate@@U1@@Z"(%struct.SmallWithPrivate* inreg noalias sret align 4 %agg.result, i64 %s.coerce) {{.*}} { SmallWithPrivate small_arg_with_private_member(SmallWithPrivate s) { return s; } void call_small_arg_with_dtor() { @@ -281,24 +281,24 @@ void pass_ref_field() { class Class { public: Small thiscall_method_small() { return Small(); } - // LINUX: define {{.*}} void @_ZN5Class21thiscall_method_smallEv(%struct.Small* noalias sret %agg.result, %class.Class* %this) - // WIN32: define {{.*}} x86_thiscallcc void @"?thiscall_method_small@Class@@QAE?AUSmall@@XZ"(%class.Class* %this, %struct.Small* noalias sret %agg.result) - // WIN64: define linkonce_odr dso_local void @"?thiscall_method_small@Class@@QEAA?AUSmall@@XZ"(%class.Class* %this, %struct.Small* noalias sret %agg.result) + // LINUX: define {{.*}} void @_ZN5Class21thiscall_method_smallEv(%struct.Small* noalias sret align 4 %agg.result, %class.Class* %this) + // WIN32: define {{.*}} x86_thiscallcc void @"?thiscall_method_small@Class@@QAE?AUSmall@@XZ"(%class.Class* %this, %struct.Small* noalias sret align 4 %agg.result) + // WIN64: define linkonce_odr dso_local void @"?thiscall_method_small@Class@@QEAA?AUSmall@@XZ"(%class.Class* %this, %struct.Small* noalias sret align 4 %agg.result) SmallWithCtor thiscall_method_small_with_ctor() { return SmallWithCtor(); } - // LINUX: define {{.*}} void @_ZN5Class31thiscall_method_small_with_ctorEv(%struct.SmallWithCtor* noalias sret %agg.result, %class.Class* %this) - // WIN32: define {{.*}} x86_thiscallcc void @"?thiscall_method_small_with_ctor@Class@@QAE?AUSmallWithCtor@@XZ"(%class.Class* %this, %struct.SmallWithCtor* noalias sret %agg.result) - // WIN64: define linkonce_odr dso_local void @"?thiscall_method_small_with_ctor@Class@@QEAA?AUSmallWithCtor@@XZ"(%class.Class* %this, %struct.SmallWithCtor* noalias sret %agg.result) + // LINUX: define {{.*}} void @_ZN5Class31thiscall_method_small_with_ctorEv(%struct.SmallWithCtor* noalias sret align 4 %agg.result, %class.Class* %this) + // WIN32: define {{.*}} x86_thiscallcc void @"?thiscall_method_small_with_ctor@Class@@QAE?AUSmallWithCtor@@XZ"(%class.Class* %this, %struct.SmallWithCtor* noalias sret align 4 %agg.result) + // WIN64: define linkonce_odr dso_local void @"?thiscall_method_small_with_ctor@Class@@QEAA?AUSmallWithCtor@@XZ"(%class.Class* %this, %struct.SmallWithCtor* noalias sret align 4 %agg.result) Small __cdecl cdecl_method_small() { return Small(); } - // LINUX: define {{.*}} void @_ZN5Class18cdecl_method_smallEv(%struct.Small* noalias sret %agg.result, %class.Class* %this) - // WIN32: define {{.*}} void @"?cdecl_method_small@Class@@QAA?AUSmall@@XZ"(%class.Class* %this, %struct.Small* noalias sret %agg.result) - // WIN64: define linkonce_odr dso_local void @"?cdecl_method_small@Class@@QEAA?AUSmall@@XZ"(%class.Class* %this, %struct.Small* noalias sret %agg.result) + // LINUX: define {{.*}} void @_ZN5Class18cdecl_method_smallEv(%struct.Small* noalias sret align 4 %agg.result, %class.Class* %this) + // WIN32: define {{.*}} void @"?cdecl_method_small@Class@@QAA?AUSmall@@XZ"(%class.Class* %this, %struct.Small* noalias sret align 4 %agg.result) + // WIN64: define linkonce_odr dso_local void @"?cdecl_method_small@Class@@QEAA?AUSmall@@XZ"(%class.Class* %this, %struct.Small* noalias sret align 4 
%agg.result) Big __cdecl cdecl_method_big() { return Big(); } - // LINUX: define {{.*}} void @_ZN5Class16cdecl_method_bigEv(%struct.Big* noalias sret %agg.result, %class.Class* %this) - // WIN32: define {{.*}} void @"?cdecl_method_big@Class@@QAA?AUBig@@XZ"(%class.Class* %this, %struct.Big* noalias sret %agg.result) - // WIN64: define linkonce_odr dso_local void @"?cdecl_method_big@Class@@QEAA?AUBig@@XZ"(%class.Class* %this, %struct.Big* noalias sret %agg.result) + // LINUX: define {{.*}} void @_ZN5Class16cdecl_method_bigEv(%struct.Big* noalias sret align 4 %agg.result, %class.Class* %this) + // WIN32: define {{.*}} void @"?cdecl_method_big@Class@@QAA?AUBig@@XZ"(%class.Class* %this, %struct.Big* noalias sret align 4 %agg.result) + // WIN64: define linkonce_odr dso_local void @"?cdecl_method_big@Class@@QEAA?AUBig@@XZ"(%class.Class* %this, %struct.Big* noalias sret align 4 %agg.result) void thiscall_method_arg(Empty s) {} // LINUX: define {{.*}} void @_ZN5Class19thiscall_method_argE5Empty(%class.Class* %this) diff --git a/clang/test/CodeGenCXX/microsoft-abi-vmemptr-conflicts.cpp b/clang/test/CodeGenCXX/microsoft-abi-vmemptr-conflicts.cpp index 607ec816aefb5..1ab60a5261287 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-vmemptr-conflicts.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-vmemptr-conflicts.cpp @@ -65,7 +65,7 @@ void f(C *c) { // CHECK-LABEL: define dso_local void @"?f@sret@@YAXPAUC@1@@Z"(%"struct.sret::C"* %c) // CHECK: call x86_thiscallcc i32 bitcast (void (%"struct.sret::C"*, ...)* @"??_9C@sret@@$BA@AE" to i32 (%"struct.sret::C"*)*)(%"struct.sret::C"* %{{.*}}) -// CHECK: call x86_thiscallcc void bitcast (void (%"struct.sret::C"*, ...)* @"??_9C@sret@@$BA@AE" to void (%"struct.sret::C"*, %"struct.sret::Big"*)*)(%"struct.sret::C"* %{{.*}}, %"struct.sret::Big"* sret %{{.*}}) +// CHECK: call x86_thiscallcc void bitcast (void (%"struct.sret::C"*, ...)* @"??_9C@sret@@$BA@AE" to void (%"struct.sret::C"*, %"struct.sret::Big"*)*)(%"struct.sret::C"* %{{.*}}, %"struct.sret::Big"* sret align 4 %{{.*}}) // CHECK-LABEL: define linkonce_odr x86_thiscallcc void @"??_9C@sret@@$BA@AE"(%"struct.sret::C"* %this, ...) {{.*}} comdat // CHECK: musttail call x86_thiscallcc void (%"struct.sret::C"*, ...) %{{.*}}(%"struct.sret::C"* %{{.*}}, ...) diff --git a/clang/test/CodeGenCXX/ms-thunks-ehspec.cpp b/clang/test/CodeGenCXX/ms-thunks-ehspec.cpp new file mode 100644 index 0000000000000..f72100d5078b1 --- /dev/null +++ b/clang/test/CodeGenCXX/ms-thunks-ehspec.cpp @@ -0,0 +1,27 @@ +// RUN: %clang_cc1 -fexceptions -fcxx-exceptions %s -triple=i686-windows-msvc -emit-llvm -o - | FileCheck %s + +// When generating thunks using musttail due to inalloca parameters, don't push +// and pop terminate scopes. 
PR44987 + +struct NonTrivial { + NonTrivial(); + NonTrivial(const NonTrivial &o); + ~NonTrivial(); + int x; +}; +struct A { + virtual void f(NonTrivial o) noexcept; +}; +struct B { + virtual void f(NonTrivial o) noexcept; +}; +class C : A, B { + virtual void f(NonTrivial o) noexcept; +}; +C c; + +// CHECK-LABEL: define linkonce_odr dso_local x86_thiscallcc void @"?f@C@@G3AEXUNonTrivial@@@Z"(%class.C* %this, <{ %struct.NonTrivial }>* inalloca %0) +// CHECK-NOT: invoke +// CHECK: musttail call x86_thiscallcc void @"?f@C@@EAEXUNonTrivial@@@Z"(%class.C* %{{.*}}, <{ %struct.NonTrivial }>* inalloca %0) +// CHECK-NEXT: ret void + diff --git a/clang/test/CodeGenCXX/regcall.cpp b/clang/test/CodeGenCXX/regcall.cpp index bdf76964bf231..9eca868fc31d2 100644 --- a/clang/test/CodeGenCXX/regcall.cpp +++ b/clang/test/CodeGenCXX/regcall.cpp @@ -74,8 +74,8 @@ bool __regcall operator ==(const test_class&, const test_class&){ --x; return fa // CHECK-WIN32-DAG: define dso_local x86_regcallcc zeroext i1 @"??8@Yw_NABVtest_class@@0@Z" test_class __regcall operator""_test_class (unsigned long long) { ++x; return test_class{};} -// CHECK-LIN64-DAG: define x86_regcallcc void @_Zli11_test_classy(%class.test_class* noalias sret %agg.result, i64 %0) -// CHECK-LIN32-DAG: define x86_regcallcc void @_Zli11_test_classy(%class.test_class* inreg noalias sret %agg.result, i64 %0) +// CHECK-LIN64-DAG: define x86_regcallcc void @_Zli11_test_classy(%class.test_class* noalias sret align 4 %agg.result, i64 %0) +// CHECK-LIN32-DAG: define x86_regcallcc void @_Zli11_test_classy(%class.test_class* inreg noalias sret align 4 %agg.result, i64 %0) // CHECK-WIN64-DAG: ??__K_test_class@@Yw?AVtest_class@@_K@Z" // CHECK-WIN32-DAG: ??__K_test_class@@Yw?AVtest_class@@_K@Z" @@ -99,7 +99,7 @@ void force_gen() { long double _Complex __regcall foo(long double _Complex f) { return f; } -// CHECK-LIN64-DAG: define x86_regcallcc void @_Z15__regcall3__fooCe({ x86_fp80, x86_fp80 }* noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval({ x86_fp80, x86_fp80 }) align 16 %f) -// CHECK-LIN32-DAG: define x86_regcallcc void @_Z15__regcall3__fooCe({ x86_fp80, x86_fp80 }* inreg noalias sret %agg.result, { x86_fp80, x86_fp80 }* byval({ x86_fp80, x86_fp80 }) align 4 %f) +// CHECK-LIN64-DAG: define x86_regcallcc void @_Z15__regcall3__fooCe({ x86_fp80, x86_fp80 }* noalias sret align 16 %agg.result, { x86_fp80, x86_fp80 }* byval({ x86_fp80, x86_fp80 }) align 16 %f) +// CHECK-LIN32-DAG: define x86_regcallcc void @_Z15__regcall3__fooCe({ x86_fp80, x86_fp80 }* inreg noalias sret align 4 %agg.result, { x86_fp80, x86_fp80 }* byval({ x86_fp80, x86_fp80 }) align 4 %f) // CHECK-WIN64-DAG: define dso_local x86_regcallcc { double, double } @"?foo@@YwU?$_Complex@O@__clang@@U12@@Z"(double %f.0, double %f.1) // CHECK-WIN32-DAG: define dso_local x86_regcallcc { double, double } @"?foo@@YwU?$_Complex@O@__clang@@U12@@Z"(double %f.0, double %f.1) diff --git a/clang/test/CodeGenCXX/stack-reuse-miscompile.cpp b/clang/test/CodeGenCXX/stack-reuse-miscompile.cpp index 4e824d94f510e..7d86ea8447b41 100644 --- a/clang/test/CodeGenCXX/stack-reuse-miscompile.cpp +++ b/clang/test/CodeGenCXX/stack-reuse-miscompile.cpp @@ -39,7 +39,7 @@ const char * f(S s) // CHECK: call void @llvm.lifetime.start.p0i8(i64 16, i8* [[T3i8]]) // CHECK: [[T5:%.*]] = call %class.T* @_ZN1TC1E1S(%class.T* [[T3]], [2 x i32] %{{.*}}) // -// CHECK: call void @_ZNK1T6concatERKS_(%class.T* sret [[T1]], %class.T* [[T2]], %class.T* dereferenceable(16) [[T3]]) +// CHECK: call void @_ZNK1T6concatERKS_(%class.T* sret align 4 
[[T1]], %class.T* [[T2]], %class.T* dereferenceable(16) [[T3]]) // CHECK: [[T6:%.*]] = call i8* @_ZNK1T3strEv(%class.T* [[T1]]) // // CHECK: call void @llvm.lifetime.end.p0i8( diff --git a/clang/test/CodeGenCXX/stack-reuse.cpp b/clang/test/CodeGenCXX/stack-reuse.cpp index 35dcb5b349c3e..94e5e3d9b364b 100644 --- a/clang/test/CodeGenCXX/stack-reuse.cpp +++ b/clang/test/CodeGenCXX/stack-reuse.cpp @@ -135,7 +135,7 @@ int large_combiner_test(S_large s) { // CHECK: [[T2:%.*]] = alloca %struct.Combiner // CHECK: [[T1:%.*]] = alloca %struct.Combiner // CHECK: [[T3:%.*]] = call %struct.Combiner* @_ZN8CombinerC1E7S_large(%struct.Combiner* nonnull [[T1]], [9 x i32] %s.coerce) -// CHECK: call void @_ZN8Combiner1fEv(%struct.Combiner* nonnull sret [[T2]], %struct.Combiner* nonnull [[T1]]) +// CHECK: call void @_ZN8Combiner1fEv(%struct.Combiner* nonnull sret align 4 [[T2]], %struct.Combiner* nonnull [[T1]]) // CHECK: [[T4:%.*]] = getelementptr inbounds %struct.Combiner, %struct.Combiner* [[T2]], i32 0, i32 0, i32 0, i32 0 // CHECK: [[T5:%.*]] = load i32, i32* [[T4]] // CHECK: ret i32 [[T5]] diff --git a/clang/test/CodeGenCXX/temporaries.cpp b/clang/test/CodeGenCXX/temporaries.cpp index d15e0fa05bd86..175b475c8cd7f 100644 --- a/clang/test/CodeGenCXX/temporaries.cpp +++ b/clang/test/CodeGenCXX/temporaries.cpp @@ -403,13 +403,13 @@ namespace Elision { // CHECK-NEXT: call void @_ZN7Elision1AC1Ev([[A]]* [[I]]) A i = (foo(), A()); - // CHECK-NEXT: call void @_ZN7Elision4fooAEv([[A]]* sret [[T0]]) + // CHECK-NEXT: call void @_ZN7Elision4fooAEv([[A]]* sret align 8 [[T0]]) // CHECK-NEXT: call void @_ZN7Elision1AC1Ev([[A]]* [[J]]) // CHECK-NEXT: call void @_ZN7Elision1AD1Ev([[A]]* [[T0]]) A j = (fooA(), A()); // CHECK-NEXT: call void @_ZN7Elision1AC1Ev([[A]]* [[T1]]) - // CHECK-NEXT: call void @_ZN7Elision4fooAEv([[A]]* sret [[K]]) + // CHECK-NEXT: call void @_ZN7Elision4fooAEv([[A]]* sret align 8 [[K]]) // CHECK-NEXT: call void @_ZN7Elision1AD1Ev([[A]]* [[T1]]) A k = (A(), fooA()); @@ -436,7 +436,7 @@ namespace Elision { // CHECK-NEXT: call void @_ZN7Elision1AD1Ev([[A]]* [[I]]) } - // CHECK: define void @_ZN7Elision5test2Ev([[A]]* noalias sret + // CHECK: define void @_ZN7Elision5test2Ev([[A]]* noalias sret align 8 A test2() { // CHECK: call void @_ZN7Elision3fooEv() // CHECK-NEXT: call void @_ZN7Elision1AC1Ev([[A]]* [[RET:%.*]]) @@ -444,7 +444,7 @@ namespace Elision { return (foo(), A()); } - // CHECK: define void @_ZN7Elision5test3EiNS_1AE([[A]]* noalias sret + // CHECK: define void @_ZN7Elision5test3EiNS_1AE([[A]]* noalias sret align 8 A test3(int v, A x) { if (v < 5) // CHECK: call void @_ZN7Elision1AC1Ev([[A]]* [[RET:%.*]]) @@ -485,7 +485,7 @@ namespace Elision { } // rdar://problem/8433352 - // CHECK: define void @_ZN7Elision5test5Ev([[A]]* noalias sret + // CHECK: define void @_ZN7Elision5test5Ev([[A]]* noalias sret align 8 struct B { A a; B(); }; A test5() { // CHECK: [[AT0:%.*]] = alloca [[A]], align 8 @@ -523,7 +523,7 @@ namespace Elision { void test6(const C *x) { // CHECK: [[T0:%.*]] = alloca [[A]], align 8 // CHECK: [[X:%.*]] = load [[C]]*, [[C]]** {{%.*}}, align 8 - // CHECK-NEXT: call void @_ZNK7Elision1CcvNS_1AEEv([[A]]* sret [[T0]], [[C]]* [[X]]) + // CHECK-NEXT: call void @_ZNK7Elision1CcvNS_1AEEv([[A]]* sret align 8 [[T0]], [[C]]* [[X]]) // CHECK-NEXT: call void @_ZNK7Elision1A3fooEv([[A]]* [[T0]]) // CHECK-NEXT: call void @_ZN7Elision1AD1Ev([[A]]* [[T0]]) // CHECK-NEXT: ret void diff --git a/clang/test/CodeGenCXX/thiscall-struct-return.cpp b/clang/test/CodeGenCXX/thiscall-struct-return.cpp 
index a6be5aa494e1b..35d5cc479177a 100644 --- a/clang/test/CodeGenCXX/thiscall-struct-return.cpp +++ b/clang/test/CodeGenCXX/thiscall-struct-return.cpp @@ -34,8 +34,8 @@ void test( void ) { // CHECK: call void @_ZN1CC1Ev(%class.C* [[C:%.+]]) C c; -// CHECK: call x86_thiscallcc void @_ZNK1C5SmallEv(%struct.S* sret %{{.+}}, %class.C* [[C]]) +// CHECK: call x86_thiscallcc void @_ZNK1C5SmallEv(%struct.S* sret align 4 %{{.+}}, %class.C* [[C]]) (void)c.Small(); -// CHECK: call x86_thiscallcc void @_ZNK1C6MediumEv(%struct.M* sret %{{.+}}, %class.C* [[C]]) +// CHECK: call x86_thiscallcc void @_ZNK1C6MediumEv(%struct.M* sret align 4 %{{.+}}, %class.C* [[C]]) (void)c.Medium(); } diff --git a/clang/test/CodeGenCXX/thunk-returning-memptr.cpp b/clang/test/CodeGenCXX/thunk-returning-memptr.cpp index 0b7870c6d6582..63bb3d68472d7 100644 --- a/clang/test/CodeGenCXX/thunk-returning-memptr.cpp +++ b/clang/test/CodeGenCXX/thunk-returning-memptr.cpp @@ -23,5 +23,5 @@ C::C() {} // Because of the tail call, the return value cannot be copied into a local // alloca. (PR39901) -// CHECK-LABEL: define linkonce_odr void @_ZThn4_N1C1fEv({ i32, i32 }* noalias sret %agg.result, %struct.C* %this) -// CHECK: tail call void @_ZN1C1fEv({ i32, i32 }* sret %agg.result +// CHECK-LABEL: define linkonce_odr void @_ZThn4_N1C1fEv({ i32, i32 }* noalias sret align 4 %agg.result, %struct.C* %this) +// CHECK: tail call void @_ZN1C1fEv({ i32, i32 }* sret align 4 %agg.result diff --git a/clang/test/CodeGenCXX/thunks-ehspec.cpp b/clang/test/CodeGenCXX/thunks-ehspec.cpp new file mode 100644 index 0000000000000..30276948d3fcd --- /dev/null +++ b/clang/test/CodeGenCXX/thunks-ehspec.cpp @@ -0,0 +1,29 @@ +// RUN: %clang_cc1 -fexceptions -fcxx-exceptions %s -triple=x86_64-pc-linux-gnu -munwind-tables -emit-llvm -o - -O1 -disable-llvm-passes | FileCheck %s + +// When generating the thunk for secondary, do not push terminate scopes for +// either the varargs or non-varargs case. Related to PR44987. + +struct A { + virtual void primary_key(); +}; +struct B { + virtual void secondary(); + virtual void secondary_vararg(int, ...); +}; +class C : A, B { + virtual void primary_key(); + void secondary() noexcept; + void secondary_vararg(int, ...) noexcept; +}; +void C::primary_key() {} + +// CHECK-LABEL: define available_externally void @_ZThn8_N1C9secondaryEv(%class.C* %this) +// CHECK-NOT: invoke +// CHECK: tail call void @_ZN1C9secondaryEv(%class.C* %{{.*}}) +// CHECK-NOT: invoke +// CHECK: ret void + +// CHECK-LABEL: define available_externally void @_ZThn8_N1C16secondary_varargEiz(%class.C* %this, i32 %0, ...) +// CHECK-NOT: invoke +// CHECK: musttail call void (%class.C*, i32, ...) @_ZN1C16secondary_varargEiz(%class.C* %{{.*}}, i32 %{{.*}}, ...) 
#2 +// CHECK-NEXT: ret void diff --git a/clang/test/CodeGenCXX/thunks.cpp b/clang/test/CodeGenCXX/thunks.cpp index fe7d656eb7e52..b5c2852f87703 100644 --- a/clang/test/CodeGenCXX/thunks.cpp +++ b/clang/test/CodeGenCXX/thunks.cpp @@ -206,13 +206,13 @@ namespace Test6 { // CHECK-LABEL: define void @_ZThn16_N5Test66Thunks1fEv // CHECK-DBG-NOT: dbg.declare // CHECK-NOT: memcpy - // CHECK: {{call void @_ZN5Test66Thunks1fEv.*sret}} + // CHECK: {{call void @_ZN5Test66Thunks1fEv.*sret align 1}} // CHECK: ret void X Thunks::f() { return X(); } - // WIN64-LABEL: define linkonce_odr dso_local void @"?f@Thunks@Test6@@WBA@EAA?AUX@2@XZ"({{.*}} sret %{{.*}}) + // WIN64-LABEL: define linkonce_odr dso_local void @"?f@Thunks@Test6@@WBA@EAA?AUX@2@XZ"({{.*}} sret align 1 %{{.*}}) // WIN64-NOT: memcpy - // WIN64: tail call void @"?f@Thunks@Test6@@UEAA?AUX@2@XZ"({{.*}} sret %{{.*}}) + // WIN64: tail call void @"?f@Thunks@Test6@@UEAA?AUX@2@XZ"({{.*}} sret align 1 %{{.*}}) } namespace Test7 { diff --git a/clang/test/CodeGenCXX/trivial_abi.cpp b/clang/test/CodeGenCXX/trivial_abi.cpp index 2cf07b22581a2..23c589dacd7e2 100644 --- a/clang/test/CodeGenCXX/trivial_abi.cpp +++ b/clang/test/CodeGenCXX/trivial_abi.cpp @@ -126,7 +126,7 @@ void testIgnoredSmall() { void testParamLarge(Large a) noexcept { } -// CHECK: define void @_Z15testReturnLargev(%[[STRUCT_LARGE:.*]]* noalias sret %[[AGG_RESULT:.*]]) +// CHECK: define void @_Z15testReturnLargev(%[[STRUCT_LARGE:.*]]* noalias sret align 8 %[[AGG_RESULT:.*]]) // CHECK: %[[CALL:.*]] = call %[[STRUCT_LARGE]]* @_ZN5LargeC1Ev(%[[STRUCT_LARGE]]* %[[AGG_RESULT]]) // CHECK: ret void // CHECK: } @@ -153,7 +153,7 @@ void testCallLarge0() { // CHECK: define void @_Z14testCallLarge1v() // CHECK: %[[AGG_TMP:.*]] = alloca %[[STRUCT_LARGE:.*]], align 8 -// CHECK: call void @_Z15testReturnLargev(%[[STRUCT_LARGE]]* sret %[[AGG_TMP]]) +// CHECK: call void @_Z15testReturnLargev(%[[STRUCT_LARGE]]* sret align 8 %[[AGG_TMP]]) // CHECK: call void @_Z14testParamLarge5Large(%[[STRUCT_LARGE]]* %[[AGG_TMP]]) // CHECK: ret void // CHECK: } @@ -164,7 +164,7 @@ void testCallLarge1() { // CHECK: define void @_Z16testIgnoredLargev() // CHECK: %[[AGG_TMP_ENSURED:.*]] = alloca %[[STRUCT_LARGE:.*]], align 8 -// CHECK: call void @_Z15testReturnLargev(%[[STRUCT_LARGE]]* sret %[[AGG_TMP_ENSURED]]) +// CHECK: call void @_Z15testReturnLargev(%[[STRUCT_LARGE]]* sret align 8 %[[AGG_TMP_ENSURED]]) // CHECK: %[[CALL:.*]] = call %[[STRUCT_LARGE]]* @_ZN5LargeD1Ev(%[[STRUCT_LARGE]]* %[[AGG_TMP_ENSURED]]) // CHECK: ret void // CHECK: } @@ -186,7 +186,7 @@ Trivial testReturnHasTrivial() { return t; } -// CHECK: define void @_Z23testReturnHasNonTrivialv(%[[STRUCT_NONTRIVIAL:.*]]* noalias sret %[[AGG_RESULT:.*]]) +// CHECK: define void @_Z23testReturnHasNonTrivialv(%[[STRUCT_NONTRIVIAL:.*]]* noalias sret align 4 %[[AGG_RESULT:.*]]) // CHECK: %[[CALL:.*]] = call %[[STRUCT_NONTRIVIAL]]* @_ZN10NonTrivialC1Ev(%[[STRUCT_NONTRIVIAL]]* %[[AGG_RESULT]]) // CHECK: ret void // CHECK: } diff --git a/clang/test/CodeGenCXX/unknown-anytype.cpp b/clang/test/CodeGenCXX/unknown-anytype.cpp index 42ed472380b15..0a7ab53b7af6c 100644 --- a/clang/test/CodeGenCXX/unknown-anytype.cpp +++ b/clang/test/CodeGenCXX/unknown-anytype.cpp @@ -71,7 +71,7 @@ struct Test7 { }; extern "C" __unknown_anytype test7_any(int); Test7 test7() { - // COMMON: call void @test7_any({{%.*}}* sret {{%.*}}, i32 5) + // COMMON: call void @test7_any({{%.*}}* sret align 1 {{%.*}}, i32 5) return (Test7) test7_any(5); } diff --git 
a/clang/test/CodeGenCXX/used-decl-visitor.cpp b/clang/test/CodeGenCXX/used-decl-visitor.cpp new file mode 100644 index 0000000000000..2b923ab562dbd --- /dev/null +++ b/clang/test/CodeGenCXX/used-decl-visitor.cpp @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -triple x86_64 -emit-llvm -o %t %s + +// Make sure there is no assertion due to UsedDeclVisitor. + +struct A { + int a; +}; + +static A a; + +struct B { + B(int b = a.a) {} +}; + + +void foo() { + B(); +} diff --git a/clang/test/CodeGenCXX/wasm-args-returns.cpp b/clang/test/CodeGenCXX/wasm-args-returns.cpp index c547eb85390da..3c57961eb2fcc 100644 --- a/clang/test/CodeGenCXX/wasm-args-returns.cpp +++ b/clang/test/CodeGenCXX/wasm-args-returns.cpp @@ -30,52 +30,52 @@ struct two_fields { double d, e; }; test(two_fields); -// CHECK: define void @_Z7forward10two_fields(%struct.two_fields* noalias nocapture sret %{{.*}}, %struct.two_fields* nocapture readonly byval(%struct.two_fields) align 8 %{{.*}}) +// CHECK: define void @_Z7forward10two_fields(%struct.two_fields* noalias nocapture sret align 8 %{{.*}}, %struct.two_fields* nocapture readonly byval(%struct.two_fields) align 8 %{{.*}}) // // CHECK: define void @_Z15test_two_fieldsv() // CHECK: %[[tmp:.*]] = alloca %struct.two_fields, align 8 -// CHECK: call void @_Z14def_two_fieldsv(%struct.two_fields* nonnull sret %[[tmp]]) +// CHECK: call void @_Z14def_two_fieldsv(%struct.two_fields* nonnull sret align 8 %[[tmp]]) // CHECK: call void @_Z3use10two_fields(%struct.two_fields* nonnull byval(%struct.two_fields) align 8 %[[tmp]]) // CHECK: ret void // // CHECK: declare void @_Z3use10two_fields(%struct.two_fields* byval(%struct.two_fields) align 8) -// CHECK: declare void @_Z14def_two_fieldsv(%struct.two_fields* sret) +// CHECK: declare void @_Z14def_two_fieldsv(%struct.two_fields* sret align 8) struct copy_ctor { double d; copy_ctor(copy_ctor const &); }; test(copy_ctor); -// CHECK: define void @_Z7forward9copy_ctor(%struct.copy_ctor* noalias sret %{{.*}}, %struct.copy_ctor* nonnull %{{.*}}) +// CHECK: define void @_Z7forward9copy_ctor(%struct.copy_ctor* noalias sret align 8 %{{.*}}, %struct.copy_ctor* nonnull %{{.*}}) // // CHECK: declare %struct.copy_ctor* @_ZN9copy_ctorC1ERKS_(%struct.copy_ctor* returned, %struct.copy_ctor* dereferenceable(8)) // // CHECK: define void @_Z14test_copy_ctorv() // CHECK: %[[tmp:.*]] = alloca %struct.copy_ctor, align 8 -// CHECK: call void @_Z13def_copy_ctorv(%struct.copy_ctor* nonnull sret %[[tmp]]) +// CHECK: call void @_Z13def_copy_ctorv(%struct.copy_ctor* nonnull sret align 8 %[[tmp]]) // CHECK: call void @_Z3use9copy_ctor(%struct.copy_ctor* nonnull %[[tmp]]) // CHECK: ret void // // CHECK: declare void @_Z3use9copy_ctor(%struct.copy_ctor*) -// CHECK: declare void @_Z13def_copy_ctorv(%struct.copy_ctor* sret) +// CHECK: declare void @_Z13def_copy_ctorv(%struct.copy_ctor* sret align 8) struct __attribute__((aligned(16))) aligned_copy_ctor { double d, e; aligned_copy_ctor(aligned_copy_ctor const &); }; test(aligned_copy_ctor); -// CHECK: define void @_Z7forward17aligned_copy_ctor(%struct.aligned_copy_ctor* noalias sret %{{.*}}, %struct.aligned_copy_ctor* nonnull %{{.*}}) +// CHECK: define void @_Z7forward17aligned_copy_ctor(%struct.aligned_copy_ctor* noalias sret align 16 %{{.*}}, %struct.aligned_copy_ctor* nonnull %{{.*}}) // // CHECK: declare %struct.aligned_copy_ctor* @_ZN17aligned_copy_ctorC1ERKS_(%struct.aligned_copy_ctor* returned, %struct.aligned_copy_ctor* dereferenceable(16)) // // CHECK: define void @_Z22test_aligned_copy_ctorv() // CHECK: %[[tmp:.*]] = alloca 
%struct.aligned_copy_ctor, align 16 -// CHECK: call void @_Z21def_aligned_copy_ctorv(%struct.aligned_copy_ctor* nonnull sret %[[tmp]]) +// CHECK: call void @_Z21def_aligned_copy_ctorv(%struct.aligned_copy_ctor* nonnull sret align 16 %[[tmp]]) // CHECK: call void @_Z3use17aligned_copy_ctor(%struct.aligned_copy_ctor* nonnull %[[tmp]]) // CHECK: ret void // // CHECK: declare void @_Z3use17aligned_copy_ctor(%struct.aligned_copy_ctor*) -// CHECK: declare void @_Z21def_aligned_copy_ctorv(%struct.aligned_copy_ctor* sret) +// CHECK: declare void @_Z21def_aligned_copy_ctorv(%struct.aligned_copy_ctor* sret align 16) struct empty {}; test(empty); diff --git a/clang/test/CodeGenCXX/x86_32-arguments.cpp b/clang/test/CodeGenCXX/x86_32-arguments.cpp index 830168635b529..c7ff59e943d2e 100644 --- a/clang/test/CodeGenCXX/x86_32-arguments.cpp +++ b/clang/test/CodeGenCXX/x86_32-arguments.cpp @@ -6,7 +6,7 @@ struct S { short s; }; -// CHECK-LABEL: define void @_Z1fv(%struct.S* noalias sret % +// CHECK-LABEL: define void @_Z1fv(%struct.S* noalias sret align 2 % S f() { return S(); } // CHECK-LABEL: define void @_Z1f1S(%struct.S* %0) void f(S) { } @@ -18,7 +18,7 @@ class C { double c; }; -// CHECK-LABEL: define void @_Z1gv(%class.C* noalias sret % +// CHECK-LABEL: define void @_Z1gv(%class.C* noalias sret align 4 % C g() { return C(); } // CHECK-LABEL: define void @_Z1f1C(%class.C* %0) @@ -103,13 +103,13 @@ struct s7_1 { double x; }; struct s7 : s7_0, s7_1 { }; s7 f7() { return s7(); } -// CHECK-LABEL: define void @_Z2f8v(%struct.s8* noalias sret %agg.result) +// CHECK-LABEL: define void @_Z2f8v(%struct.s8* noalias sret align 4 %agg.result) struct s8_0 { }; struct s8_1 { double x; }; struct s8 { s8_0 a; s8_1 b; }; s8 f8() { return s8(); } -// CHECK-LABEL: define void @_Z2f9v(%struct.s9* noalias sret %agg.result) +// CHECK-LABEL: define void @_Z2f9v(%struct.s9* noalias sret align 4 %agg.result) struct s9_0 { unsigned : 0; }; struct s9_1 { double x; }; struct s9 { s9_0 a; s9_1 b; }; diff --git a/clang/test/CodeGenCXX/x86_64-arguments.cpp b/clang/test/CodeGenCXX/x86_64-arguments.cpp index e905907788950..f7a898b220af7 100644 --- a/clang/test/CodeGenCXX/x86_64-arguments.cpp +++ b/clang/test/CodeGenCXX/x86_64-arguments.cpp @@ -176,7 +176,7 @@ namespace test9 { // CHECK: define void @_ZN5test93fooEPNS_1SEPNS_1TE([[S:%.*]]* %0, [[T:%.*]]* %1) void foo(S*, T*) {} - // CHECK: define void @_ZN5test91aEiiiiNS_1TEPv([[S]]* noalias sret {{%.*}}, i32 %0, i32 %1, i32 %2, i32 %3, [[T]]* byval([[T]]) align 8 %4, i8* %5) + // CHECK: define void @_ZN5test91aEiiiiNS_1TEPv([[S]]* noalias sret align 8 {{%.*}}, i32 %0, i32 %1, i32 %2, i32 %3, [[T]]* byval([[T]]) align 8 %4, i8* %5) S a(int, int, int, int, T, void*) { return S(); } @@ -186,7 +186,7 @@ namespace test9 { return sret; } - // CHECK: define void @_ZN5test91cEiiiNS_1TEPv([[S]]* noalias sret {{%.*}}, i32 %0, i32 %1, i32 %2, i8* {{%.*}}, i8* {{%.*}}, i8* %3) + // CHECK: define void @_ZN5test91cEiiiNS_1TEPv([[S]]* noalias sret align 8 {{%.*}}, i32 %0, i32 %1, i32 %2, i8* {{%.*}}, i8* {{%.*}}, i8* %3) S c(int, int, int, T, void*) { return S(); } diff --git a/clang/test/CodeGenCoroutines/coro-always-inline.cpp b/clang/test/CodeGenCoroutines/coro-always-inline.cpp new file mode 100644 index 0000000000000..a2e4bba45c0c9 --- /dev/null +++ b/clang/test/CodeGenCoroutines/coro-always-inline.cpp @@ -0,0 +1,54 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fcoroutines-ts \ +// RUN: -fexperimental-new-pass-manager -O0 %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple 
x86_64-unknown-linux-gnu -emit-llvm -fcoroutines-ts \ +// RUN: -fexperimental-new-pass-manager -fno-inline -O0 %s -o - | FileCheck %s + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fcoroutines-ts \ +// RUN: -O0 %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm -fcoroutines-ts \ +// RUN: -fno-inline -O0 %s -o - | FileCheck %s + +namespace std { +namespace experimental { + +struct handle {}; + +struct awaitable { + bool await_ready() { return true; } + // CHECK-NOT: await_suspend + inline void __attribute__((__always_inline__)) await_suspend(handle) {} + bool await_resume() { return true; } +}; + +template +struct coroutine_handle { + static handle from_address(void *address) { return {}; } +}; + +template +struct coroutine_traits { + struct promise_type { + awaitable initial_suspend() { return {}; } + awaitable final_suspend() { return {}; } + void return_void() {} + T get_return_object() { return T(); } + void unhandled_exception() {} + }; +}; +} // namespace experimental +} // namespace std + +// CHECK-LABEL: @_Z3foov +// CHECK-LABEL: entry: +// CHECK-NEXT: %this.addr.i{{[0-9]*}} = alloca %"struct.std::experimental::awaitable"*, align 8 +// CHECK-NEXT: %this.addr.i{{[0-9]*}} = alloca %"struct.std::experimental::awaitable"*, align 8 +// CHECK: [[CAST0:%[0-9]+]] = bitcast %"struct.std::experimental::awaitable"** %this.addr.i{{[0-9]*}} to i8* +// CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[CAST0]]) +// CHECK: [[CAST1:%[0-9]+]] = bitcast %"struct.std::experimental::awaitable"** %this.addr.i{{[0-9]*}} to i8* +// CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[CAST1]]) + +// CHECK: [[CAST2:%[0-9]+]] = bitcast %"struct.std::experimental::awaitable"** %this.addr.i{{[0-9]*}} to i8* +// CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 8, i8* [[CAST2]]) +// CHECK: [[CAST3:%[0-9]+]] = bitcast %"struct.std::experimental::awaitable"** %this.addr.i{{[0-9]*}} to i8* +// CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 8, i8* [[CAST3]]) +void foo() { co_return; } diff --git a/clang/test/CodeGenCoroutines/coro-await.cpp b/clang/test/CodeGenCoroutines/coro-await.cpp index 86bacc766db3f..99097f376aa57 100644 --- a/clang/test/CodeGenCoroutines/coro-await.cpp +++ b/clang/test/CodeGenCoroutines/coro-await.cpp @@ -130,7 +130,7 @@ extern "C" void f1(int) { // CHECK: %[[PROMISE:.+]] = alloca %"struct.std::experimental::coroutine_traits::promise_type" // CHECK: %[[FRAME:.+]] = call i8* @llvm.coro.begin( co_yield 42; - // CHECK: call void @_ZNSt12experimental16coroutine_traitsIJviEE12promise_type11yield_valueEi(%struct.suspend_maybe* sret %[[AWAITER:.+]], %"struct.std::experimental::coroutine_traits::promise_type"* %[[PROMISE]], i32 42) + // CHECK: call void @_ZNSt12experimental16coroutine_traitsIJviEE12promise_type11yield_valueEi(%struct.suspend_maybe* sret align 4 %[[AWAITER:.+]], %"struct.std::experimental::coroutine_traits::promise_type"* %[[PROMISE]], i32 42) // See if we need to suspend: // -------------------------- @@ -197,20 +197,20 @@ extern "C" void UseAggr(Aggr&&); extern "C" void TestAggr() { UseAggr(co_await AggrAwaiter{}); Whatever(); - // CHECK: call void @_ZN11AggrAwaiter12await_resumeEv(%struct.Aggr* sret %[[AwaitResume:.+]], + // CHECK: call void @_ZN11AggrAwaiter12await_resumeEv(%struct.Aggr* sret align 4 %[[AwaitResume:.+]], // CHECK: call void @UseAggr(%struct.Aggr* dereferenceable(12) %[[AwaitResume]]) // CHECK: call void @_ZN4AggrD1Ev(%struct.Aggr* %[[AwaitResume]]) // CHECK: call void @Whatever() 
co_await AggrAwaiter{}; Whatever(); - // CHECK: call void @_ZN11AggrAwaiter12await_resumeEv(%struct.Aggr* sret %[[AwaitResume2:.+]], + // CHECK: call void @_ZN11AggrAwaiter12await_resumeEv(%struct.Aggr* sret align 4 %[[AwaitResume2:.+]], // CHECK: call void @_ZN4AggrD1Ev(%struct.Aggr* %[[AwaitResume2]]) // CHECK: call void @Whatever() Aggr Val = co_await AggrAwaiter{}; Whatever(); - // CHECK: call void @_ZN11AggrAwaiter12await_resumeEv(%struct.Aggr* sret %[[AwaitResume3:.+]], + // CHECK: call void @_ZN11AggrAwaiter12await_resumeEv(%struct.Aggr* sret align 4 %[[AwaitResume3:.+]], // CHECK: call void @Whatever() // CHECK: call void @_ZN4AggrD1Ev(%struct.Aggr* %[[AwaitResume3]]) } @@ -253,7 +253,7 @@ extern "C" void TestOpAwait() { co_await MyAgg{}; // CHECK: call void @_ZN5MyAggawEv(%struct.MyAgg* % - // CHECK: call void @_ZN11AggrAwaiter12await_resumeEv(%struct.Aggr* sret % + // CHECK: call void @_ZN11AggrAwaiter12await_resumeEv(%struct.Aggr* sret align 4 % } // CHECK-LABEL: EndlessLoop( diff --git a/clang/test/CodeGenCoroutines/coro-gro-nrvo.cpp b/clang/test/CodeGenCoroutines/coro-gro-nrvo.cpp index 8a0b234e3ae27..42856f4479ec3 100644 --- a/clang/test/CodeGenCoroutines/coro-gro-nrvo.cpp +++ b/clang/test/CodeGenCoroutines/coro-gro-nrvo.cpp @@ -34,14 +34,14 @@ struct coro { }; // Verify that the NRVO is applied to the Gro object. -// CHECK-LABEL: define void @_Z1fi(%struct.coro* noalias sret %agg.result, i32 %0) +// CHECK-LABEL: define void @_Z1fi(%struct.coro* noalias sret align 8 %agg.result, i32 %0) coro f(int) { // CHECK: %call = call noalias nonnull i8* @_Znwm( // CHECK-NEXT: br label %[[CoroInit:.*]] // CHECK: {{.*}}[[CoroInit]]: // CHECK: store i1 false, i1* %gro.active -// CHECK: call void @{{.*get_return_objectEv}}(%struct.coro* sret %agg.result +// CHECK: call void @{{.*get_return_objectEv}}(%struct.coro* sret align 8 %agg.result // CHECK-NEXT: store i1 true, i1* %gro.active co_return; } @@ -65,7 +65,7 @@ struct coro_two { }; // Verify that the NRVO is applied to the Gro object. -// CHECK-LABEL: define void @_Z1hi(%struct.coro_two* noalias sret %agg.result, i32 %0) +// CHECK-LABEL: define void @_Z1hi(%struct.coro_two* noalias sret align 8 %agg.result, i32 %0) coro_two h(int) { // CHECK: %call = call noalias i8* @_ZnwmRKSt9nothrow_t @@ -73,12 +73,12 @@ struct coro_two { // CHECK-NEXT: br i1 %[[CheckNull]], label %[[InitOnSuccess:.*]], label %[[InitOnFailure:.*]] // CHECK: {{.*}}[[InitOnFailure]]: -// CHECK-NEXT: call void @{{.*get_return_object_on_allocation_failureEv}}(%struct.coro_two* sret %agg.result +// CHECK-NEXT: call void @{{.*get_return_object_on_allocation_failureEv}}(%struct.coro_two* sret align 8 %agg.result // CHECK-NEXT: br label %[[RetLabel:.*]] // CHECK: {{.*}}[[InitOnSuccess]]: // CHECK: store i1 false, i1* %gro.active -// CHECK: call void @{{.*get_return_objectEv}}(%struct.coro_two* sret %agg.result +// CHECK: call void @{{.*get_return_objectEv}}(%struct.coro_two* sret align 8 %agg.result // CHECK-NEXT: store i1 true, i1* %gro.active // CHECK: [[RetLabel]]: diff --git a/clang/test/CodeGenObjC/arc.m b/clang/test/CodeGenObjC/arc.m index 560a084495783..375ad8ed7b416 100644 --- a/clang/test/CodeGenObjC/arc.m +++ b/clang/test/CodeGenObjC/arc.m @@ -1536,23 +1536,20 @@ void test70(id i) { // CHECK-LABEL: define void @test71 void test71(void) { - // FIXME: It would be nice if the __destructor_8_s40 for the first call (and - // the following lifetime.end) came before the second call. 
- // // CHECK: %[[T:[^ ]+]] = bitcast %struct.AggDtor* %[[TMP1:[^ ]+]] to i8* // CHECK: call void @llvm.lifetime.start.p0i8({{[^,]+}}, i8* %[[T]]) - // CHECK: call void @getAggDtor(%struct.AggDtor* sret %[[TMP1]]) + // CHECK: call void @getAggDtor(%struct.AggDtor* sret align 8 %[[TMP1]]) + // CHECK: %[[T:[^ ]+]] = bitcast %struct.AggDtor* %[[TMP1]] to i8** + // CHECK: call void @__destructor_8_s40(i8** %[[T]]) + // CHECK: %[[T:[^ ]+]] = bitcast %struct.AggDtor* %[[TMP1:[^ ]+]] to i8* + // CHECK: call void @llvm.lifetime.end.p0i8({{[^,]+}}, i8* %[[T]]) // CHECK: %[[T:[^ ]+]] = bitcast %struct.AggDtor* %[[TMP2:[^ ]+]] to i8* // CHECK: call void @llvm.lifetime.start.p0i8({{[^,]+}}, i8* %[[T]]) - // CHECK: call void @getAggDtor(%struct.AggDtor* sret %[[TMP2]]) + // CHECK: call void @getAggDtor(%struct.AggDtor* sret align 8 %[[TMP2]]) // CHECK: %[[T:[^ ]+]] = bitcast %struct.AggDtor* %[[TMP2]] to i8** // CHECK: call void @__destructor_8_s40(i8** %[[T]]) // CHECK: %[[T:[^ ]+]] = bitcast %struct.AggDtor* %[[TMP2:[^ ]+]] to i8* // CHECK: call void @llvm.lifetime.end.p0i8({{[^,]+}}, i8* %[[T]]) - // CHECK: %[[T:[^ ]+]] = bitcast %struct.AggDtor* %[[TMP1]] to i8** - // CHECK: call void @__destructor_8_s40(i8** %[[T]]) - // CHECK: %[[T:[^ ]+]] = bitcast %struct.AggDtor* %[[TMP1:[^ ]+]] to i8* - // CHECK: call void @llvm.lifetime.end.p0i8({{[^,]+}}, i8* %[[T]]) getAggDtor(); getAggDtor(); } diff --git a/clang/test/CodeGenObjC/direct-method.m b/clang/test/CodeGenObjC/direct-method.m index e53c99bc0f5e8..5bb84de1ddb58 100644 --- a/clang/test/CodeGenObjC/direct-method.m +++ b/clang/test/CodeGenObjC/direct-method.m @@ -120,7 +120,7 @@ + (struct my_complex_struct)classGetComplex __attribute__((objc_direct)) { // CHECK-LABEL: define hidden void @"\01-[Root getAggregate]"( - (struct my_aggregate_struct)getAggregate __attribute__((objc_direct)) { - // CHECK: %struct.my_aggregate_struct* noalias sret [[RETVAL:%[^,]*]], + // CHECK: %struct.my_aggregate_struct* noalias sret align 4 [[RETVAL:%[^,]*]], // loading parameters // CHECK-LABEL: entry: diff --git a/clang/test/CodeGenObjC/nontrivial-c-struct-exception.m b/clang/test/CodeGenObjC/nontrivial-c-struct-exception.m index 1733a019026c0..8d66485959a8c 100644 --- a/clang/test/CodeGenObjC/nontrivial-c-struct-exception.m +++ b/clang/test/CodeGenObjC/nontrivial-c-struct-exception.m @@ -41,8 +41,8 @@ void testStrongException(void) { // CHECK: define void @testWeakException() // CHECK: %[[AGG_TMP:.*]] = alloca %[[STRUCT_WEAK]], align 8 // CHECK: %[[AGG_TMP1:.*]] = alloca %[[STRUCT_WEAK]], align 8 -// CHECK: call void @genWeak(%[[STRUCT_WEAK]]* sret %[[AGG_TMP]]) -// CHECK: invoke void @genWeak(%[[STRUCT_WEAK]]* sret %[[AGG_TMP1]]) +// CHECK: call void @genWeak(%[[STRUCT_WEAK]]* sret align 8 %[[AGG_TMP]]) +// CHECK: invoke void @genWeak(%[[STRUCT_WEAK]]* sret align 8 %[[AGG_TMP1]]) // CHECK: call void @calleeWeak(%[[STRUCT_WEAK]]* %[[AGG_TMP]], %[[STRUCT_WEAK]]* %[[AGG_TMP1]]) // CHECK: ret void diff --git a/clang/test/CodeGenObjC/objc-non-trivial-struct-nrvo.m b/clang/test/CodeGenObjC/objc-non-trivial-struct-nrvo.m index 53ff433989e29..93f348185412a 100644 --- a/clang/test/CodeGenObjC/objc-non-trivial-struct-nrvo.m +++ b/clang/test/CodeGenObjC/objc-non-trivial-struct-nrvo.m @@ -37,7 +37,7 @@ Trivial testTrivial(void) { void func1(TrivialBig *); -// CHECK: define void @testTrivialBig(%[[STRUCT_TRIVIALBIG]]* noalias sret %[[AGG_RESULT:.*]]) +// CHECK: define void @testTrivialBig(%[[STRUCT_TRIVIALBIG]]* noalias sret align 4 %[[AGG_RESULT:.*]]) // CHECK: call void 
@func1(%[[STRUCT_TRIVIALBIG]]* %[[AGG_RESULT]]) // CHECK-NEXT: ret void @@ -69,7 +69,7 @@ Strong testStrong(void) { return a; } -// CHECK: define void @testWeak(%[[STRUCT_WEAK]]* noalias sret %[[AGG_RESULT:.*]]) +// CHECK: define void @testWeak(%[[STRUCT_WEAK]]* noalias sret align 8 %[[AGG_RESULT:.*]]) // CHECK: %[[NRVO:.*]] = alloca i1, align 1 // CHECK: %[[V0:.*]] = bitcast %[[STRUCT_WEAK]]* %[[AGG_RESULT]] to i8** // CHECK: call void @__default_constructor_8_w0(i8** %[[V0]]) @@ -105,7 +105,7 @@ Weak testWeak2(int c) { return b; } -// CHECK: define internal void @"\01-[C1 foo1]"(%[[STRUCT_WEAK]]* noalias sret %[[AGG_RESULT:.*]], %{{.*}}* %{{.*}}, i8* %{{.*}}) +// CHECK: define internal void @"\01-[C1 foo1]"(%[[STRUCT_WEAK]]* noalias sret align 8 %[[AGG_RESULT:.*]], %{{.*}}* %{{.*}}, i8* %{{.*}}) // CHECK: %[[NRVO:.*]] = alloca i1, align 1 // CHECK: %[[V0:.*]] = bitcast %[[STRUCT_WEAK]]* %[[AGG_RESULT]] to i8** // CHECK: call void @__default_constructor_8_w0(i8** %[[V0]]) diff --git a/clang/test/CodeGenObjC/stret-1.m b/clang/test/CodeGenObjC/stret-1.m index 1122c28a468bd..f25c40438e590 100644 --- a/clang/test/CodeGenObjC/stret-1.m +++ b/clang/test/CodeGenObjC/stret-1.m @@ -14,19 +14,19 @@ int main(int argc, const char **argv) { struct stret s; s = [(id)(argc&~255) method]; - // CHECK: call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (%struct.stret*, i8*, i8*)*)(%struct.stret* sret [[T0:%[^,]+]] + // CHECK: call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (%struct.stret*, i8*, i8*)*)(%struct.stret* sret align 4 [[T0:%[^,]+]] // CHECK: [[T0P:%.*]] = bitcast %struct.stret* [[T0]] to i8* // CHECK: call void @llvm.memset.p0i8.i64(i8* align 4 [[T0P]], i8 0, i64 400, i1 false) s = [Test method]; - // CHECK: call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (%struct.stret*, i8*, i8*)*)(%struct.stret* sret [[T1:%[^,]+]] + // CHECK: call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (%struct.stret*, i8*, i8*)*)(%struct.stret* sret align 4 [[T1:%[^,]+]] // CHECK-NOT: call void @llvm.memset.p0i8.i64( [(id)(argc&~255) method]; - // CHECK: call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (%struct.stret*, i8*, i8*)*)(%struct.stret* sret [[T1:%[^,]+]] + // CHECK: call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (%struct.stret*, i8*, i8*)*)(%struct.stret* sret align 4 [[T1:%[^,]+]] // CHECK-NOT: call void @llvm.memset.p0i8.i64( [Test method]; - // CHECK: call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (%struct.stret*, i8*, i8*)*)(%struct.stret* sret [[T1:%[^,]+]] + // CHECK: call void bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to void (%struct.stret*, i8*, i8*)*)(%struct.stret* sret align 4 [[T1:%[^,]+]] // CHECK-NOT: call void @llvm.memset.p0i8.i64( } diff --git a/clang/test/CodeGenObjC/strong-in-c-struct.m b/clang/test/CodeGenObjC/strong-in-c-struct.m index 34bf0323695a6..ec212c46803da 100644 --- a/clang/test/CodeGenObjC/strong-in-c-struct.m +++ b/clang/test/CodeGenObjC/strong-in-c-struct.m @@ -89,6 +89,13 @@ void calleeStrongSmall(StrongSmall); void func(Strong *); +@interface C +- (StrongSmall)getStrongSmall; ++ (StrongSmall)getStrongSmallClass; +@end + +id g0; + // CHECK: %[[STRUCT_STRONGOUTER:.*]] = type { %[[STRUCT_STRONG:.*]], i8*, double } // CHECK: %[[STRUCT_STRONG]] = type { %[[STRUCT_TRIVIAL:.*]], i8* } // CHECK: %[[STRUCT_TRIVIAL]] = type { [4 x i32] } @@ -476,6 +483,18 @@ void test_destructor_ignored_result(void) { getStrongSmall(); } +// CHECK: define void @test_destructor_ignored_result2(%{{.*}}* 
%[[C:.*]]) +// CHECK: %[[TMP:.*]] = alloca %[[STRUCT_STRONGSMALL]], align 8 +// CHECK: %[[CALL:.*]] = call [2 x i64]{{.*}}@objc_msgSend +// CHECK: %[[V5:.*]] = bitcast %[[STRUCT_STRONGSMALL]]* %[[TMP]] to [2 x i64]* +// CHECK: store [2 x i64] %[[CALL]], [2 x i64]* %[[V5]], align 8 +// CHECK: %[[V6:.*]] = bitcast %[[STRUCT_STRONGSMALL]]* %[[TMP]] to i8** +// CHECK: call void @__destructor_8_s8(i8** %[[V6]]) + +void test_destructor_ignored_result2(C *c) { + [c getStrongSmall]; +} + // CHECK: define void @test_copy_constructor_StrongBlock( // CHECK: call void @__copy_constructor_8_8_sb0( // CHECK: call void @__destructor_8_sb0( @@ -520,7 +539,9 @@ void test_copy_assignment_StrongBlock(StrongBlock *d, StrongBlock *s) { // CHECK: define void @test_copy_constructor_StrongVolatile0( // CHECK: call void @__copy_constructor_8_8_t0w4_sv8( +// CHECK-NOT: call // CHECK: call void @__destructor_8_sv8( +// CHECK-NOT: call // CHECK: define linkonce_odr hidden void @__copy_constructor_8_8_t0w4_sv8( // CHECK: %[[V8:.*]] = load volatile i8*, i8** %{{.*}}, align 8 @@ -808,4 +829,62 @@ void test_compound_literal2(int c, StrongSmall *p) { func(0); } +// CHECK: define void @test_member_access( +// CHECK: %[[TMP:.*]] = alloca %[[STRUCT_STRONGSMALL]], +// CHECK: %[[V3:.*]] = bitcast %[[STRUCT_STRONGSMALL]]* %[[TMP]] to i8** +// CHECK: call void @__destructor_8_s8(i8** %[[V3]]) +// CHECK: call void @func( + +void test_member_access(void) { + g0 = getStrongSmall().f1; + func(0); +} + +// CHECK: define void @test_member_access2(%{{.*}}* %[[C:.*]]) +// CHECK: %[[COERCE:.*]] = alloca %[[STRUCT_STRONGSMALL]], align 8 +// CHECK: %[[V8:.*]] = bitcast %[[STRUCT_STRONGSMALL]]* %[[COERCE]] to i8** +// CHECK: call void @__destructor_8_s8(i8** %[[V8]]) +// CHECK: call void @func( + +void test_member_access2(C *c) { + g0 = [c getStrongSmall].f1; + func(0); +} + +// CHECK: define void @test_member_access3( +// CHECK: %[[COERCE:.*]] = alloca %[[STRUCT_STRONGSMALL]], align 8 +// CHECK: %[[V8:.*]] = bitcast %[[STRUCT_STRONGSMALL]]* %[[COERCE]] to i8** +// CHECK: call void @__destructor_8_s8(i8** %[[V8]]) +// CHECK: call void @func( + +void test_member_access3(void) { + g0 = [C getStrongSmallClass].f1; + func(0); +} + +// CHECK: define void @test_member_access4() +// CHECK: %[[COERCE:.*]] = alloca %[[STRUCT_STRONGSMALL]], align 8 +// CHECK: %[[V5:.*]] = bitcast %[[STRUCT_STRONGSMALL]]* %[[COERCE]] to i8** +// CHECK: call void @__destructor_8_s8(i8** %[[V5]]) +// CHECK: call void @func( + +void test_member_access4(void) { + g0 = ^{ StrongSmall s; return s; }().f1; + func(0); +} + +// CHECK: define void @test_volatile_variable_reference( +// CHECK: %[[AGG_TMP_ENSURED:.*]] = alloca %[[STRUCT_STRONGSMALL]], +// CHECK: %[[V1:.*]] = bitcast %[[STRUCT_STRONGSMALL]]* %[[AGG_TMP_ENSURED]] to i8** +// CHECK: %[[V2:.*]] = bitcast %[[STRUCT_STRONGSMALL]]* %{{.*}} to i8** +// CHECK: call void @__copy_constructor_8_8_tv0w32_sv8(i8** %[[V1]], i8** %[[V2]]) +// CHECK: %[[V3:.*]] = bitcast %[[STRUCT_STRONGSMALL]]* %[[AGG_TMP_ENSURED]] to i8** +// CHECK: call void @__destructor_8_s8(i8** %[[V3]]) +// CHECK: call void @func( + +void test_volatile_variable_reference(volatile StrongSmall *a) { + (void)*a; + func(0); +} + #endif /* USESTRUCT */ diff --git a/clang/test/CodeGenObjC/weak-in-c-struct.m b/clang/test/CodeGenObjC/weak-in-c-struct.m index 001a7ed96dec8..90c799298253b 100644 --- a/clang/test/CodeGenObjC/weak-in-c-struct.m +++ b/clang/test/CodeGenObjC/weak-in-c-struct.m @@ -179,7 +179,7 @@ void test_argument_Weak(Weak *a) { calleeWeak(*a); } -// 
COMMON: define void @test_return_Weak(%[[STRUCT_WEAK]]* noalias sret %[[AGG_RESULT:.*]], %[[STRUCT_WEAK]]* %[[A:.*]]) +// COMMON: define void @test_return_Weak(%[[STRUCT_WEAK]]* noalias sret align {{.*}} %[[AGG_RESULT:.*]], %[[STRUCT_WEAK]]* %[[A:.*]]) // COMMON: %[[A_ADDR:.*]] = alloca %[[STRUCT_WEAK]]* // COMMON: store %[[STRUCT_WEAK]]* %[[A]], %[[STRUCT_WEAK]]** %[[A_ADDR]] // COMMON: %[[V0:.*]] = load %[[STRUCT_WEAK]]*, %[[STRUCT_WEAK]]** %[[A_ADDR]] diff --git a/clang/test/CodeGenObjCXX/objc-struct-cxx-abi.mm b/clang/test/CodeGenObjCXX/objc-struct-cxx-abi.mm index dd9b88b0234d0..8bb694705fef1 100644 --- a/clang/test/CodeGenObjCXX/objc-struct-cxx-abi.mm +++ b/clang/test/CodeGenObjCXX/objc-struct-cxx-abi.mm @@ -90,7 +90,7 @@ void testCallStrongWeak(StrongWeak *a) { testParamStrongWeak(*a); } -// CHECK: define void @_Z20testReturnStrongWeakP10StrongWeak(%[[STRUCT_STRONGWEAK:.*]]* noalias sret %[[AGG_RESULT:.*]], %[[STRUCT_STRONGWEAK]]* %[[A:.*]]) +// CHECK: define void @_Z20testReturnStrongWeakP10StrongWeak(%[[STRUCT_STRONGWEAK:.*]]* noalias sret align 8 %[[AGG_RESULT:.*]], %[[STRUCT_STRONGWEAK]]* %[[A:.*]]) // CHECK: %[[A_ADDR:.*]] = alloca %[[STRUCT_STRONGWEAK]]*, align 8 // CHECK: store %[[STRUCT_STRONGWEAK]]* %[[A]], %[[STRUCT_STRONGWEAK]]** %[[A_ADDR]], align 8 // CHECK: %[[V0:.*]] = load %[[STRUCT_STRONGWEAK]]*, %[[STRUCT_STRONGWEAK]]** %[[A_ADDR]], align 8 diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl index cdbf28bbcad87..35cc54c50d6f2 100644 --- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -43,7 +43,7 @@ struct LargeStructTwoMember { struct LargeStructOneMember g_s; #endif -// X86-LABEL: define void @foo(%struct.Mat4X4* noalias sret %agg.result, %struct.Mat3X3* byval(%struct.Mat3X3) align 4 %in) +// X86-LABEL: define void @foo(%struct.Mat4X4* noalias sret align 4 %agg.result, %struct.Mat3X3* byval(%struct.Mat3X3) align 4 %in) // AMDGCN-LABEL: define %struct.Mat4X4 @foo([9 x i32] %in.coerce) Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { Mat4X4 out; @@ -63,8 +63,8 @@ kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { out[0] = foo(in[1]); } -// X86-LABEL: define void @foo_large(%struct.Mat64X64* noalias sret %agg.result, %struct.Mat32X32* byval(%struct.Mat32X32) align 4 %in) -// AMDGCN-LABEL: define void @foo_large(%struct.Mat64X64 addrspace(5)* noalias sret %agg.result, %struct.Mat32X32 addrspace(5)* byval(%struct.Mat32X32) align 4 %in) +// X86-LABEL: define void @foo_large(%struct.Mat64X64* noalias sret align 4 %agg.result, %struct.Mat32X32* byval(%struct.Mat32X32) align 4 %in) +// AMDGCN-LABEL: define void @foo_large(%struct.Mat64X64 addrspace(5)* noalias sret align 4 %agg.result, %struct.Mat32X32 addrspace(5)* byval(%struct.Mat32X32) align 4 %in) Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { Mat64X64 out; return out; diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl index 0a7f289cb2f7c..fd46d3cce22e4 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-coerce.cl @@ -404,14 +404,14 @@ struct_arr16 func_ret_struct_arr16() return s; } -// CHECK: define void @func_ret_struct_arr32(%struct.struct_arr32 addrspace(5)* noalias nocapture sret %agg.result) +// CHECK: define void @func_ret_struct_arr32(%struct.struct_arr32 addrspace(5)* noalias nocapture sret align 4 %agg.result) struct_arr32 
func_ret_struct_arr32() { struct_arr32 s = { 0 }; return s; } -// CHECK: define void @func_ret_struct_arr33(%struct.struct_arr33 addrspace(5)* noalias nocapture sret %agg.result) +// CHECK: define void @func_ret_struct_arr33(%struct.struct_arr33 addrspace(5)* noalias nocapture sret align 4 %agg.result) struct_arr33 func_ret_struct_arr33() { struct_arr33 s = { 0 }; @@ -440,7 +440,7 @@ different_size_type_pair func_different_size_type_pair_ret() return s; } -// CHECK: define void @func_flexible_array_ret(%struct.flexible_array addrspace(5)* noalias nocapture sret %agg.result) +// CHECK: define void @func_flexible_array_ret(%struct.flexible_array addrspace(5)* noalias nocapture sret align 4 %agg.result) flexible_array func_flexible_array_ret() { flexible_array s = { 0 }; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 85e921cbe12a1..0aa3e4144c525 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -596,7 +596,7 @@ kernel void test_mbcnt_hi(global uint* out, uint src0, uint src1) { } // CHECK-LABEL: @test_alignbit( -// CHECK: tail call i32 @llvm.amdgcn.alignbit(i32 %src0, i32 %src1, i32 %src2) +// CHECK: tail call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2) kernel void test_alignbit(global uint* out, uint src0, uint src1, uint src2) { *out = __builtin_amdgcn_alignbit(src0, src1, src2); } diff --git a/clang/test/CodeGenOpenCLCXX/addrspace-of-this.cl b/clang/test/CodeGenOpenCLCXX/addrspace-of-this.cl index 07e3b0b7314ea..16495d38b9421 100644 --- a/clang/test/CodeGenOpenCLCXX/addrspace-of-this.cl +++ b/clang/test/CodeGenOpenCLCXX/addrspace-of-this.cl @@ -114,7 +114,7 @@ __kernel void test__global() { // Test the address space of 'this' when invoking the operator+ // COMMON: [[C1GEN:%[.a-z0-9]+]] = addrspacecast %class.C* %c1 to %class.C addrspace(4)* // COMMON: [[C2GEN:%[.a-z0-9]+]] = addrspacecast %class.C* %c2 to %class.C addrspace(4)* -// COMMON: call spir_func void @_ZNU3AS41CplERU3AS4KS_(%class.C* sret %c3, %class.C addrspace(4)* [[C1GEN]], %class.C addrspace(4)* dereferenceable(4) [[C2GEN]]) +// COMMON: call spir_func void @_ZNU3AS41CplERU3AS4KS_(%class.C* sret align 4 %c3, %class.C addrspace(4)* [[C1GEN]], %class.C addrspace(4)* dereferenceable(4) [[C2GEN]]) // Test the address space of 'this' when invoking the move constructor // COMMON: [[C4GEN:%[.a-z0-9]+]] = addrspacecast %class.C* %c4 to %class.C addrspace(4)* @@ -134,7 +134,7 @@ __kernel void test__global() { // Tests address space of inline members //COMMON: @_ZNU3AS41C3getEv(%class.C addrspace(4)* %this) -//COMMON: @_ZNU3AS41CplERU3AS4KS_(%class.C* noalias sret %agg.result, %class.C addrspace(4)* %this +//COMMON: @_ZNU3AS41CplERU3AS4KS_(%class.C* noalias sret align 4 %agg.result, %class.C addrspace(4)* %this #define TEST(AS) \ __kernel void test##AS() { \ AS C c; \ diff --git a/clang/test/CodeGenSYCL/address-space-of-returns.cpp b/clang/test/CodeGenSYCL/address-space-of-returns.cpp index de251fe18c565..69104c261ddf7 100644 --- a/clang/test/CodeGenSYCL/address-space-of-returns.cpp +++ b/clang/test/CodeGenSYCL/address-space-of-returns.cpp @@ -26,7 +26,7 @@ A ret_agg() { A a; return a; } -// CHECK: define spir_func void @{{.*}}ret_agg{{.*}}(%struct.{{.*}}.A addrspace(4)* noalias sret %agg.result) +// CHECK: define spir_func void @{{.*}}ret_agg{{.*}}(%struct.{{.*}}.A addrspace(4)* {{.*}} %agg.result) template __attribute__((sycl_kernel)) void kernel_single_task(Func kernelFunc) { diff --git 
a/clang/test/CodeGenSYCL/unique-stable-name.cpp b/clang/test/CodeGenSYCL/unique-stable-name.cpp index 4360d9e3b4b7d..962ce06257986 100644 --- a/clang/test/CodeGenSYCL/unique-stable-name.cpp +++ b/clang/test/CodeGenSYCL/unique-stable-name.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsycl -fsycl-is-device -triple spir64-unknown-unknown-sycldevice -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple spir64-unknown-unknown-sycldevice -fsycl -fsycl-is-device -disable-llvm-passes -emit-llvm %s -o - | FileCheck %s // CHECK: @[[INT:[^\w]+]] = private unnamed_addr constant [[INT_SIZE:\[[0-9]+ x i8\]]] c"_ZTSi\00" // CHECK: @[[LAMBDA_X:[^\w]+]] = private unnamed_addr constant [[LAMBDA_X_SIZE:\[[0-9]+ x i8\]]] c"_ZTSZZ4mainENKUlvE42->5clEvEUlvE46->16\00" // CHECK: @[[MACRO_X:[^\w]+]] = private unnamed_addr constant [[MACRO_SIZE:\[[0-9]+ x i8\]]] c"_ZTSZZ4mainENKUlvE42->5clEvEUlvE52->7~28->18\00" @@ -12,7 +12,7 @@ extern "C" void printf(const char *) {} template void template_param() { - printf(__unique_stable_name(T)); + printf(__builtin_unique_stable_name(T)); } template @@ -21,31 +21,31 @@ T getT() { return T{}; } template void lambda_in_dependent_function() { auto y = [] {}; - printf(__unique_stable_name(y)); + printf(__builtin_unique_stable_name(y)); } #define DEF_IN_MACRO() \ auto MACRO_X = []() {};auto MACRO_Y = []() {}; \ - printf(__unique_stable_name(MACRO_X)); \ - printf(__unique_stable_name(MACRO_Y)); + printf(__builtin_unique_stable_name(MACRO_X)); \ + printf(__builtin_unique_stable_name(MACRO_Y)); #define MACRO_CALLS_MACRO() \ {DEF_IN_MACRO();}{DEF_IN_MACRO();} template -__attribute__((sycl_kernel)) void kernel_single_task(KernelType kernelFunc) { +[[clang::sycl_kernel]] void kernel_single_task(KernelType kernelFunc) { kernelFunc(); } int main() { kernel_single_task( []() { - printf(__unique_stable_name(int)); + printf(__builtin_unique_stable_name(int)); // CHECK: call spir_func void @printf(i8 addrspace(4)* addrspacecast (i8* getelementptr inbounds ([[INT_SIZE]], [[INT_SIZE]]* @[[INT]] auto x = [](){}; - printf(__unique_stable_name(x)); - printf(__unique_stable_name(decltype(x))); + printf(__builtin_unique_stable_name(x)); + printf(__builtin_unique_stable_name(decltype(x))); // CHECK: call spir_func void @printf(i8 addrspace(4)* addrspacecast (i8* getelementptr inbounds ([[LAMBDA_X_SIZE]], [[LAMBDA_X_SIZE]]* @[[LAMBDA_X]] // CHECK: call spir_func void @printf(i8 addrspace(4)* addrspacecast (i8* getelementptr inbounds ([[LAMBDA_X_SIZE]], [[LAMBDA_X_SIZE]]* @[[LAMBDA_X]] diff --git a/clang/test/Driver/hip-device-compile.hip b/clang/test/Driver/hip-device-compile.hip index b6f1f1cf218e6..c442d23581cef 100644 --- a/clang/test/Driver/hip-device-compile.hip +++ b/clang/test/Driver/hip-device-compile.hip @@ -42,7 +42,7 @@ // CHECK-NOT: {{"*.llvm-link"}} // CHECK-NOT: {{".*opt"}} // CHECK-NOT: {{".*llc"}} -// CHECK-NOT: {{".*lld"}} +// CHECK-NOT: {{".*lld.*"}} // CHECK-NOT: {{".*clang-offload-bundler"}} // CHECK-NOT: {{".*ld.*"}} @@ -67,6 +67,6 @@ // BUNDLE: {{"*.llvm-link"}} // BUNDLE: {{".*opt"}} // BUNDLE: {{".*llc"}} -// BUNDLE: {{".*lld"}} +// BUNDLE: {{".*lld.*"}} // BUNDLE: {{".*clang-offload-bundler"}} diff --git a/clang/test/Driver/hip-options.hip b/clang/test/Driver/hip-options.hip index 59afa3fdb2d7b..a7a6e02a3c81c 100644 --- a/clang/test/Driver/hip-options.hip +++ b/clang/test/Driver/hip-options.hip @@ -13,3 +13,16 @@ // RUN: -mllvm -amdgpu-early-inline-all=true %s 2>&1 | \ // RUN: FileCheck -check-prefix=MLLVM %s // MLLVM-NOT: 
"-mllvm"{{.*}}"-amdgpu-early-inline-all=true"{{.*}}"-mllvm"{{.*}}"-amdgpu-early-inline-all=true" + +// RUN: %clang -### -Xarch_device -g -nogpulib --cuda-gpu-arch=gfx900 \ +// RUN: -Xarch_device -fcf-protection=branch \ +// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=DEV %s +// DEV: clang{{.*}} "-fcuda-is-device" {{.*}} "-debug-info-kind={{.*}}" {{.*}} "-fcf-protection=branch" +// DEV: clang{{.*}} "-fcuda-is-device" {{.*}} "-debug-info-kind={{.*}}" {{.*}} "-fcf-protection=branch" +// DEV-NOT: clang{{.*}} {{.*}} "-debug-info-kind={{.*}}" + +// RUN: %clang -### -Xarch_host -g -nogpulib --cuda-gpu-arch=gfx900 \ +// RUN: --cuda-gpu-arch=gfx906 %s 2>&1 | FileCheck -check-prefix=HOST %s +// HOST-NOT: clang{{.*}} "-fcuda-is-device" {{.*}} "-debug-info-kind={{.*}}" +// HOST-NOT: clang{{.*}} "-fcuda-is-device" {{.*}} "-debug-info-kind={{.*}}" +// HOST: clang{{.*}} "-debug-info-kind={{.*}}" diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip index cda852e2d8c76..4371334577f7c 100644 --- a/clang/test/Driver/hip-toolchain-no-rdc.hip +++ b/clang/test/Driver/hip-toolchain-no-rdc.hip @@ -38,7 +38,7 @@ // CHECK-SAME: "-filetype=obj" // CHECK-SAME: "-o" [[OBJ_DEV_A_803:".*-gfx803-.*o"]] -// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "-shared" +// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-shared" // CHECK-SAME: "-o" "[[IMG_DEV_A_803:.*out]]" [[OBJ_DEV_A_803]] // @@ -67,7 +67,7 @@ // CHECK-SAME: "-filetype=obj" // CHECK-SAME: "-o" [[OBJ_DEV_A_900:".*-gfx900-.*o"]] -// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "-shared" +// CHECK: [[LLD]] "-flavor" "gnu" "-shared" // CHECK-SAME: "-o" "[[IMG_DEV_A_900:.*out]]" [[OBJ_DEV_A_900]] // @@ -112,7 +112,7 @@ // CHECK-SAME: "-filetype=obj" // CHECK-SAME: "-o" [[OBJ_DEV_B_803:".*-gfx803-.*o"]] -// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "-shared" +// CHECK: [[LLD]] "-flavor" "gnu" "-shared" // CHECK-SAME: "-o" "[[IMG_DEV_B_803:.*out]]" [[OBJ_DEV_B_803]] // @@ -141,7 +141,7 @@ // CHECK-SAME: "-filetype=obj" // CHECK-SAME: "-o" [[OBJ_DEV_B_900:".*-gfx900-.*o"]] -// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "-shared" +// CHECK: [[LLD]] "-flavor" "gnu" "-shared" // CHECK-SAME: "-o" "[[IMG_DEV_B_900:.*out]]" [[OBJ_DEV_B_900]] // diff --git a/clang/test/Driver/hip-toolchain-rdc.hip b/clang/test/Driver/hip-toolchain-rdc.hip index 18fd1d7b09a0a..203784f2ab433 100644 --- a/clang/test/Driver/hip-toolchain-rdc.hip +++ b/clang/test/Driver/hip-toolchain-rdc.hip @@ -44,7 +44,7 @@ // CHECK-SAME: "-filetype=obj" // CHECK-SAME: "-o" [[OBJ_DEV1:".*-gfx803-.*o"]] -// CHECK: [[LLD: ".*lld"]] "-flavor" "gnu" "-shared" +// CHECK: [[LLD: ".*lld.*"]] "-flavor" "gnu" "-shared" // CHECK-SAME: "-o" "[[IMG_DEV1:.*out]]" [[OBJ_DEV1]] // CHECK: [[CLANG]] "-cc1" "-triple" "amdgcn-amd-amdhsa" diff --git a/clang/test/Driver/riscv-sdata-warning.c b/clang/test/Driver/riscv-sdata-warning.c new file mode 100644 index 0000000000000..7f1eeec6e99f5 --- /dev/null +++ b/clang/test/Driver/riscv-sdata-warning.c @@ -0,0 +1,8 @@ +// REQUIRES: riscv-registered-target +// RUN: %clang -S -target riscv32-unknown-elf -fpic -msmall-data-limit=8 %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-PIC-SDATA %s +// CHECK-PIC-SDATA: warning: ignoring '-msmall-data-limit=' with -mcmodel=large for -fpic or RV64 + +// RUN: %clang -S -target riscv64-unknown-elf -mcmodel=large -msmall-data-limit=8 %s 2>&1 \ +// RUN: | FileCheck -check-prefix=CHECK-RV64-LARGE-SDATA %s +// CHECK-RV64-LARGE-SDATA: warning: ignoring '-msmall-data-limit=' with -mcmodel=large for -fpic or 
RV64 diff --git a/clang/test/Driver/ropi-rwpi.c b/clang/test/Driver/ropi-rwpi.c index f22c8d0048916..7569108218390 100644 --- a/clang/test/Driver/ropi-rwpi.c +++ b/clang/test/Driver/ropi-rwpi.c @@ -21,6 +21,14 @@ // RUN: %clang -target arm-none-eabi -x c++ -fropi -frwpi -### -c %s 2>&1 | FileCheck --check-prefix=CXX %s // RUN: %clang -target arm-none-eabi -x c++ -fallow-unsupported -fropi -### -c %s 2>&1 | FileCheck --check-prefix=ROPI %s +// RUN: %clang -target arm-none-eabi -march=armv8m.main -fropi -mcmse -### -c %s 2>&1 | FileCheck --check-prefix=ROPI-CMSE %s +// RUN: %clang -target arm-none-eabi -march=armv8m.main -frwpi -mcmse -### -c %s 2>&1 | FileCheck --check-prefix=RWPI-CMSE %s +// RUN: %clang -target arm-none-eabi -march=armv8m.main -frwpi -fropi -mcmse -### -c %s 2>&1 | FileCheck --check-prefix=ROPI-CMSE --check-prefix=RWPI-CMSE %s + +// RUN: %clang -target arm-none-eabi -march=armv8m.main -frwpi -mcmse -fallow-unsupported -### -c %s 2>&1 | FileCheck --check-prefix=RWPI-CMSE-ALLOW-UNSUPPORTED --check-prefix=ROPI-CMSE-ALLOW-UNSUPPORTED %s +// RUN: %clang -target arm-none-eabi -march=armv8m.main -fropi -mcmse -fallow-unsupported -### -c %s 2>&1 | FileCheck --check-prefix=ROPI-CMSE-ALLOW-UNSUPPORTED %s +// RUN: %clang -target arm-none-eabi -march=armv8m.main -frwpi -fropi -mcmse -fallow-unsupported -### -c %s 2>&1 | FileCheck --check-prefix=ROPI-CMSE-ALLOW-UNSUPPORTED --check-prefix=RWPI-CMSE-ALLOW-UNSUPPORTED %s + // STATIC: "-mrelocation-model" "static" @@ -36,3 +44,8 @@ // PIC: error: embedded and GOT-based position independence are incompatible // CXX: error: ROPI is not compatible with c++ + +// ROPI-CMSE: error: cmse is not compatible with ROPI +// RWPI-CMSE: error: cmse is not compatible with RWPI +// ROPI-CMSE-ALLOW-UNSUPPORTED-NOT: error: cmse is not compatible with ROPI +// RWPI-CMSE-ALLOW-UNSUPPORTED-NOT: error: cmse is not compatible with RWPI diff --git a/clang/test/Driver/save-temps.c b/clang/test/Driver/save-temps.c index 29d1b7d9ac8d7..a26ba9f4ec0d3 100644 --- a/clang/test/Driver/save-temps.c +++ b/clang/test/Driver/save-temps.c @@ -1,3 +1,6 @@ +// REQUIRES: x86-registered-target +// REQUIRES: arm-registered-target + // RUN: %clang -target x86_64-apple-darwin -save-temps -arch x86_64 %s -### 2>&1 \ // RUN: | FileCheck %s // CHECK: "-o" "save-temps.i" @@ -82,3 +85,11 @@ // RUN: | FileCheck %s -check-prefix=CHECK-SAVE-TEMPS // CHECK-SAVE-TEMPS: "-cc1as" // CHECK-SAVE-TEMPS: "-dwarf-version={{.}}" + +// RUN: %clang --target=arm-arm-none-eabi -march=armv8-m.main -mcmse -save-temps -c -v %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefix=CHECK-SAVE-TEMPS-CMSE +// RUN: %clang --target=arm-arm-none-eabi -march=armv8-m.main -mcmse -x assembler -c -v %s -### 2>&1 \ +// RUN: | FileCheck %s -check-prefix=CHECK-SAVE-TEMPS-CMSE +// CHECK-SAVE-TEMPS-CMSE: -cc1as +// CHECK-SAVE-TEMPS-CMSE: +8msecext +// CHECK-SAVE-TEMPS-CMSE-NOT: '+cmse' is not a recognized feature for this target (ignoring feature) diff --git a/clang/test/Index/getcursor-recovery.cpp b/clang/test/Index/getcursor-recovery.cpp new file mode 100644 index 0000000000000..29966f26c8240 --- /dev/null +++ b/clang/test/Index/getcursor-recovery.cpp @@ -0,0 +1,16 @@ +int foo(int, int); +int foo(int, double); +int x; + +void testTypedRecoveryExpr() { + // Inner foo() is a RecoveryExpr, outer foo() is an overloaded call. 
+ foo(x, foo(x)); +} +// RUN: c-index-test -cursor-at=%s:7:3 %s -Xclang -frecovery-ast | FileCheck -check-prefix=OUTER-FOO %s +// OUTER-FOO: OverloadedDeclRef=foo[2:5, 1:5] +// RUN: c-index-test -cursor-at=%s:7:7 %s -Xclang -frecovery-ast | FileCheck -check-prefix=OUTER-X %s +// OUTER-X: DeclRefExpr=x:3:5 +// RUN: c-index-test -cursor-at=%s:7:10 %s -Xclang -frecovery-ast | FileCheck -check-prefix=INNER-FOO %s +// INNER-FOO: OverloadedDeclRef=foo[2:5, 1:5] +// RUN: c-index-test -cursor-at=%s:7:14 %s -Xclang -frecovery-ast | FileCheck -check-prefix=INNER-X %s +// INNER-X: DeclRefExpr=x:3:5 diff --git a/clang/test/Misc/pragma-attribute-supported-attributes-list.test b/clang/test/Misc/pragma-attribute-supported-attributes-list.test index d77a06819ae75..9132e55aa69ef 100644 --- a/clang/test/Misc/pragma-attribute-supported-attributes-list.test +++ b/clang/test/Misc/pragma-attribute-supported-attributes-list.test @@ -39,6 +39,7 @@ // CHECK-NEXT: Callback (SubjectMatchRule_function) // CHECK-NEXT: Capability (SubjectMatchRule_record, SubjectMatchRule_type_alias) // CHECK-NEXT: CarriesDependency (SubjectMatchRule_variable_is_parameter, SubjectMatchRule_objc_method, SubjectMatchRule_function) +// CHECK-NEXT: CmseNSEntry (SubjectMatchRule_function) // CHECK-NEXT: Cold (SubjectMatchRule_function) // CHECK-NEXT: Common (SubjectMatchRule_variable) // CHECK-NEXT: ConstInit (SubjectMatchRule_variable_is_global) diff --git a/clang/test/Modules/ExtDebugInfo.m b/clang/test/Modules/ExtDebugInfo.m index 41247b00a49f8..380bc4c9bb983 100644 --- a/clang/test/Modules/ExtDebugInfo.m +++ b/clang/test/Modules/ExtDebugInfo.m @@ -6,13 +6,16 @@ // RUN: -fmodule-format=obj -fimplicit-module-maps -DMODULES \ // RUN: -fmodules-cache-path=%t %s -I %S/Inputs -I %t -emit-llvm -o %t-mod.ll // RUN: cat %t-mod.ll | FileCheck %s +// RUN: cat %t-mod.ll | FileCheck %s --check-prefix=DWOID // PCH: // RUN: %clang_cc1 -x objective-c -fmodule-format=obj -emit-pch -I%S/Inputs \ // RUN: -o %t.pch %S/Inputs/DebugObjC.h -// RUN: %clang_cc1 -x objective-c -debug-info-kind=limited -dwarf-ext-refs -fmodule-format=obj \ +// RUN: %clang_cc1 -x objective-c -debug-info-kind=limited -dwarf-ext-refs \ +// RUN: -fmodule-format=obj \ // RUN: -include-pch %t.pch %s -emit-llvm -o %t-pch.ll %s // RUN: cat %t-pch.ll | FileCheck %s +// RUN: cat %t-pch.ll | FileCheck %s --check-prefix=DWOID #ifdef MODULES @import DebugObjC; @@ -34,6 +37,8 @@ int foo(ObjCClass *c) { return [c property]; } +// DWOID: !DICompileUnit(language: DW_LANG_ObjC,{{.*}}isOptimized: false,{{.*}}dwoId: + // CHECK: ![[MOD:.*]] = !DIModule(scope: null, name: "DebugObjC // CHECK: !DIGlobalVariable(name: "GlobalUnion", diff --git a/clang/test/Modules/debug-info-moduleimport.m b/clang/test/Modules/debug-info-moduleimport.m index f07c6fce784d5..9dee9964b538e 100644 --- a/clang/test/Modules/debug-info-moduleimport.m +++ b/clang/test/Modules/debug-info-moduleimport.m @@ -1,11 +1,19 @@ // RUN: rm -rf %t -// RUN: %clang_cc1 -debug-info-kind=limited -fmodules -DGREETING="Hello World" -UNDEBUG -fimplicit-module-maps -fmodules-cache-path=%t %s -I %S/Inputs -isysroot /tmp/.. -I %t -emit-llvm -o - | FileCheck %s --check-prefix=NOIMPORT +// RUN: %clang_cc1 -debug-info-kind=limited -fmodules \ +// RUN: -DGREETING="Hello World" -UNDEBUG \ +// RUN: -fimplicit-module-maps -fmodules-cache-path=%t %s \ +// RUN: -I %S/Inputs -isysroot /tmp/.. 
-I %t -emit-llvm -o - \ +// RUN: | FileCheck %s --check-prefix=NOIMPORT // NOIMPORT-NOT: !DIImportedEntity // NOIMPORT-NOT: !DIModule // RUN: rm -rf %t -// RUN: %clang_cc1 -debug-info-kind=limited -fmodules -DGREETING="Hello World" -UNDEBUG -fimplicit-module-maps -fmodules-cache-path=%t %s -I %S/Inputs -isysroot /tmp/.. -I %t -emit-llvm -debugger-tuning=lldb -o - | FileCheck %s +// RUN: %clang_cc1 -debug-info-kind=limited -fmodules \ +// RUN: -DGREETING="Hello World" -UNDEBUG \ +// RUN: -fimplicit-module-maps -fmodules-cache-path=%t %s \ +// RUN: -I %S/Inputs -isysroot /tmp/.. -I %t -emit-llvm \ +// RUN: -debugger-tuning=lldb -o - | FileCheck %s // CHECK: ![[CU:.*]] = distinct !DICompileUnit // CHECK-SAME: sysroot: "/tmp/..") @@ -18,15 +26,18 @@ // CHECK-SAME: includePath: "{{.*}}test{{.*}}Modules{{.*}}Inputs" // CHECK: ![[F]] = !DIFile(filename: {{.*}}debug-info-moduleimport.m -// RUN: %clang_cc1 -debug-info-kind=limited -fmodules -fimplicit-module-maps -fmodules-cache-path=%t \ -// RUN: %s -I %S/Inputs -isysroot /tmp/.. -I %t -emit-llvm -o - \ -// RUN: | FileCheck %s --check-prefix=NO-SKEL-CHECK +// RUN: %clang_cc1 -debug-info-kind=limited -fmodules -fimplicit-module-maps \ +// RUN: -fmodules-cache-path=%t %s -I %S/Inputs -isysroot /tmp/.. -I %t \ +// RUN: -emit-llvm -o - | FileCheck %s --check-prefix=NO-SKEL-CHECK // NO-SKEL-CHECK: distinct !DICompileUnit // NO-SKEL-CHECK-NOT: distinct !DICompileUnit -// RUN: %clang_cc1 -debug-info-kind=limited -fmodules -fimplicit-module-maps -fmodules-cache-path=%t \ +// RUN: %clang_cc1 -debug-info-kind=limited -fmodules -fimplicit-module-maps \ +// RUN: -fmodules-cache-path=%t -fdebug-prefix-map=%t=/MODULE-CACHE \ // RUN: -fmodule-format=obj -dwarf-ext-refs \ // RUN: %s -I %S/Inputs -isysroot /tmp/.. -I %t -emit-llvm -o - \ // RUN: | FileCheck %s --check-prefix=SKEL-CHECK -// SKEL-CHECK: distinct !DICompileUnit -// SKEL-CHECK: distinct !DICompileUnit{{.*}}dwoId +// SKEL-CHECK: distinct !DICompileUnit({{.*}}file: ![[CUFILE:[0-9]+]] +// SKEL-CHECK: ![[CUFILE]] = !DIFile({{.*}}directory: "[[COMP_DIR:.*]]" +// SKEL-CHECK: distinct !DICompileUnit({{.*}}file: ![[DWOFILE:[0-9]+]]{{.*}}splitDebugFilename: "/MODULE-CACHE{{.*}}dwoId +// SKEL-CHECK: ![[DWOFILE]] = !DIFile({{.*}}directory: "[[COMP_DIR]]" diff --git a/clang/test/Modules/templates.mm b/clang/test/Modules/templates.mm index 78206a980a8fb..9d4e4b9d16173 100644 --- a/clang/test/Modules/templates.mm +++ b/clang/test/Modules/templates.mm @@ -125,7 +125,7 @@ void testWithAttributes() { // Check that returnNonTrivial doesn't return Class0 directly in registers. 
-// CHECK: declare void @_Z16returnNonTrivialv(%struct.Class0* sret) +// CHECK: declare void @_Z16returnNonTrivialv(%struct.Class0* sret align 8) @import template_nontrivial0; @import template_nontrivial1; diff --git a/clang/test/OpenMP/declare_target_messages.cpp b/clang/test/OpenMP/declare_target_messages.cpp index cc6558debde6d..1a371d699789f 100644 --- a/clang/test/OpenMP/declare_target_messages.cpp +++ b/clang/test/OpenMP/declare_target_messages.cpp @@ -162,17 +162,17 @@ namespace { #pragma omp declare target link(x) // expected-error {{'x' must not appear in both clauses 'to' and 'link'}} void bazz() {} -#pragma omp declare target to(bazz) device_type(nohost) // omp45-error {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} host5-note {{marked as 'device_type(nohost)' here}} +#pragma omp declare target to(bazz) device_type(nohost) // omp45-error {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} host5-note 3{{marked as 'device_type(nohost)' here}} void bazzz() {bazz();} #pragma omp declare target to(bazzz) device_type(nohost) // omp45-error {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} void any() {bazz();} // host5-error {{function with 'device_type(nohost)' is not available on host}} -void host1() {bazz();} -#pragma omp declare target to(host1) device_type(host) // omp45-error {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} dev5-note 2 {{marked as 'device_type(host)' here}} -void host2() {bazz();} +void host1() {bazz();} // host5-error {{function with 'device_type(nohost)' is not available on host}} +#pragma omp declare target to(host1) device_type(host) // omp45-error {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} dev5-note 4 {{marked as 'device_type(host)' here}} +void host2() {bazz();} //host5-error {{function with 'device_type(nohost)' is not available on host}} #pragma omp declare target to(host2) -void device() {host1();} +void device() {host1();} // dev5-error {{function with 'device_type(host)' is not available on device}} #pragma omp declare target to(device) device_type(nohost) // omp45-error {{unexpected 'device_type' clause, only 'to' or 'link' clauses expected}} host5-note 2 {{marked as 'device_type(nohost)' here}} -void host3() {host1();} +void host3() {host1();} // dev5-error {{function with 'device_type(host)' is not available on device}} #pragma omp declare target to(host3) #pragma omp declare target diff --git a/clang/test/OpenMP/nesting_of_regions.cpp b/clang/test/OpenMP/nesting_of_regions.cpp index 53656a2e669f7..d987d84c79e39 100644 --- a/clang/test/OpenMP/nesting_of_regions.cpp +++ b/clang/test/OpenMP/nesting_of_regions.cpp @@ -84,6 +84,11 @@ void foo() { } #pragma omp parallel { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp parallel + { #pragma omp taskwait bar(); } @@ -230,7 +235,7 @@ void foo() { // SIMD DIRECTIVE #pragma omp simd for (int i = 0; i < 10; ++i) { -#pragma omp for // expected-error {{OpenMP constructs may not be nested inside a simd region}} +#pragma omp for // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{OpenMP constructs may not be nested inside a simd region except for ordered simd, simd, scan, or atomic directive}} for (int i = 0; i < 10; ++i) ; } @@ -332,6 +337,11 @@ void foo() { } #pragma omp simd for (int i 
= 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} + bar(); + } +#pragma omp simd + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} bar(); } @@ -608,6 +618,11 @@ void foo() { } #pragma omp for for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} omp50-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} + bar(); + } +#pragma omp for + for (int i = 0; i < 10; ++i) { #pragma omp taskwait bar(); } @@ -861,6 +876,11 @@ void foo() { } #pragma omp for simd for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} + bar(); + } +#pragma omp for simd + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} bar(); } @@ -1152,6 +1172,10 @@ void foo() { } #pragma omp sections { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'sections' region}} + } +#pragma omp sections + { #pragma omp taskwait } #pragma omp sections @@ -1455,6 +1479,14 @@ void foo() { { #pragma omp section { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'section' region}} + bar(); + } + } +#pragma omp sections + { +#pragma omp section + { #pragma omp taskwait bar(); } @@ -1755,6 +1787,11 @@ void foo() { } #pragma omp single { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'single' region}} + bar(); + } +#pragma omp single + { #pragma omp taskwait bar(); } @@ -2030,6 +2067,11 @@ void foo() { } #pragma omp master { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'master' region}} + bar(); + } +#pragma omp master + { #pragma omp taskwait bar(); } @@ -2292,6 +2334,11 @@ void foo() { } #pragma omp critical { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'critical' region}} + bar(); + } +#pragma omp critical + { #pragma omp taskwait bar(); } @@ -2571,6 +2618,11 @@ void foo() { } #pragma omp parallel for for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'parallel for' region}} + bar(); + } +#pragma omp parallel for + for (int i = 0; i < 10; ++i) { #pragma omp taskwait bar(); } @@ -2840,6 +2892,11 @@ void foo() { } #pragma omp parallel for simd for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'parallel for simd' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp parallel for simd + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} bar(); } @@ -3120,6 +3177,11 @@ void foo() { } #pragma omp parallel master { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'parallel master' region}} + bar(); + } +#pragma omp parallel master + { #pragma omp taskwait bar(); } @@ -3383,6 +3445,10 @@ void foo() { } #pragma omp parallel sections 
{ +#pragma omp scan // expected-error {{region cannot be closely nested inside 'parallel sections' region}} + } +#pragma omp parallel sections + { #pragma omp taskwait } #pragma omp parallel sections @@ -3585,6 +3651,11 @@ void foo() { } #pragma omp task { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'task' region}} + bar(); + } +#pragma omp task + { #pragma omp taskwait bar(); } @@ -3848,6 +3919,11 @@ void foo() { } #pragma omp ordered { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'ordered' region}} + bar(); + } +#pragma omp ordered + { #pragma omp taskwait bar(); } @@ -4142,6 +4218,13 @@ void foo() { // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}} // expected-note@+1 {{expected an expression statement}} { +#pragma omp scan // expected-error {{OpenMP constructs may not be nested inside an atomic region}} + bar(); + } +#pragma omp atomic + // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}} + // expected-note@+1 {{expected an expression statement}} + { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside an atomic region}} bar(); } @@ -4406,6 +4489,11 @@ void foo() { } #pragma omp target { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'target' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target + { #pragma omp taskwait bar(); } @@ -4640,6 +4728,11 @@ void foo() { } #pragma omp target parallel { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target parallel + { #pragma omp taskwait bar(); } @@ -4906,6 +4999,11 @@ void foo() { } #pragma omp target parallel for for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'target parallel for' region}} + bar(); + } +#pragma omp target parallel for + for (int i = 0; i < 10; ++i) { #pragma omp taskwait bar(); } @@ -5140,6 +5238,12 @@ void foo() { bar(); } #pragma omp target +#pragma omp teams + { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target #pragma omp teams { #pragma omp taskwait // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp taskwait' directive into a parallel region?}} @@ -5439,6 +5543,11 @@ void foo() { } #pragma omp taskloop for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'taskloop' region}} + bar(); + } +#pragma omp taskloop + for (int i = 0; i < 10; ++i) { #pragma omp taskwait bar(); } @@ -5761,6 +5870,13 @@ void foo() { } #pragma omp target #pragma omp teams +#pragma omp distribute + for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'distribute' region; perhaps you forget to enclose 'omp scan' directive into a 
for, simd, or for simd region?}} + bar(); + } +#pragma omp target +#pragma omp teams #pragma omp distribute for (int i = 0; i < 10; ++i) { #pragma omp taskwait @@ -6073,6 +6189,13 @@ void foo() { } #pragma omp target #pragma omp teams +#pragma omp distribute parallel for + for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'distribute parallel for' region}} + bar(); + } +#pragma omp target +#pragma omp teams #pragma omp distribute parallel for for (int i = 0; i < 10; ++i) { #pragma omp taskwait @@ -6385,6 +6508,13 @@ void foo() { } #pragma omp target #pragma omp teams +#pragma omp distribute parallel for simd + for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'distribute parallel for simd' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target +#pragma omp teams #pragma omp distribute parallel for simd for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} @@ -6669,6 +6799,11 @@ void foo() { } #pragma omp target simd for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'target simd' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target simd + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} bar(); } @@ -6912,6 +7047,12 @@ void foo() { bar(); } #pragma omp target +#pragma omp teams distribute + for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target #pragma omp teams distribute for (int i = 0; i < 10; ++i) { #pragma omp taskwait // OK @@ -7175,6 +7316,12 @@ void foo() { bar(); } #pragma omp target +#pragma omp teams distribute + for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target #pragma omp teams distribute for (int i = 0; i < 10; ++i) { #pragma omp taskwait // OK @@ -7458,6 +7605,12 @@ void foo() { bar(); } #pragma omp target +#pragma omp teams distribute simd + for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'teams distribute simd' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target #pragma omp teams distribute simd for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} @@ -7741,6 +7894,12 @@ void foo() { bar(); } #pragma omp target +#pragma omp teams distribute parallel for simd + for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'teams distribute parallel for simd' 
region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target #pragma omp teams distribute parallel for simd for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} @@ -8024,6 +8183,12 @@ void foo() { bar(); } #pragma omp target +#pragma omp teams distribute parallel for + for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'teams distribute parallel for' region}} + bar(); + } +#pragma omp target #pragma omp teams distribute parallel for for (int i = 0; i < 10; ++i) { #pragma omp taskwait // OK @@ -8237,6 +8402,11 @@ void foo() { } #pragma omp target teams { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'target teams' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target teams + { #pragma omp taskwait // expected-error {{region cannot be closely nested inside 'target teams' region; perhaps you forget to enclose 'omp taskwait' directive into a parallel region?}} bar(); } @@ -8511,6 +8681,11 @@ void foo() { } #pragma omp target teams distribute for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'target teams distribute' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target teams distribute + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // OK bar(); } @@ -8754,6 +8929,11 @@ void foo() { } #pragma omp target teams distribute parallel for for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'target teams distribute parallel for' region}} + bar(); + } +#pragma omp target teams distribute parallel for + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // OK bar(); } @@ -8997,6 +9177,11 @@ void foo() { } #pragma omp target teams distribute parallel for simd for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'target teams distribute parallel for simd' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} bar(); } @@ -9240,6 +9425,11 @@ void foo() { } #pragma omp target teams distribute simd for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'target teams distribute simd' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target teams distribute simd + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} bar(); } @@ -9433,6 +9623,11 @@ void foo() { } #pragma omp parallel { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'parallel' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp parallel + { #pragma omp taskwait bar(); } @@ -9666,6 +9861,11 @@ void foo() { 
} #pragma omp simd for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} + bar(); + } +#pragma omp simd + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} bar(); } @@ -9916,6 +10116,11 @@ void foo() { } #pragma omp for for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{region cannot be closely nested inside 'for' region}} omp50-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} + bar(); + } +#pragma omp for + for (int i = 0; i < 10; ++i) { #pragma omp taskwait bar(); } @@ -10156,6 +10361,11 @@ void foo() { } #pragma omp for simd for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} + bar(); + } +#pragma omp for simd + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} bar(); } @@ -10405,6 +10615,11 @@ void foo() { } #pragma omp sections { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'sections' region}} + bar(); + } +#pragma omp sections + { #pragma omp taskwait } #pragma omp sections @@ -10698,6 +10913,14 @@ void foo() { { #pragma omp section { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'section' region}} + bar(); + } + } +#pragma omp sections + { +#pragma omp section + { #pragma omp taskwait bar(); } @@ -10991,6 +11214,11 @@ void foo() { } #pragma omp single { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'single' region}} + bar(); + } +#pragma omp single + { #pragma omp taskwait bar(); } @@ -11258,6 +11486,11 @@ void foo() { } #pragma omp master { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'master' region}} + bar(); + } +#pragma omp master + { #pragma omp taskwait bar(); } @@ -11513,6 +11746,11 @@ void foo() { } #pragma omp critical { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'critical' region}} + bar(); + } +#pragma omp critical + { #pragma omp taskwait bar(); } @@ -11797,6 +12035,11 @@ void foo() { } #pragma omp parallel for for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'parallel for' region}} + bar(); + } +#pragma omp parallel for + for (int i = 0; i < 10; ++i) { #pragma omp taskwait bar(); } @@ -12067,6 +12310,11 @@ void foo() { } #pragma omp parallel for simd for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'parallel for simd' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp parallel for simd + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} bar(); } @@ -12331,6 +12579,10 @@ void foo() { } #pragma omp parallel sections { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'parallel sections' region}} + } +#pragma omp parallel sections + { #pragma omp taskwait } #pragma omp parallel sections @@ -12532,6 +12784,11 @@ void foo() { } #pragma omp task { +#pragma omp scan 
// expected-error {{region cannot be closely nested inside 'task' region}} + bar(); + } +#pragma omp task + { #pragma omp taskwait bar(); } @@ -12804,6 +13061,13 @@ void foo() { // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}} // expected-note@+1 {{expected an expression statement}} { +#pragma omp scan // expected-error {{OpenMP constructs may not be nested inside an atomic region}} + bar(); + } +#pragma omp atomic + // expected-error@+2 {{the statement for 'atomic' must be an expression statement of form '++x;', '--x;', 'x++;', 'x--;', 'x binop= expr;', 'x = x binop expr' or 'x = expr binop x', where x is an l-value expression with scalar type}} + // expected-note@+1 {{expected an expression statement}} + { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside an atomic region}} bar(); } @@ -13067,6 +13331,11 @@ void foo() { } #pragma omp target { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'target' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target + { #pragma omp taskwait bar(); } @@ -13292,6 +13561,11 @@ void foo() { } #pragma omp target parallel { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'target parallel' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target parallel + { #pragma omp taskwait bar(); } @@ -13558,6 +13832,11 @@ void foo() { } #pragma omp target parallel for for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'target parallel for' region}} + bar(); + } +#pragma omp target parallel for + for (int i = 0; i < 10; ++i) { #pragma omp taskwait bar(); } @@ -13791,6 +14070,12 @@ void foo() { bar(); } #pragma omp target +#pragma omp teams + { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target #pragma omp teams { #pragma omp taskwait // expected-error {{region cannot be closely nested inside 'teams' region; perhaps you forget to enclose 'omp taskwait' directive into a parallel region?}} @@ -14094,6 +14379,11 @@ void foo() { } #pragma omp taskloop for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'taskloop' region}} + bar(); + } +#pragma omp taskloop + for (int i = 0; i < 10; ++i) { #pragma omp taskwait bar(); } @@ -14386,6 +14676,13 @@ void foo() { } #pragma omp target #pragma omp teams +#pragma omp distribute + for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'distribute' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target +#pragma omp teams #pragma omp distribute for (int i = 0; i < 10; ++i) { #pragma omp taskwait @@ -14708,6 +15005,13 @@ void foo() { } #pragma omp target #pragma omp teams +#pragma omp distribute parallel for + for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'distribute parallel for' region}} + bar(); + } +#pragma omp target +#pragma omp teams #pragma omp 
distribute parallel for for (int i = 0; i < 10; ++i) { #pragma omp taskwait @@ -15028,6 +15332,13 @@ void foo() { } #pragma omp target #pragma omp teams +#pragma omp distribute parallel for simd + for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'distribute parallel for simd' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target +#pragma omp teams #pragma omp distribute parallel for simd for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} @@ -15340,6 +15651,13 @@ void foo() { } #pragma omp target #pragma omp teams +#pragma omp distribute simd + for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'distribute simd' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target +#pragma omp teams #pragma omp distribute simd for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} @@ -15616,6 +15934,11 @@ void foo() { } #pragma omp target simd for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'target simd' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target simd + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} bar(); } @@ -15870,6 +16193,12 @@ void foo() { bar(); } #pragma omp target +#pragma omp teams distribute + for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'teams distribute' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target #pragma omp teams distribute for (int i = 0; i < 10; ++i) { #pragma omp taskwait // OK @@ -16153,6 +16482,12 @@ void foo() { bar(); } #pragma omp target +#pragma omp teams distribute simd + for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'teams distribute simd' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target #pragma omp teams distribute simd for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} @@ -16436,6 +16771,12 @@ void foo() { bar(); } #pragma omp target +#pragma omp teams distribute parallel for simd + for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'teams distribute parallel for simd' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target #pragma omp teams distribute parallel for simd for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a 
simd region}} @@ -16719,6 +17060,12 @@ void foo() { bar(); } #pragma omp target +#pragma omp teams distribute parallel for + for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'teams distribute parallel for' region}} + bar(); + } +#pragma omp target #pragma omp teams distribute parallel for for (int i = 0; i < 10; ++i) { #pragma omp taskwait // OK @@ -16932,6 +17279,11 @@ void foo() { } #pragma omp target teams { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'target teams' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target teams + { #pragma omp taskwait // expected-error {{region cannot be closely nested inside 'target teams' region; perhaps you forget to enclose 'omp taskwait' directive into a parallel region?}} bar(); } @@ -17206,6 +17558,11 @@ void foo() { } #pragma omp target teams distribute for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'target teams distribute' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target teams distribute + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // OK bar(); } @@ -17449,6 +17806,11 @@ void foo() { } #pragma omp target teams distribute parallel for for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{region cannot be closely nested inside 'target teams distribute parallel for' region}} + bar(); + } +#pragma omp target teams distribute parallel for + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // OK bar(); } @@ -17692,6 +18054,11 @@ void foo() { } #pragma omp target teams distribute parallel for simd for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'target teams distribute parallel for simd' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target teams distribute parallel for simd + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} bar(); } @@ -17935,6 +18302,11 @@ void foo() { } #pragma omp target teams distribute simd for (int i = 0; i < 10; ++i) { +#pragma omp scan // omp45-error {{OpenMP constructs may not be nested inside a simd region}} omp50-error {{region cannot be closely nested inside 'target teams distribute simd' region; perhaps you forget to enclose 'omp scan' directive into a for, simd, or for simd region?}} + bar(); + } +#pragma omp target teams distribute simd + for (int i = 0; i < 10; ++i) { #pragma omp taskwait // expected-error {{OpenMP constructs may not be nested inside a simd region}} bar(); } diff --git a/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp b/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp index 433ba13f73d60..faff77e0a43b7 100644 --- a/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp +++ b/clang/test/OpenMP/nvptx_target_exceptions_messages.cpp @@ -38,7 +38,7 @@ int d; #pragma omp end declare target int c; -int bar() { return 1 + foo() + bar() + baz1() + baz2(); } +int bar() { return 1 + foo() + bar() + baz1() + baz2(); } // expected-note {{called by 'bar'}} int maini1() { int a; @@ -49,7 +49,7 @@ int maini1() { { S s(a); static long aaa = 23; - a = foo() + bar() + b + c + d + aa + aaa 
+ FA(); + a = foo() + bar() + b + c + d + aa + aaa + FA(); // expected-note{{called by 'maini1'}} if (!a) throw "Error"; // expected-error {{cannot use 'throw' with exceptions disabled}} } diff --git a/clang/test/OpenMP/parallel_ast_print.cpp b/clang/test/OpenMP/parallel_ast_print.cpp index fa96dfce67fff..7ba40d8dc1cc3 100644 --- a/clang/test/OpenMP/parallel_ast_print.cpp +++ b/clang/test/OpenMP/parallel_ast_print.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=45 -ast-print %s | FileCheck %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -x c++ -std=c++11 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=45 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s // expected-no-diagnostics #ifndef HEADER @@ -89,7 +89,7 @@ class S8 : public S7 { #pragma omp parallel shared(a) shared(this->a) shared(S7 < S1 > ::a) for (int k = 0; k < a.a; ++k) ++this->a.a; -#pragma omp parallel reduction(^ : S7 < S1 > ::a) reduction(+ : S7 < S1 > ::b[ : S7 < S1 > ::a.a]) +#pragma omp parallel reduction(default, ^ : S7 < S1 > ::a) reduction(+ : S7 < S1 > ::b[ : S7 < S1 > ::a.a]) for (int k = 0; k < a.a; ++k) ++this->a.a; } @@ -113,7 +113,7 @@ class S8 : public S7 { // CHECK: #pragma omp parallel private(this->a) private(this->a) private(this->S7::a) // CHECK: #pragma omp parallel firstprivate(this->a) firstprivate(this->a) firstprivate(this->S7::a) // CHECK: #pragma omp parallel shared(this->a) shared(this->a) shared(this->S7::a) -// CHECK: #pragma omp parallel reduction(^: this->S7::a) reduction(+: this->S7::b[:this->S7::a.a]) +// CHECK: #pragma omp parallel reduction(default, ^: this->S7::a) reduction(+: this->S7::b[:this->S7::a.a]) // CHECK: #pragma omp parallel private(this->a) private(this->a) // CHECK: #pragma omp parallel firstprivate(this->a) firstprivate(this->a) // CHECK: #pragma omp parallel shared(this->a) shared(this->a) @@ -152,7 +152,7 @@ T tmain(T argc, T *argv) { a=2; #pragma omp parallel default(none), private(argc,b) firstprivate(argv) shared (d) if (parallel:argc > 0) num_threads(C) copyin(S::TS, thrp) proc_bind(master) reduction(+:c, arr1[argc]) reduction(max:e, arr[:C][0:10]) foo(); -#pragma omp parallel if (C) num_threads(s) proc_bind(close) reduction(^:e, f, arr[0:C][:argc]) reduction(&& : g) +#pragma omp parallel if (C) num_threads(s) proc_bind(close) reduction(^:e, f, arr[0:C][:argc]) reduction(default, && : g) foo(); return 0; } @@ -166,7 +166,7 @@ T tmain(T argc, T *argv) { // CHECK-NEXT: a = 2; // CHECK-NEXT: #pragma omp parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc 
> 0) num_threads(C) copyin(S::TS,thrp) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:C][0:10]) // CHECK-NEXT: foo() -// CHECK-NEXT: #pragma omp parallel if(C) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:C][:argc]) reduction(&&: g) +// CHECK-NEXT: #pragma omp parallel if(C) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:C][:argc]) reduction(default, &&: g) // CHECK-NEXT: foo() // CHECK: template<> int tmain(int argc, int *argv) { // CHECK-NEXT: int b = argc, c, d, e, f, g; @@ -177,7 +177,7 @@ T tmain(T argc, T *argv) { // CHECK-NEXT: a = 2; // CHECK-NEXT: #pragma omp parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(5) copyin(S::TS,thrp) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:5][0:10]) // CHECK-NEXT: foo() -// CHECK-NEXT: #pragma omp parallel if(5) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:5][:argc]) reduction(&&: g) +// CHECK-NEXT: #pragma omp parallel if(5) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:5][:argc]) reduction(default, &&: g) // CHECK-NEXT: foo() // CHECK: template<> long tmain(long argc, long *argv) { // CHECK-NEXT: long b = argc, c, d, e, f, g; @@ -188,7 +188,7 @@ T tmain(T argc, T *argv) { // CHECK-NEXT: a = 2; // CHECK-NEXT: #pragma omp parallel default(none) private(argc,b) firstprivate(argv) shared(d) if(parallel: argc > 0) num_threads(1) copyin(S::TS,thrp) proc_bind(master) reduction(+: c,arr1[argc]) reduction(max: e,arr[:1][0:10]) // CHECK-NEXT: foo() -// CHECK-NEXT: #pragma omp parallel if(1) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:1][:argc]) reduction(&&: g) +// CHECK-NEXT: #pragma omp parallel if(1) num_threads(s) proc_bind(close) reduction(^: e,f,arr[0:1][:argc]) reduction(default, &&: g) // CHECK-NEXT: foo() enum Enum { }; diff --git a/clang/test/OpenMP/parallel_reduction_codegen.cpp b/clang/test/OpenMP/parallel_reduction_codegen.cpp index 60bf358030764..eeea0384ac048 100644 --- a/clang/test/OpenMP/parallel_reduction_codegen.cpp +++ b/clang/test/OpenMP/parallel_reduction_codegen.cpp @@ -1,14 +1,14 @@ -// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s -// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -verify -fopenmp -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s -// RUN: %clang_cc1 -verify -fopenmp -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s - -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s -// RUN: %clang_cc1 -fopenmp-simd -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s -// RUN: %clang_cc1 -verify -fopenmp-simd -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple 
x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=LAMBDA %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck -check-prefix=BLOCKS %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple x86_64-apple-darwin10 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple x86_64-apple-darwin10 -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -DLAMBDA -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -fblocks -DBLOCKS -triple x86_64-apple-darwin10 -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s // SIMD-ONLY0-NOT: {{__kmpc|__tgt}} // expected-no-diagnostics #ifndef HEADER @@ -31,7 +31,7 @@ struct SS { int b : 4; int &c; SS(int &d) : a(0), b(0), c(d) { -#pragma omp parallel reduction(+: a, b, c) +#pragma omp parallel reduction(default, +: a, b, c) #ifdef LAMBDA [&]() { ++this->a, --b, (this)->c /= 1; @@ -91,7 +91,7 @@ struct SST { //CHECK: call void {{.+}}@__kmpc_fork_call( //CHECK: ret void void foo_array_sect(short x[1]) { -#pragma omp parallel reduction(+ : x[:]) +#pragma omp parallel reduction(default, + : x[:]) {} } diff --git a/clang/test/OpenMP/parallel_reduction_messages.c b/clang/test/OpenMP/parallel_reduction_messages.c index f88f8e0564953..61a3e9300436c 100644 --- a/clang/test/OpenMP/parallel_reduction_messages.c +++ b/clang/test/OpenMP/parallel_reduction_messages.c @@ -1,8 +1,19 @@ -// RUN: %clang_cc1 -verify -fopenmp -ferror-limit 150 -o - %s +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ferror-limit 150 -o - %s int incomplete[]; void test() { + int a; +#pragma omp parallel reduction( // expected-error {{expected identifier}} expected-error {{expected ')'}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-note {{to match this '('}} + ; +#pragma omp parallel reduction(unknown // expected-error {{expected expression}} expected-error {{expected ')'}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-note {{to match this '('}} + ; +#pragma omp parallel reduction(default, // expected-error {{expected identifier}} expected-error {{expected ')'}} expected-warning {{missing ':' after reduction identifier - ignoring}} expected-note {{to match this '('}} + ; +#pragma omp parallel reduction(unknown, +: a) // expected-error {{expected 'default' in OpenMP clause 'reduction'}} + ; +#pragma omp parallel reduction(default, + : a) + ; #pragma omp parallel reduction(+ : incomplete) // expected-error {{a reduction list item with incomplete type 'int []'}} ; } diff --git a/clang/test/OpenMP/scan_ast_print.cpp b/clang/test/OpenMP/scan_ast_print.cpp new file 
mode 100644 index 0000000000000..4b9eca6f7ec9f --- /dev/null +++ b/clang/test/OpenMP/scan_ast_print.cpp @@ -0,0 +1,50 @@ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ast-print %s | FileCheck %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -std=c++11 -include-pch %t -fsyntax-only -verify %s -ast-print | FileCheck %s +// expected-no-diagnostics + +#ifndef HEADER +#define HEADER + +void foo() {} + +template +T tmain(T argc) { + static T a; +#pragma omp for + for (int i = 0; i < 10; ++i) { +#pragma omp scan inclusive(a) + } + return a + argc; +} +// CHECK: static T a; +// CHECK-NEXT: #pragma omp for +// CHECK-NEXT: for (int i = 0; i < 10; ++i) { +// CHECK-NEXT: #pragma omp scan inclusive(a){{$}} +// CHECK: static int a; +// CHECK-NEXT: #pragma omp for +// CHECK-NEXT: for (int i = 0; i < 10; ++i) { +// CHECK-NEXT: #pragma omp scan inclusive(a) +// CHECK: static char a; +// CHECK-NEXT: #pragma omp for +// CHECK-NEXT: for (int i = 0; i < 10; ++i) { +// CHECK-NEXT: #pragma omp scan inclusive(a) + +int main(int argc, char **argv) { + static int a; +// CHECK: static int a; +#pragma omp for simd + for (int i = 0; i < 10; ++i) { +#pragma omp scan exclusive(a, argc) + } +// CHECK-NEXT: #pragma omp for simd +// CHECK-NEXT: for (int i = 0; i < 10; ++i) { +// CHECK-NEXT: #pragma omp scan exclusive(a,argc){{$}} + return tmain(argc) + tmain(argv[0][0]) + a; +} + +#endif diff --git a/clang/test/OpenMP/scan_messages.cpp b/clang/test/OpenMP/scan_messages.cpp new file mode 100644 index 0000000000000..9f093858c2490 --- /dev/null +++ b/clang/test/OpenMP/scan_messages.cpp @@ -0,0 +1,174 @@ +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -ferror-limit 100 %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -ferror-limit 100 %s + +template +T tmain(T argc) { +#pragma omp for + for (int i = 0; i < 10; ++i) { +#pragma omp scan // expected-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} + ; + } +#pragma omp for + for (int i = 0; i < 10; ++i) { +#pragma omp scan allocate(argc) // expected-error {{unexpected OpenMP clause 'allocate' in directive '#pragma omp scan'}} expected-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} +#pragma omp scan untied // expected-error {{unexpected OpenMP clause 'untied' in directive '#pragma omp scan'}} expected-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} +#pragma omp scan unknown // expected-warning {{extra tokens at the end of '#pragma omp scan' are ignored}} expected-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} + } +#pragma omp for simd + for (int i = 0; i < 10; ++i) + if (argc) +#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} + if (argc) { +#pragma omp scan inclusive(argc) + } +#pragma omp simd + for (int i = 0; i < 10; ++i) + while (argc) +#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} + while (argc) { +#pragma omp scan inclusive(argc) + } +#pragma omp simd + for (int i = 0; i < 10; ++i) + do +#pragma omp scan inclusive(argc) // 
expected-error {{'#pragma omp scan' cannot be an immediate substatement}} + while (argc) + ; +#pragma omp simd + for (int i = 0; i < 10; ++i) + do { +#pragma omp scan inclusive(argc) + } while (argc); +#pragma omp simd + for (int i = 0; i < 10; ++i) + switch (argc) +#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} + switch (argc) + case 1: +#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} + switch (argc) + case 1: { +#pragma omp scan inclusive(argc) + } +#pragma omp simd + for (int i = 0; i < 10; ++i) + switch (argc) { +#pragma omp scan exclusive(argc) // expected-note 2 {{previous 'scan' directive used here}} + case 1: +#pragma omp scan exclusive(argc) // expected-error {{exactly one 'scan' directive must appear in the loop body of an enclosing directive}} + break; + default: { +#pragma omp scan exclusive(argc) // expected-error {{exactly one 'scan' directive must appear in the loop body of an enclosing directive}} + } break; + } +#pragma omp simd + for (int i = 0; i < 10; ++i) + for (;;) +#pragma omp scan exclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} + for (;;) { +#pragma omp scan exclusive(argc) + } +#pragma omp simd + for (int i = 0; i < 10; ++i) { +label: +#pragma omp scan exclusive(argc) + } +#pragma omp simd + for (int i = 0; i < 10; ++i) { +label1 : { +#pragma omp scan inclusive(argc) +}} + + return T(); +} + +int main(int argc, char **argv) { +#pragma omp simd + for (int i = 0; i < 10; ++i) { +#pragma omp scan inclusive(argc) inclusive(argc) // expected-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} + ; + } +#pragma omp simd + for (int i = 0; i < 10; ++i) { +#pragma omp scan exclusive(argc) inclusive(argc) // expected-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} + ; + } +#pragma omp simd + for (int i = 0; i < 10; ++i) { +#pragma omp scan exclusive(argc) exclusive(argc) // expected-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} + ; + } +#pragma omp simd + for (int i = 0; i < 10; ++i) { +#pragma omp scan untied // expected-error {{unexpected OpenMP clause 'untied' in directive '#pragma omp scan'}} expected-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} +#pragma omp scan unknown // expected-warning {{extra tokens at the end of '#pragma omp scan' are ignored}} expected-error {{exactly one of 'inclusive' or 'exclusive' clauses is expected}} + } +#pragma omp simd + for (int i = 0; i < 10; ++i) + if (argc) +#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} + if (argc) { +#pragma omp scan inclusive(argc) + } +#pragma omp simd + for (int i = 0; i < 10; ++i) + while (argc) +#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} + while (argc) { +#pragma omp scan inclusive(argc) + } +#pragma omp simd + for (int i = 0; i < 10; ++i) + do +#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} + while (argc) + ; +#pragma omp simd + for (int i = 0; i < 10; ++i) + do { +#pragma omp scan exclusive(argc) + } while (argc); +#pragma omp simd + for (int i = 0; i < 10; ++i) + switch (argc) +#pragma omp scan exclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} + switch (argc) + case 1: +#pragma omp scan exclusive(argc) // 
expected-error {{'#pragma omp scan' cannot be an immediate substatement}} + switch (argc) + case 1: { +#pragma omp scan exclusive(argc) + } +#pragma omp simd + for (int i = 0; i < 10; ++i) + switch (argc) { +#pragma omp scan inclusive(argc) // expected-note 2 {{previous 'scan' directive used here}} + case 1: +#pragma omp scan inclusive(argc) // expected-error {{exactly one 'scan' directive must appear in the loop body of an enclosing directive}} + break; + default: { +#pragma omp scan inclusive(argc) // expected-error {{exactly one 'scan' directive must appear in the loop body of an enclosing directive}} + } break; + } +#pragma omp simd + for (int i = 0; i < 10; ++i) + for (;;) +#pragma omp scan inclusive(argc) // expected-error {{'#pragma omp scan' cannot be an immediate substatement}} + for (;;) { +#pragma omp scan inclusive(argc) + } +#pragma omp simd + for (int i = 0; i < 10; ++i) { +label: +#pragma omp scan inclusive(argc) + } +#pragma omp simd + for (int i = 0; i < 10; ++i) { +label1 : { +#pragma omp scan inclusive(argc) +} +} + + return tmain(argc); +} diff --git a/clang/test/OpenMP/target_device_codegen.cpp b/clang/test/OpenMP/target_device_codegen.cpp new file mode 100644 index 0000000000000..8117540d39396 --- /dev/null +++ b/clang/test/OpenMP/target_device_codegen.cpp @@ -0,0 +1,50 @@ +// Test host codegen. +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s + +// RUN: %clang_cc1 -verify -fopenmp-simd -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=powerpc64le-ibm-linux-gnu -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s +// SIMD-ONLY0-NOT: {{__kmpc|__tgt}} + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +void foo(int n) { + + // CHECK: [[N:%.+]] = load i32, i32* [[N_ADDR:%.+]], + // CHECK: store i32 [[N]], i32* [[DEVICE_CAP:%.+]], + // CHECK: [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]], + // CHECK: [[DEVICE:%.+]] = sext i32 [[DEV]] to i64 + // CHECK: [[RET:%.+]] = call i32 @__tgt_target(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 0, i8** null, i8** null, i64* null, i64* null) + // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] + // CHECK: [[FAIL]] + // CHECK: call void [[HVT0:@.+]]() + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target device(n) + ; + // CHECK: [[N:%.+]] = load i32, i32* [[N_ADDR]], + // CHECK: store i32 [[N]], i32* [[DEVICE_CAP:%.+]], + // CHECK: [[DEV:%.+]] = load i32, i32* [[DEVICE_CAP]], + // CHECK: [[DEVICE:%.+]] = sext i32 [[DEV]] to i64 + // CHECK: [[RET:%.+]] = call i32 @__tgt_target(i64 [[DEVICE]], i8* @{{[^,]+}}, i32 
0, i8** null, i8** null, i64* null, i64* null) + // CHECK-NEXT: [[ERROR:%.+]] = icmp ne i32 [[RET]], 0 + // CHECK-NEXT: br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]] + // CHECK: [[FAIL]] + // CHECK: call void [[HVT0:@.+]]() + // CHECK-NEXT: br label %[[END]] + // CHECK: [[END]] + #pragma omp target device(device_num: n) + ; + // CHECK-NOT: call i32 @__tgt_target( + // CHECK: call void @__omp_offloading_{{.+}}_l46() + // CHECK-NOT: call i32 @__tgt_target( + #pragma omp target device(ancestor: n) + ; +} + +#endif diff --git a/clang/test/OpenMP/target_update_from_messages.cpp b/clang/test/OpenMP/target_update_from_messages.cpp index eaf862a105827..141cfc38ffeb1 100644 --- a/clang/test/OpenMP/target_update_from_messages.cpp +++ b/clang/test/OpenMP/target_update_from_messages.cpp @@ -74,7 +74,7 @@ struct S8 { #pragma omp target update from(*(this->S->i+this->S->s6[0].pp)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update from(*(a+this->ptr)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update from(*(*(this->ptr)+a+this->ptr)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} -#pragma omp target update from(*(this+this)) // expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} expected-error {{invalid operands to binary expression ('S8 *' and 'S8 *')}} +#pragma omp target update from(*(this+this)) // expected-error {{invalid operands to binary expression ('S8 *' and 'S8 *')}} } }; @@ -198,8 +198,8 @@ int main(int argc, char **argv) { #pragma omp target update from(**(-(*offset)+BB+*m)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update from(**(*(*(&offset))+BB-*m)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update from(*(x+*(y+*(**BB+BBB)+s7.i))) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} -#pragma omp target update from(*(m+(m))) // expected-error {{invalid operands to binary expression ('int *' and 'int *')}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} -#pragma omp target update from(*(1+y+y)) // expected-error {{indirection requires pointer operand ('int' invalid)}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} +#pragma omp target update from(*(m+(m))) // expected-error {{invalid operands to binary expression ('int *' and 'int *')}} +#pragma omp target update 
from(*(1+y+y)) // expected-error {{indirection requires pointer operand ('int' invalid)}} #pragma omp target data map(to: s7.i) { #pragma omp target update from(s7.x) diff --git a/clang/test/OpenMP/target_update_to_messages.cpp b/clang/test/OpenMP/target_update_to_messages.cpp index 66ba9d3bb23f3..832adc0dd4d09 100644 --- a/clang/test/OpenMP/target_update_to_messages.cpp +++ b/clang/test/OpenMP/target_update_to_messages.cpp @@ -77,7 +77,7 @@ struct S8 { #pragma omp target update to(*(this->S->i+this->S->s6[0].pp)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update to(*(a+this->ptr)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update to(*(*(this->ptr)+a+this->ptr)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} -#pragma omp target update to(*(this+this)) // expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} expected-error {{invalid operands to binary expression ('S8 *' and 'S8 *')}} +#pragma omp target update to(*(this+this)) // expected-error {{invalid operands to binary expression ('S8 *' and 'S8 *')}} {} } }; @@ -205,8 +205,8 @@ int main(int argc, char **argv) { #pragma omp target update to(**(*offset+BB+*m)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update to(**(*(*(&offset))+BB+*m)) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} #pragma omp target update to(*(x+*(y+*(**BB+BBB)+s7.i))) // le45-error {{expected expression containing only member accesses and/or array sections based on named variables}} le45-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} -#pragma omp target update to(*(m+(m))) // expected-error {{invalid operands to binary expression ('int *' and 'int *')}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} -#pragma omp target update to(*(1+y+y)) // expected-error {{indirection requires pointer operand ('int' invalid)}} expected-error {{expected at least one 'to' clause or 'from' clause specified to '#pragma omp target update'}} +#pragma omp target update to(*(m+(m))) // expected-error {{invalid operands to binary expression ('int *' and 'int *')}} +#pragma omp target update to(*(1+y+y)) // expected-error {{indirection requires pointer operand ('int' invalid)}} {} return tmain(argc)+tmain(argc); // expected-note {{in instantiation of function template specialization 'tmain' requested here}} expected-note {{in instantiation of function template specialization 'tmain' requested here}} } diff --git a/clang/test/PCH/debug-info-pch-path.c 
b/clang/test/PCH/debug-info-pch-path.c index dcf7ed41f50ef..272d9ac1ab7cf 100644 --- a/clang/test/PCH/debug-info-pch-path.c +++ b/clang/test/PCH/debug-info-pch-path.c @@ -47,8 +47,8 @@ // CHECK-REL: ![[C]] = !DIFile({{.*}}directory: "[[DIR:.*]]" // CHECK-REL: !DICompileUnit( // CHECK-REL-SAME: file: ![[PCH:[0-9]+]] -// CHECK-REL-SAME: splitDebugFilename: "prefix.pch" -// CHECK-REL: ![[PCH]] = !DIFile({{.*}}directory: "[[DIR]]{{.*}}pchdir" +// CHECK-REL-SAME: splitDebugFilename: "pchdir{{.*}}prefix.pch" +// CHECK-REL: ![[PCH]] = !DIFile({{.*}}directory: "[[DIR]]" // --------------------------------------------------------------------- // Absolute PCH. diff --git a/clang/test/Parser/cxx-class.cpp b/clang/test/Parser/cxx-class.cpp index e672c45068da9..e73c8740a3efa 100644 --- a/clang/test/Parser/cxx-class.cpp +++ b/clang/test/Parser/cxx-class.cpp @@ -37,7 +37,9 @@ class C { virtual int vf5a(){0;}; // function definition, expected-warning {{unused}} virtual int vf6()(0); // expected-error +{{}} expected-note +{{}} virtual int vf7() = { 0 }; // expected-error {{does not look like a pure-specifier}} - + virtual int PR45267() = \ + 0; // ok, despite escaped newline + private: int x,f(),y,g(); inline int h(); diff --git a/clang/test/Parser/objcxx0x-lambda-expressions.mm b/clang/test/Parser/objcxx0x-lambda-expressions.mm index 0f3e9481a7220..396313816fe5d 100644 --- a/clang/test/Parser/objcxx0x-lambda-expressions.mm +++ b/clang/test/Parser/objcxx0x-lambda-expressions.mm @@ -11,7 +11,8 @@ void f() { []; // expected-error {{expected body of lambda expression}} [=,foo+] {}; // expected-error {{expected ',' or ']' in lambda capture list}} - [&this] {}; // expected-error {{cannot take the address of an rvalue of type 'C *'}} + [&this] {}; // expected-error {{cannot take the address of an rvalue of type 'C *'}} \ + // expected-error {{expected identifier}} [] {}; [=] (int i) {}; [&] (int) mutable -> void {}; @@ -24,7 +25,8 @@ void f() { [foo{bar}] () {}; [foo = {bar}] () {}; // expected-error {{}} - [foo(bar) baz] () {}; // expected-error {{called object type 'int' is not a function}} + [foo(bar) baz] () {}; // expected-error {{called object type 'int' is not a function}} \ + // expected-error {{expected ';'}} [foo(bar), baz] () {}; // ok [foo = bar baz]; // expected-warning {{receiver type 'int'}} expected-warning {{instance method '-baz'}} diff --git a/clang/test/Parser/objcxx11-invalid-lambda.cpp b/clang/test/Parser/objcxx11-invalid-lambda.cpp index bdb4e880fd0b7..221cae9b90709 100644 --- a/clang/test/Parser/objcxx11-invalid-lambda.cpp +++ b/clang/test/Parser/objcxx11-invalid-lambda.cpp @@ -1,10 +1,11 @@ // RUN: %clang_cc1 -fsyntax-only -verify -x objective-c++ -std=c++11 %s -void foo() { // expected-note {{to match this '{'}} +void foo() { int bar; auto baz = [ - bar( // expected-note {{to match this '('}} expected-note {{to match this '('}} + bar( // expected-note 2{{to match this '('}}\ + // expected-warning {{captures are a C++14 extension}} foo_undeclared() // expected-error{{use of undeclared identifier 'foo_undeclared'}} /* ) */ - ] () { }; // expected-error{{expected ')'}} -} // expected-error{{expected ')'}} expected-error {{expected ',' or ']'}} expected-error{{expected ';' at end of declaration}} expected-error{{expected '}'}} + ] () { }; // expected-error 2{{expected ')'}} +} \ No newline at end of file diff --git a/clang/test/Parser/switch-typo-correction.cpp b/clang/test/Parser/switch-typo-correction.cpp new file mode 100644 index 0000000000000..ebf1c18f2b86a --- /dev/null +++ 
b/clang/test/Parser/switch-typo-correction.cpp @@ -0,0 +1,9 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +namespace c { double xxx; } // expected-note{{'c::xxx' declared here}} +namespace d { float xxx; } +namespace z { namespace xxx {} } + +void crash() { + switch (xxx) {} // expected-error{{use of undeclared identifier 'xxx'; did you mean }} +} diff --git a/clang/test/ParserSYCL/unique-stable-name.cpp b/clang/test/ParserSYCL/unique-stable-name.cpp new file mode 100644 index 0000000000000..d1f1304cf8b45 --- /dev/null +++ b/clang/test/ParserSYCL/unique-stable-name.cpp @@ -0,0 +1,33 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -Wno-unused %s + +namespace NS{}; + +void f(int var) { + // expected-error@+1{{expected '(' after '__builtin_unique_stable_name'}} + __builtin_unique_stable_name int; + // expected-error@+1{{expected '(' after '__builtin_unique_stable_name'}} + __builtin_unique_stable_name {int}; + + __builtin_unique_stable_name(var); + // expected-error@+1{{use of undeclared identifier 'bad_var'}} + __builtin_unique_stable_name(bad_var); + // expected-error@+1{{use of undeclared identifier 'bad'}} + __builtin_unique_stable_name(bad::type); + // expected-error@+1{{no member named 'still_bad' in namespace 'NS'}} + __builtin_unique_stable_name(NS::still_bad); +} + +template +void f2() { + // expected-error@+1{{no member named 'bad_val' in 'S'}} + __builtin_unique_stable_name(T::bad_val); + // expected-error@+1{{no type named 'bad_type' in 'S'}} + __builtin_unique_stable_name(typename T::bad_type); +} + +struct S{}; + +void use() { + // expected-note@+1{{in instantiation of}} + f2(); +} diff --git a/clang/test/Preprocessor/hexagon-predefines.c b/clang/test/Preprocessor/hexagon-predefines.c index 5be8b96e290dc..54013ceffa645 100644 --- a/clang/test/Preprocessor/hexagon-predefines.c +++ b/clang/test/Preprocessor/hexagon-predefines.c @@ -101,3 +101,15 @@ // RUN: -target-feature +hvxv67 -target-feature +hvx-length128b %s | FileCheck \ // RUN: %s -check-prefix CHECK-ELF // CHECK-ELF: #define __ELF__ 1 + +// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-linux-musl \ +// RUN: -target-cpu hexagonv67 -target-feature +hvxv67 \ +// RUN: -target-feature +hvx-length128b %s | FileCheck \ +// RUN: %s -check-prefix CHECK-LINUX +// CHECK-LINUX: #define __gnu_linux__ 1 +// CHECK-LINUX: #define __linux 1 +// CHECK-LINUX: #define __linux__ 1 +// CHECK-LINUX: #define __unix 1 +// CHECK-LINUX: #define __unix__ 1 +// CHECK-LINUX: #define linux 1 +// CHECK-LINUX: #define unix 1 diff --git a/clang/test/Sema/arm-cde-immediates.c b/clang/test/Sema/arm-cde-immediates.c index bbc13668a2a14..bdf582e981468 100644 --- a/clang/test/Sema/arm-cde-immediates.c +++ b/clang/test/Sema/arm-cde-immediates.c @@ -4,37 +4,126 @@ #include void test_coproc_gcp_instr(int a) { - __builtin_arm_cdp(0, 2, 3, 4, 5, 6); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_cdp2(0, 2, 3, 4, 5, 6); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_mcr(0, 0, a, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_cdp(0, 2, 3, 4, 5, 6); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_cdp2(0, 2, 3, 4, 5, 6); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_mcr(0, 0, a, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}} __builtin_arm_mcr2(0, 0, a, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_mrc(0, 0, 13, 0, 3); // expected-error 
{{coprocessor 0 must be configured as GCP}} - __builtin_arm_mrc2(0, 0, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_mcrr(0, 0, a, 0); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_mcrr2(0, 0, a, 0); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_mrrc(0, 0, 0); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_mrrc2(0, 0, 0); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_ldc(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_ldcl(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_ldc2(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_ldc2l(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_stc(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_stcl(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_stc2(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} - __builtin_arm_stc2l(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_mrc(0, 0, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_mrc2(0, 0, 13, 0, 3); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_mcrr(0, 0, a, 0); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_mcrr2(0, 0, a, 0); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_mrrc(0, 0, 0); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_mrrc2(0, 0, 0); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_ldc(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_ldcl(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_ldc2(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_ldc2l(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_stc(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_stcl(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_stc2(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} + __builtin_arm_stc2l(0, 2, &a); // expected-error {{coprocessor 0 must be configured as GCP}} } void test_coproc(uint32_t a) { (void)__arm_cx1(0, 0); - __arm_cx1(a, 0); // expected-error {{argument to '__arm_cx1' must be a constant integer}} + __arm_cx1(a, 0); // expected-error {{argument to '__arm_cx1' must be a constant integer}} __arm_cx1(-1, 0); // expected-error {{argument value -1 is outside the valid range [0, 7]}} __arm_cx1(8, 0); // expected-error {{argument value 8 is outside the valid range [0, 7]}} - __arm_cx1(1, 0); // expected-error {{coprocessor 1 must be configured as CDE}} + __arm_cx1(1, 0); // expected-error {{coprocessor 1 must be configured as CDE}} } -void test_cx(uint32_t a) { +void test_cx(uint32_t a, uint64_t da, uint32_t n, uint32_t m) { (void)__arm_cx1(0, 0); - __arm_cx1(a, 0); // expected-error {{argument to '__arm_cx1' must be a constant integer}} - __arm_cx1(0, a); // expected-error {{argument to '__arm_cx1' must be a constant integer}} - __arm_cx1(0, 8192); // expected-error {{argument value 8192 
is outside the valid range [0, 8191]}} + __arm_cx1(0, a); // expected-error {{argument to '__arm_cx1' must be a constant integer}} + __arm_cx1(0, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}} + __arm_cx1a(0, a, a); // expected-error {{argument to '__arm_cx1a' must be a constant integer}} + __arm_cx1a(0, a, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}} + __arm_cx1d(0, a); // expected-error {{argument to '__arm_cx1d' must be a constant integer}} + __arm_cx1d(0, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}} + __arm_cx1da(0, da, a); // expected-error {{argument to '__arm_cx1da' must be a constant integer}} + __arm_cx1da(0, da, 8192); // expected-error {{argument value 8192 is outside the valid range [0, 8191]}} + + (void)__arm_cx2(0, n, 0); + __arm_cx2(0, n, a); // expected-error {{argument to '__arm_cx2' must be a constant integer}} + __arm_cx2(0, n, 512); // expected-error {{argument value 512 is outside the valid range [0, 511]}} + __arm_cx2a(0, a, n, a); // expected-error {{argument to '__arm_cx2a' must be a constant integer}} + __arm_cx2a(0, a, n, 512); // expected-error {{argument value 512 is outside the valid range [0, 511]}} + __arm_cx2d(0, n, a); // expected-error {{argument to '__arm_cx2d' must be a constant integer}} + __arm_cx2d(0, n, 512); // expected-error {{argument value 512 is outside the valid range [0, 511]}} + __arm_cx2da(0, da, n, a); // expected-error {{argument to '__arm_cx2da' must be a constant integer}} + __arm_cx2da(0, da, n, 512); // expected-error {{argument value 512 is outside the valid range [0, 511]}} + + (void)__arm_cx3(0, n, m, 0); + __arm_cx3(0, n, m, a); // expected-error {{argument to '__arm_cx3' must be a constant integer}} + __arm_cx3(0, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} + __arm_cx3a(0, a, n, m, a); // expected-error {{argument to '__arm_cx3a' must be a constant integer}} + __arm_cx3a(0, a, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} + __arm_cx3d(0, n, m, a); // expected-error {{argument to '__arm_cx3d' must be a constant integer}} + __arm_cx3d(0, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} + __arm_cx3da(0, da, n, m, a); // expected-error {{argument to '__arm_cx3da' must be a constant integer}} + __arm_cx3da(0, da, n, m, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} +} + +void test_vcxfp_u32(uint32_t a, uint32_t n, uint32_t m) { + (void)__arm_vcx1_u32(0, 0); + __arm_vcx1_u32(0, a); // expected-error {{argument to '__arm_vcx1_u32' must be a constant integer}} + __arm_vcx1_u32(0, 2048); // expected-error {{argument value 2048 is outside the valid range [0, 2047]}} + __arm_vcx1a_u32(0, a, a); // expected-error {{argument to '__arm_vcx1a_u32' must be a constant integer}} + __arm_vcx1a_u32(0, a, 2048); // expected-error {{argument value 2048 is outside the valid range [0, 2047]}} + + (void)__arm_vcx2_u32(0, n, 0); + __arm_vcx2_u32(0, n, a); // expected-error {{argument to '__arm_vcx2_u32' must be a constant integer}} + __arm_vcx2_u32(0, n, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} + __arm_vcx2a_u32(0, a, n, a); // expected-error {{argument to '__arm_vcx2a_u32' must be a constant integer}} + __arm_vcx2a_u32(0, a, n, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} + + (void)__arm_vcx3_u32(0, n, m, 0); + 
__arm_vcx3_u32(0, n, m, a); // expected-error {{argument to '__arm_vcx3_u32' must be a constant integer}} + __arm_vcx3_u32(0, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + __arm_vcx3a_u32(0, a, n, m, a); // expected-error {{argument to '__arm_vcx3a_u32' must be a constant integer}} + __arm_vcx3a_u32(0, a, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_vcxfp_u64(uint64_t a, uint64_t n, uint64_t m) { + (void)__arm_vcx1d_u64(0, 0); + __arm_vcx1d_u64(0, a); // expected-error {{argument to '__arm_vcx1d_u64' must be a constant integer}} + __arm_vcx1d_u64(0, 2048); // expected-error {{argument value 2048 is outside the valid range [0, 2047]}} + __arm_vcx1da_u64(0, a, a); // expected-error {{argument to '__arm_vcx1da_u64' must be a constant integer}} + __arm_vcx1da_u64(0, a, 2048); // expected-error {{argument value 2048 is outside the valid range [0, 2047]}} + + (void)__arm_vcx2d_u64(0, n, 0); + __arm_vcx2d_u64(0, n, a); // expected-error {{argument to '__arm_vcx2d_u64' must be a constant integer}} + __arm_vcx2d_u64(0, n, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} + __arm_vcx2da_u64(0, a, n, a); // expected-error {{argument to '__arm_vcx2da_u64' must be a constant integer}} + __arm_vcx2da_u64(0, a, n, 64); // expected-error {{argument value 64 is outside the valid range [0, 63]}} + + (void)__arm_vcx3d_u64(0, n, m, 0); + __arm_vcx3d_u64(0, n, m, a); // expected-error {{argument to '__arm_vcx3d_u64' must be a constant integer}} + __arm_vcx3d_u64(0, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + __arm_vcx3da_u64(0, a, n, m, a); // expected-error {{argument to '__arm_vcx3da_u64' must be a constant integer}} + __arm_vcx3da_u64(0, a, n, m, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} +} + +void test_vcxq(uint32_t a, uint8x16_t acc, float16x8_t n, int64x2_t m) { + (void)__arm_vcx1q_u8(0, 0); + __arm_vcx1q_u8(0, a); // expected-error {{argument to '__arm_vcx1q_u8' must be a constant integer}} + __arm_vcx1q_u8(0, 4096); // expected-error {{argument value 4096 is outside the valid range [0, 4095]}} + __arm_vcx1qa(0, acc, a); // expected-error {{argument to '__arm_vcx1qa' must be a constant integer}} + __arm_vcx1qa(0, acc, 4096); // expected-error {{argument value 4096 is outside the valid range [0, 4095]}} + + (void)__arm_vcx2q_u8(0, n, 0); + __arm_vcx2q_u8(0, n, a); // expected-error {{argument to '__arm_vcx2q_u8' must be a constant integer}} + __arm_vcx2q_u8(0, n, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}} + __arm_vcx2q(0, n, a); // expected-error {{argument to '__arm_vcx2q' must be a constant integer}} + __arm_vcx2q(0, n, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}} + __arm_vcx2qa(0, n, acc, a); // expected-error {{argument to '__arm_vcx2qa_impl' must be a constant integer}} + __arm_vcx2qa(0, n, acc, 128); // expected-error {{argument value 128 is outside the valid range [0, 127]}} + + (void)__arm_vcx3q_u8(0, n, m, 0); + __arm_vcx3q_u8(0, n, m, a); // expected-error {{argument to '__arm_vcx3q_u8_impl' must be a constant integer}} + __arm_vcx3q_u8(0, n, m, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + __arm_vcx3q(0, n, m, a); // expected-error {{argument to '__arm_vcx3q_impl' must be a constant integer}} + __arm_vcx3q(0, n, m, 16); // expected-error {{argument value 16 is outside the valid range 
[0, 15]}} + __arm_vcx3qa(0, n, m, acc, a); // expected-error {{argument to '__arm_vcx3qa_impl' must be a constant integer}} + __arm_vcx3qa(0, n, m, acc, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} } diff --git a/clang/test/Sema/arm-cmse.c b/clang/test/Sema/arm-cmse.c new file mode 100644 index 0000000000000..2148cc1aeb962 --- /dev/null +++ b/clang/test/Sema/arm-cmse.c @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -triple thumbv8m.base-none-eabi -mcmse -verify %s + +typedef void (*callback_ns_1t)() __attribute__((cmse_nonsecure_call)); +typedef void (*callback_1t)(); +typedef void (*callback_ns_2t)() __attribute__((cmse_nonsecure_call)); +typedef void (*callback_2t)(); + +void foo(callback_ns_1t nsfptr, // expected-error{{functions may not be declared with 'cmse_nonsecure_call' attribute}} + callback_1t fptr) __attribute__((cmse_nonsecure_call)) +{ + callback_1t fp1 = nsfptr; // expected-warning{{incompatible function pointer types initializing 'callback_1t'}} + callback_ns_1t fp2 = fptr; // expected-warning{{incompatible function pointer types initializing 'callback_ns_1t'}} + callback_2t fp3 = fptr; + callback_ns_2t fp4 = nsfptr; +} + +static void bar() __attribute__((cmse_nonsecure_entry)) // expected-warning{{'cmse_nonsecure_entry' cannot be applied to functions with internal linkage}} +{ +} + +typedef void nonsecure_fn_t(int) __attribute__((cmse_nonsecure_call)); +extern nonsecure_fn_t baz; // expected-error{{functions may not be declared with 'cmse_nonsecure_call' attribute}} + +int v0 __attribute__((cmse_nonsecure_call)); // expected-warning {{'cmse_nonsecure_call' only applies to function types; type here is 'int'}} +int v1 __attribute__((cmse_nonsecure_entry)); // expected-warning {{'cmse_nonsecure_entry' attribute only applies to functions}} + +void fn0() __attribute__((cmse_nonsecure_entry)); +void fn1() __attribute__((cmse_nonsecure_entry(1))); // expected-error {{'cmse_nonsecure_entry' attribute takes no arguments}} + +typedef void (*fn2_t)() __attribute__((cmse_nonsecure_call("abc"))); // expected-error {{'cmse_nonsecure_call' attribute takes no argument}} diff --git a/clang/test/Sema/arm-no-cmse.c b/clang/test/Sema/arm-no-cmse.c new file mode 100644 index 0000000000000..9b4baae56d1f0 --- /dev/null +++ b/clang/test/Sema/arm-no-cmse.c @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -triple thumbv8m.base-none-eabi -verify %s + +typedef void (*callback_ns_1t)() + __attribute__((cmse_nonsecure_call)); // expected-warning{{'cmse_nonsecure_call' attribute ignored}} + +void f() + __attribute__((cmse_nonsecure_entry)) {} // expected-warning{{'cmse_nonsecure_entry' attribute ignored}} diff --git a/clang/test/Sema/attr-noreturn.c b/clang/test/Sema/attr-noreturn.c index dab571064a22a..3d1e8d7f4079b 100644 --- a/clang/test/Sema/attr-noreturn.c +++ b/clang/test/Sema/attr-noreturn.c @@ -42,3 +42,34 @@ __attribute__((noreturn)) void f(__attribute__((noreturn)) void (*x)(void)) { } typedef void (*Fun)(void) __attribute__ ((noreturn(2))); // expected-error {{'noreturn' attribute takes no arguments}} + + +typedef void fn_t(void); + +fn_t *fp __attribute__((noreturn)); +void __attribute__((noreturn)) f6(int i) { + fp(); +} + +fn_t *fps[4] __attribute__((noreturn)); +void __attribute__((noreturn)) f7(int i) { + fps[i](); +} + +extern fn_t *ifps[] __attribute__((noreturn)); +void __attribute__((noreturn)) f8(int i) { + ifps[i](); +} + +void __attribute__((noreturn)) f9(int n) { + extern int g9(int, fn_t **); + fn_t *fp[n] __attribute__((noreturn)); + int i = g9(n, fp); + fp[i](); +} 
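// Illustration only: a hypothetical, self-contained sketch of the GNU extension
// exercised by f6 through f9 above. Attaching 'noreturn' to a function-pointer
// object means a call through that pointer is treated as never returning, so it
// can legally end a function that is itself declared 'noreturn'.
typedef void handler_t(void);
static handler_t *bail __attribute__((noreturn)); // assumption: set elsewhere to a handler that never returns
__attribute__((noreturn)) void fail_fast(void) {
  bail(); // accepted: the pointer carries 'noreturn', so control is assumed not to return here
}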
+ +typedef fn_t *fptrs_t[4]; +fptrs_t ps __attribute__((noreturn)); +void __attribute__((noreturn)) f10(int i) { + ps[i](); +} diff --git a/clang/test/Sema/builtins-memcpy-inline.c b/clang/test/Sema/builtins-memcpy-inline.cpp similarity index 86% rename from clang/test/Sema/builtins-memcpy-inline.c rename to clang/test/Sema/builtins-memcpy-inline.cpp index 6d0edce92a116..5e03a975a71ba 100644 --- a/clang/test/Sema/builtins-memcpy-inline.c +++ b/clang/test/Sema/builtins-memcpy-inline.cpp @@ -30,3 +30,9 @@ void test_memcpy_inline_null_buffer_is_ok_if_size_is_zero(void *ptr) { void test_memcpy_inline_non_constant_size(void *dst, const void *src, unsigned size) { __builtin_memcpy_inline(dst, src, size); // expected-error {{argument to '__builtin_memcpy_inline' must be a constant integer}} } + +template +void test_memcpy_inline_template(void *dst, const void *src) { + // we do not try to evaluate size in non intantiated templates. + __builtin_memcpy_inline(dst, src, size); +} diff --git a/clang/test/Sema/sizeless-1.c b/clang/test/Sema/sizeless-1.c index c823823fa00c9..8fe8e7b30cf6e 100644 --- a/clang/test/Sema/sizeless-1.c +++ b/clang/test/Sema/sizeless-1.c @@ -108,6 +108,8 @@ void func(int sel) { sel = local_int8; // expected-error {{assigning to 'int' from incompatible type 'svint8_t'}} + local_int8 = (svint8_t)local_int8; + local_int8 = (const svint8_t)local_int8; local_int8 = (svint8_t)local_int16; // expected-error {{used type 'svint8_t' (aka '__SVInt8_t') where arithmetic or pointer type is required}} local_int8 = (svint8_t)0; // expected-error {{used type 'svint8_t' (aka '__SVInt8_t') where arithmetic or pointer type is required}} sel = (int)local_int8; // expected-error {{operand of type 'svint8_t' (aka '__SVInt8_t') where arithmetic or pointer type is required}} @@ -126,6 +128,11 @@ void func(int sel) { const_volatile_int8 = local_int8; // expected-error {{cannot assign to variable 'const_volatile_int8' with const-qualified type 'const volatile svint8_t'}} + init_int8 = sel ? init_int8 : local_int8; + init_int8 = sel ? init_int8 : const_int8; + init_int8 = sel ? volatile_int8 : const_int8; + init_int8 = sel ? 
volatile_int8 : const_volatile_int8; + pass_int8(local_int8); pass_int8(local_int16); // expected-error {{passing 'svint16_t' (aka '__SVInt16_t') to parameter of incompatible type 'svint8_t'}} diff --git a/clang/test/SemaCUDA/bad-calls-on-same-line.cu b/clang/test/SemaCUDA/bad-calls-on-same-line.cu index 67923323a94fa..941452470dc7a 100644 --- a/clang/test/SemaCUDA/bad-calls-on-same-line.cu +++ b/clang/test/SemaCUDA/bad-calls-on-same-line.cu @@ -33,8 +33,8 @@ inline __host__ __device__ void hd() { void host_fn() { hd(); - hd(); // expected-note {{function template specialization 'hd'}} + hd(); // expected-note@-1 {{called by 'host_fn'}} - hd(); // expected-note {{function template specialization 'hd'}} + hd(); // expected-note@-1 {{called by 'host_fn'}} } diff --git a/clang/test/SemaCUDA/call-device-fn-from-host.cu b/clang/test/SemaCUDA/call-device-fn-from-host.cu index 5d506d65ea58c..4d66fccd84d53 100644 --- a/clang/test/SemaCUDA/call-device-fn-from-host.cu +++ b/clang/test/SemaCUDA/call-device-fn-from-host.cu @@ -1,7 +1,7 @@ // RUN: %clang_cc1 %s --std=c++11 -triple x86_64-unknown-linux -emit-llvm -o - \ // RUN: -verify -verify-ignore-unexpected=note // RUN: %clang_cc1 %s --std=c++11 -triple x86_64-unknown-linux -emit-llvm -o - \ -// RUN: -verify -verify-ignore-unexpected=note -fopenmp +// RUN: -verify=expected,omp -verify-ignore-unexpected=note -fopenmp // Note: This test won't work with -fsyntax-only, because some of these errors // are emitted during codegen. @@ -39,7 +39,7 @@ __host__ __device__ void T::hd3() { } template __host__ __device__ void hd2() { device_fn(); } -// expected-error@-1 2 {{reference to __device__ function 'device_fn' in __host__ __device__ function}} +// expected-error@-1 {{reference to __device__ function 'device_fn' in __host__ __device__ function}} void host_fn() { hd2(); } __host__ __device__ void hd() { device_fn(); } diff --git a/clang/test/SemaCUDA/call-host-fn-from-device.cu b/clang/test/SemaCUDA/call-host-fn-from-device.cu index c5bbd63d8e06c..acdd291b66457 100644 --- a/clang/test/SemaCUDA/call-host-fn-from-device.cu +++ b/clang/test/SemaCUDA/call-host-fn-from-device.cu @@ -56,14 +56,14 @@ __host__ __device__ void T::hd3() { } template __host__ __device__ void hd2() { host_fn(); } -// expected-error@-1 2 {{reference to __host__ function 'host_fn' in __host__ __device__ function}} +// expected-error@-1 {{reference to __host__ function 'host_fn' in __host__ __device__ function}} __global__ void kernel() { hd2(); } __host__ __device__ void hd() { host_fn(); } // expected-error@-1 {{reference to __host__ function 'host_fn' in __host__ __device__ function}} template __host__ __device__ void hd3() { host_fn(); } -// expected-error@-1 2 {{reference to __host__ function 'host_fn' in __host__ __device__ function}} +// expected-error@-1 {{reference to __host__ function 'host_fn' in __host__ __device__ function}} __device__ void device_fn() { hd3(); } // No error because this is never instantiated. 
diff --git a/clang/test/SemaCUDA/openmp-target.cu b/clang/test/SemaCUDA/openmp-target.cu index 2775dc1e2c5b8..c32aed44fb624 100644 --- a/clang/test/SemaCUDA/openmp-target.cu +++ b/clang/test/SemaCUDA/openmp-target.cu @@ -16,9 +16,9 @@ void bazz() {} void bazzz() {bazz();} #pragma omp declare target to(bazzz) device_type(nohost) void any() {bazz();} // expected-error {{function with 'device_type(nohost)' is not available on host}} -void host1() {bazz();} +void host1() {bazz();} // expected-error {{function with 'device_type(nohost)' is not available on host}} #pragma omp declare target to(host1) device_type(host) -void host2() {bazz();} +void host2() {bazz();} // expected-error {{function with 'device_type(nohost)' is not available on host}} #pragma omp declare target to(host2) void device() {host1();} #pragma omp declare target to(device) device_type(nohost) diff --git a/clang/test/SemaCUDA/trace-through-global.cu b/clang/test/SemaCUDA/trace-through-global.cu index f73570fa66458..0555afea02803 100644 --- a/clang/test/SemaCUDA/trace-through-global.cu +++ b/clang/test/SemaCUDA/trace-through-global.cu @@ -38,7 +38,7 @@ void launch_kernel() { // Notice that these two diagnostics are different: Because the call to hd1 // is not dependent on T, the call to hd1 comes from 'launch_kernel', while // the call to hd3, being dependent, comes from 'launch_kernel'. - hd1(); // expected-note {{called by 'launch_kernel'}} + hd1(); // expected-note {{called by 'launch_kernel'}} hd3(T()); // expected-note {{called by 'launch_kernel'}} } diff --git a/clang/test/SemaCXX/arm-cmse.cpp b/clang/test/SemaCXX/arm-cmse.cpp new file mode 100644 index 0000000000000..dbf97e2f4a3b4 --- /dev/null +++ b/clang/test/SemaCXX/arm-cmse.cpp @@ -0,0 +1,5 @@ +// RUN: %clang_cc1 -triple thumbv8m.base-none-eabi -mcmse -verify %s + +extern "C" void foo() __attribute__((cmse_nonsecure_entry)) {} + +void bar() __attribute__((cmse_nonsecure_entry)) {} // expected-error{{function type with 'cmse_nonsecure_entry' attribute must have C linkage}} diff --git a/clang/test/SemaCXX/builtins.cpp b/clang/test/SemaCXX/builtins.cpp index fbe2c457dad97..5306f4fad83b2 100644 --- a/clang/test/SemaCXX/builtins.cpp +++ b/clang/test/SemaCXX/builtins.cpp @@ -14,8 +14,8 @@ template int equal(const char *s1, const char *s2) { return Compare(s1, s2) == 0; } -// FIXME: Our error recovery here sucks -template int equal<&__builtin_strcmp>(const char*, const char*); // expected-error {{builtin functions must be directly called}} expected-error {{expected unqualified-id}} expected-error {{expected ')'}} expected-note {{to match this '('}} + +template int equal<&__builtin_strcmp>(const char*, const char*); // expected-error {{builtin functions must be directly called}} // PR13195 void f2() { diff --git a/clang/test/SemaCXX/cast-conversion.cpp b/clang/test/SemaCXX/cast-conversion.cpp index 4d5abfdcfb1e3..e48d918f889bc 100644 --- a/clang/test/SemaCXX/cast-conversion.cpp +++ b/clang/test/SemaCXX/cast-conversion.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -triple x86_64-unknown-unknown -verify %s -std=c++11 +// RUN: %clang_cc1 -fsyntax-only -triple x86_64-unknown-unknown -verify %s -std=c++11 -Wno-unused struct R { R(int); diff --git a/clang/test/SemaCXX/cxx1z-copy-omission.cpp b/clang/test/SemaCXX/cxx1z-copy-omission.cpp index a7133d79b463f..eceac810e72a5 100644 --- a/clang/test/SemaCXX/cxx1z-copy-omission.cpp +++ b/clang/test/SemaCXX/cxx1z-copy-omission.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -std=c++1z -verify %s +// RUN: %clang_cc1 -std=c++1z -verify 
-Wno-unused %s struct Noncopyable { Noncopyable(); @@ -107,8 +107,10 @@ void test_expressions(bool b) { sizeof(make_indestructible()); // expected-error {{deleted}} sizeof(make_incomplete()); // expected-error {{incomplete}} typeid(Indestructible{}); // expected-error {{deleted}} - typeid(make_indestructible()); // expected-error {{deleted}} - typeid(make_incomplete()); // expected-error {{incomplete}} + typeid(make_indestructible()); // expected-error {{deleted}} \ + // expected-error {{need to include }} + typeid(make_incomplete()); // expected-error {{incomplete}} \ + // expected-error {{need to include }} // FIXME: The first two cases here are now also valid in C++17 onwards. using I = decltype(Indestructible()); // expected-error {{deleted}} diff --git a/clang/test/SemaCXX/cxx2a-consteval.cpp b/clang/test/SemaCXX/cxx2a-consteval.cpp index 954f424f7d9ac..a1716b4fa8c33 100644 --- a/clang/test/SemaCXX/cxx2a-consteval.cpp +++ b/clang/test/SemaCXX/cxx2a-consteval.cpp @@ -260,6 +260,19 @@ auto l1 = [](int i) constexpr { } +namespace std { + +template struct remove_reference { using type = T; }; +template struct remove_reference { using type = T; }; +template struct remove_reference { using type = T; }; + +template +constexpr typename std::remove_reference::type&& move(T &&t) noexcept { + return static_cast::type &&>(t); +} + +} + namespace temporaries { struct A { @@ -295,12 +308,12 @@ void test() { { int k = const_a_ref(A()); } { int k = const_a_ref(a); } { int k = rvalue_ref(A()); } - { int k = rvalue_ref(static_cast(a)); } + { int k = rvalue_ref(std::move(a)); } { int k = const_a_ref(A().ret_a()); } { int k = const_a_ref(to_lvalue_ref(A().ret_a())); } - { int k = const_a_ref(to_lvalue_ref(static_cast(a))); } + { int k = const_a_ref(to_lvalue_ref(std::move(a))); } { int k = by_value_a(A().ret_a()); } - { int k = by_value_a(to_lvalue_ref(static_cast(a))); } + { int k = by_value_a(to_lvalue_ref(std::move(a))); } { int k = (A().ret_a(), A().ret_i()); } { int k = (const_a_ref(A().ret_a()), A().ret_i()); }// } @@ -353,10 +366,10 @@ void test() { { int k = const_a_ref(A()); } { int k = const_a_ref(a); } { int k = rvalue_ref(A()); } - { int k = rvalue_ref(static_cast(a)); } + { int k = rvalue_ref(std::move(a)); } { int k = const_a_ref(A().ret_a()); } { int k = const_a_ref(to_lvalue_ref(A().ret_a())); } - { int k = const_a_ref(to_lvalue_ref(static_cast(a))); } + { int k = const_a_ref(to_lvalue_ref(std::move(a))); } { int k = by_value_a(A().ret_a()); } { int k = by_value_a(to_lvalue_ref(static_cast(a))); } { int k = (A().ret_a(), A().ret_i()); }// expected-error {{is not a constant expression}} @@ -388,6 +401,27 @@ void test() { // expected-note@-1 {{is not a constant expression}} expected-note@-1 {{temporary created here}} } +struct S1 { + S1* ptr = nullptr; + consteval S1(int i) : ptr(this) { + if (this == ptr && i) + ptr = nullptr; + } + constexpr ~S1() {} +}; + +void test1() { + S1 s(1); + s = S1(1); + s = S1(0); // expected-error {{is not a constant expression}} + // expected-note@-1 {{is not a constant expression}} expected-note@-1 {{temporary created here}} +} + +} +namespace ctor { + +consteval int f_eval() { // expected-note+ {{declared here}} + return 0; } namespace std { @@ -441,3 +475,103 @@ namespace override { }; } } + +struct A { + int(*ptr)(); + consteval A(int(*p)() = nullptr) : ptr(p) {} +}; + +struct B { + int(*ptr)(); + B() : ptr(nullptr) {} + consteval B(int(*p)(), int) : ptr(p) {} +}; + +void test() { + { A a; } + { A a(&f_eval); } // expected-error {{is not a constant 
expression}} expected-note {{to a consteval}} + { B b(nullptr, 0); } + { B b(&f_eval, 0); } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { A a{}; } + { A a{&f_eval}; } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { B b{nullptr, 0}; } + { B b{&f_eval, 0}; } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { A a = A(); } + { A a = A(&f_eval); } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { B b = B(nullptr, 0); } + { B b = B(&f_eval, 0); } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { A a = A{}; } + { A a = A{&f_eval}; } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { B b = B{nullptr, 0}; } + { B b = B{&f_eval, 0}; } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { A a; a = A(); } + { A a; a = A(&f_eval); } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { B b; b = B(nullptr, 0); } + { B b; b = B(&f_eval, 0); } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { A a; a = A{}; } + { A a; a = A{&f_eval}; } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { B b; b = B{nullptr, 0}; } + { B b; b = B{&f_eval, 0}; } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { A* a; a = new A(); } + { A* a; a = new A(&f_eval); } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { B* b; b = new B(nullptr, 0); } + { B* b; b = new B(&f_eval, 0); } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { A* a; a = new A{}; } + { A* a; a = new A{&f_eval}; } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { B* b; b = new B{nullptr, 0}; } + { B* b; b = new B{&f_eval, 0}; } // expected-error {{is not a constant expression}} expected-note {{to a consteval}} +} + +} + +namespace copy_ctor { + +consteval int f_eval() { // expected-note+ {{declared here}} + return 0; +} + +struct Copy { + int(*ptr)(); + constexpr Copy(int(*p)() = nullptr) : ptr(p) {} + consteval Copy(const Copy&) = default; +}; + +constexpr const Copy &to_lvalue_ref(const Copy &&a) { + return a; +} + +void test() { + constexpr const Copy C; + // there is no the copy constructor call when its argument is a prvalue because of garanteed copy elision. + // so we need to test with both prvalue and xvalues. 
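// Illustration only (hypothetical, self-contained C++17 code, separate from the
// test): with a prvalue argument the copy constructor is never invoked because
// of guaranteed copy elision, which is why the cases below exercise both
// prvalues and xvalues.
struct Probe {
  Probe() = default;
  Probe(const Probe &) {} // reached only when the source is a glvalue
};
inline void elision_demo() {
  Probe a(Probe{});                        // prvalue source: guaranteed elision, no copy made
  Probe b(static_cast<Probe &&>(Probe{})); // xvalue source: the copy constructor runs
}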
+ { Copy c(C); } + { Copy c((Copy(&f_eval))); }// expected-error {{cannot take address of consteval}} + { Copy c(std::move(C)); } + { Copy c(std::move(Copy(&f_eval))); }// expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { Copy c(to_lvalue_ref((Copy(&f_eval)))); }// expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { Copy c(to_lvalue_ref(std::move(C))); } + { Copy c(to_lvalue_ref(std::move(Copy(&f_eval)))); }// expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { Copy c = Copy(C); } + { Copy c = Copy(Copy(&f_eval)); }// expected-error {{cannot take address of consteval}} + { Copy c = Copy(std::move(C)); } + { Copy c = Copy(std::move(Copy(&f_eval))); }// expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { Copy c = Copy(to_lvalue_ref(Copy(&f_eval))); }// expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { Copy c = Copy(to_lvalue_ref(std::move(C))); } + { Copy c = Copy(to_lvalue_ref(std::move(Copy(&f_eval)))); }// expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { Copy c; c = Copy(C); } + { Copy c; c = Copy(Copy(&f_eval)); }// expected-error {{cannot take address of consteval}} + { Copy c; c = Copy(std::move(C)); } + { Copy c; c = Copy(std::move(Copy(&f_eval))); }// expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { Copy c; c = Copy(to_lvalue_ref(Copy(&f_eval))); }// expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { Copy c; c = Copy(to_lvalue_ref(std::move(C))); } + { Copy c; c = Copy(to_lvalue_ref(std::move(Copy(&f_eval)))); }// expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { Copy* c; c = new Copy(C); } + { Copy* c; c = new Copy(Copy(&f_eval)); }// expected-error {{cannot take address of consteval}} + { Copy* c; c = new Copy(std::move(C)); } + { Copy* c; c = new Copy(std::move(Copy(&f_eval))); }// expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { Copy* c; c = new Copy(to_lvalue_ref(Copy(&f_eval))); }// expected-error {{is not a constant expression}} expected-note {{to a consteval}} + { Copy* c; c = new Copy(to_lvalue_ref(std::move(C))); } + { Copy* c; c = new Copy(to_lvalue_ref(std::move(Copy(&f_eval)))); }// expected-error {{is not a constant expression}} expected-note {{to a consteval}} +} + +} // namespace special_ctor diff --git a/clang/test/SemaCXX/decltype-crash.cpp b/clang/test/SemaCXX/decltype-crash.cpp index 1cebfcd72c861..1ffc525a27037 100644 --- a/clang/test/SemaCXX/decltype-crash.cpp +++ b/clang/test/SemaCXX/decltype-crash.cpp @@ -3,5 +3,8 @@ int& a(); void f() { - decltype(a()) c; // expected-warning {{'decltype' is a keyword in C++11}} expected-error {{use of undeclared identifier 'decltype'}} + decltype(a()) c; // expected-warning {{'decltype' is a keyword in C++11}} \ + // expected-error {{use of undeclared identifier 'decltype'}} \ + // expected-error {{expected ';' after expression}} \ + // expected-error {{use of undeclared identifier 'c'}} } diff --git a/clang/test/SemaCXX/pseudo-destructors.cpp b/clang/test/SemaCXX/pseudo-destructors.cpp index 0cd1390474320..b71b523de683e 100644 --- a/clang/test/SemaCXX/pseudo-destructors.cpp +++ b/clang/test/SemaCXX/pseudo-destructors.cpp @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s +// RUN: %clang_cc1 -emit-llvm-only -verify -std=c++11 %s struct A {}; enum Foo { F }; @@ -92,6 
+92,9 @@ namespace PR11339 { template using Id = T; void AliasTemplate(int *p) { p->~Id(); + p->template ~Id(); // expected-error {{'template' keyword not permitted in destructor name}} + (0).~Id(); + (0).template ~Id(); // expected-error {{'template' keyword not permitted in destructor name}} } namespace dotPointerAccess { diff --git a/clang/test/SemaCXX/varargs.cpp b/clang/test/SemaCXX/varargs.cpp index f2f53dc2001fd..625f1dce18f3d 100644 --- a/clang/test/SemaCXX/varargs.cpp +++ b/clang/test/SemaCXX/varargs.cpp @@ -22,7 +22,8 @@ void no_params(...) { // default ctor. void record_context(int a, ...) { struct Foo { - // expected-error@+1 {{'va_start' cannot be used outside a function}} + // expected-error@+2 {{'va_start' cannot be used outside a function}} + // expected-error@+1 {{default argument references parameter 'a'}} void meth(int a, int b = (__builtin_va_start(ap, a), 0)) {} }; } diff --git a/clang/test/SemaObjC/method-direct-one-definition.m b/clang/test/SemaObjC/method-direct-one-definition.m index e6355d2cb7bac..3dcf89d784cf1 100644 --- a/clang/test/SemaObjC/method-direct-one-definition.m +++ b/clang/test/SemaObjC/method-direct-one-definition.m @@ -30,6 +30,15 @@ @interface B (OtherCat) - (void)B_OtherCat __attribute__((objc_direct)); // expected-note {{previous declaration is here}} @end +@implementation B +- (void)B_primary { +} +- (void)B_extension { +} +- (void)B_implOnly __attribute__((objc_direct)) { // expected-note {{previous declaration is here}} +} +@end + @implementation B (Cat) - (void)B_primary { // expected-error {{direct method was declared in the primary interface but is implemented in a category}} } @@ -39,6 +48,8 @@ - (void)B_Cat { } - (void)B_OtherCat { // expected-error {{direct method was declared in a category but is implemented in a different category}} } +- (void)B_implOnly __attribute__((objc_direct)) { // expected-error {{direct method declaration conflicts with previous direct declaration of method 'B_implOnly'}} +} @end __attribute__((objc_root_class)) diff --git a/clang/test/SemaObjC/method-direct.m b/clang/test/SemaObjC/method-direct.m index 9aef9808abbda..80ca5b2e6ebe1 100644 --- a/clang/test/SemaObjC/method-direct.m +++ b/clang/test/SemaObjC/method-direct.m @@ -12,6 +12,7 @@ + (void)classProtoMethod __attribute__((objc_direct)); // expected-error {{'objc __attribute__((objc_root_class)) @interface Root +- (void)unavailableInChild; - (void)rootRegular; // expected-note {{previous declaration is here}} + (void)classRootRegular; // expected-note {{previous declaration is here}} - (void)rootDirect __attribute__((objc_direct)); // expected-note {{previous declaration is here}}; @@ -52,6 +53,7 @@ + (void)classRootCategoryDirect2 __attribute__((objc_direct)); // expected-note __attribute__((objc_direct_members)) @interface SubDirectMembers : Root @property int foo; // expected-note {{previous declaration is here}} +- (void)unavailableInChild __attribute__((unavailable)); // should not warn - (instancetype)init; @end @@ -81,6 +83,8 @@ + (void)classRootCategoryDirect2; // expected-error {{cannot override a method __attribute__((objc_direct_members)) @implementation Root +- (void)unavailableInChild { +} - (void)rootRegular { } + (void)classRootRegular { diff --git a/clang/test/SemaOpenCLCXX/address-space-references.cl b/clang/test/SemaOpenCLCXX/address-space-references.cl index 068318dfa1415..66cd1c02e32f9 100644 --- a/clang/test/SemaOpenCLCXX/address-space-references.cl +++ b/clang/test/SemaOpenCLCXX/address-space-references.cl @@ -11,7 +11,7 @@ int 
bar(const __global unsigned int &i); // expected-note{{passing argument to p int bar(const unsigned int &i); void foo() { - bar(1) // expected-error{{binding reference of type 'const __global unsigned int' to value of type 'int' changes address space}} + bar(1); // expected-error{{binding reference of type 'const __global unsigned int' to value of type 'int' changes address space}} } // Test addr space conversion with nested pointers diff --git a/clang/test/SemaSYCL/inline-asm.cpp b/clang/test/SemaSYCL/inline-asm.cpp index b2cd0240a824a..f1e5bccc2663d 100644 --- a/clang/test/SemaSYCL/inline-asm.cpp +++ b/clang/test/SemaSYCL/inline-asm.cpp @@ -2,6 +2,9 @@ // RUN: %clang_cc1 -fsycl -fsycl-is-device -fsyntax-only -verify %s -DLINUX_ASM -DSPIR_CHECK -triple spir64-unknown-unknown-sycldevice // RUN: %clang_cc1 -fsycl -fsycl-is-device -fsyntax-only -verify -triple x86_64-windows -fasm-blocks %s +// Invalid output constraint diagnistic is duplicated +// XFAIL:* + #ifndef SPIR_CHECK //expected-no-diagnostics #endif // SPIR_CHECK diff --git a/clang/test/SemaSYCL/sycl-restrict.cpp b/clang/test/SemaSYCL/sycl-restrict.cpp index 7b5ded7449f85..9189b19f3c7c7 100644 --- a/clang/test/SemaSYCL/sycl-restrict.cpp +++ b/clang/test/SemaSYCL/sycl-restrict.cpp @@ -26,7 +26,7 @@ class Fraction { int den() const { return d; } }; bool operator==(const Fraction &lhs, const Fraction &rhs) { - new int; // expected-error {{SYCL kernel cannot allocate storage}} + new int; // expected-error 2{{SYCL kernel cannot allocate storage}} return lhs.num() == rhs.num() && lhs.den() == rhs.den(); } } // namespace Check_User_Operators @@ -36,8 +36,7 @@ void no_restriction(int p) { int index[p + 2]; } void restriction(int p) { - // expected-error@+1 {{variable length arrays are not supported for the current target}} - int index[p + 2]; + int index[p + 2]; // expected-error {{variable length arrays are not supported for the current target}} } } // namespace Check_VLA_Restriction @@ -54,8 +53,7 @@ struct B : public A { struct OverloadedNewDelete { // This overload allocates storage, give diagnostic. void *operator new(std::size_t size) throw() { - // expected-error@+1 {{SYCL kernel cannot allocate storage}} - float *pt = new float; + float *pt = new float; // expected-error 2{{SYCL kernel cannot allocate storage}} return 0; } // This overload does not allocate: no diagnostic. 
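// Illustration only, with hypothetical names (not part of the SYCL test): the
// distinction drawn above is between an operator new overload that performs a
// real allocation and a placement-style overload that only reuses storage the
// caller already owns and therefore allocates nothing.
#include <cstddef>
struct Arena { unsigned char storage[64]; };
struct Gadget {
  void *operator new(std::size_t n) { return ::operator new(n); } // allocates: diagnosed in SYCL device code
  void *operator new(std::size_t, Arena &a) { return a.storage; } // placement-style: no allocation, no diagnostic
};
// usage: 'new Gadget' selects the first overload, 'new (myArena) Gadget' the second.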
@@ -66,26 +64,22 @@ struct OverloadedNewDelete { bool isa_B(A *a) { Check_User_Operators::Fraction f1(3, 8), f2(1, 2), f3(10, 2); - if (f1 == f2) + if (f1 == f2) // expected-note 2{{called by 'isa_B'}} return false; Check_VLA_Restriction::restriction(7); - // expected-error@+1 {{SYCL kernel cannot allocate storage}} - int *ip = new int; + int *ip = new int; // expected-error 2{{SYCL kernel cannot allocate storage}} int i; int *p3 = new (&i) int; // no error on placement new - // expected-note@+1 {{called by 'isa_B'}} - OverloadedNewDelete *x = new (struct OverloadedNewDelete); + OverloadedNewDelete *x = new (struct OverloadedNewDelete); // expected-note 2{{called by 'isa_B'}} auto y = new struct OverloadedNewDelete[5]; - // expected-error@+1 {{SYCL kernel cannot use rtti}} - (void)typeid(int); - // expected-error@+1 {{SYCL kernel cannot use rtti}} - return dynamic_cast(a) != 0; + (void)typeid(int); // expected-error {{SYCL kernel cannot use rtti}} + return dynamic_cast(a) != 0; // expected-error {{SYCL kernel cannot use rtti}} } template __attribute__((sycl_kernel)) void kernel1(L l) { - l(); + l(); // expected-note 6{{called by 'kernel1([]() { + // expected-error@+1 {{SYCL kernel cannot use a non-const global variable}} + b.f(); // expected-error {{SYCL kernel cannot call a virtual function}} + + Check_RTTI_Restriction::kernel1([]() { // expected-note 3{{called by 'usage'}} Check_RTTI_Restriction::A *a; - Check_RTTI_Restriction::isa_B(a); }); + Check_RTTI_Restriction::isa_B(a); }); // expected-note 6{{called by 'operator()'}} - // expected-error@+1 {{__float128 is not supported on this target}} - __float128 A; + __float128 A; // expected-error {{__float128 is not supported on this target}} - // expected-error@+1 {{zero-length arrays are not permitted in C++}} - int BadArray[0]; + int BadArray[0]; // expected-error {{zero-length arrays are not permitted in C++}} } namespace ns { @@ -173,45 +161,30 @@ int use2(a_type ab, a_type *abp) { return 2; if (ab.const_stat_member) return 1; - // expected-error@+1 {{SYCL kernel cannot use a non-const static data variable}} - if (ab.stat_member) + if (ab.stat_member) // expected-error {{SYCL kernel cannot use a non-const static data variable}} return 0; - // expected-error@+1 {{SYCL kernel cannot use a non-const static data variable}} - if (abp->stat_member) + if (abp->stat_member) // expected-error {{SYCL kernel cannot use a non-const static data variable}} return 0; - // expected-note@+1 {{called by 'use2'}} - if (ab.fm()) + if (ab.fm()) // expected-note {{called by 'use2'}} return 0; - // expected-error@+1 {{SYCL kernel cannot use a non-const global variable}} - return another_global; - // expected-error@+1 {{SYCL kernel cannot use a non-const global variable}} - return ns::glob + - // expected-error@+1 {{SYCL kernel cannot use a non-const global variable}} - AnotherNS::moar_globals; - // expected-note@+1 {{called by 'use2'}} - eh_not_ok(); - Check_RTTI_Restriction::A *a; - // expected-note@+1 2{{called by 'use2'}} - Check_RTTI_Restriction::isa_B(a); - // expected-note@+1 {{called by 'use2'}} - usage(&addInt); - Check_User_Operators::Fraction f1(3, 8), f2(1, 2), f3(10, 2); - // expected-note@+1 {{called by 'use2'}} - if (f1 == f2) - return false; + + return another_global; // expected-error {{SYCL kernel cannot use a non-const global variable}} + + return ns::glob + // expected-error {{SYCL kernel cannot use a non-const global variable}} + AnotherNS::moar_globals; // expected-error {{SYCL kernel cannot use a non-const global variable}} } template 
__attribute__((sycl_kernel)) void kernel_single_task(Func kernelFunc) { - kernelFunc(); - a_type ab; - a_type *p; - // expected-note@+1 7{{called by 'kernel_single_task}} - use2(ab, p); + kernelFunc(); // expected-note 7{{called by 'kernel_single_task([]() { usage(&addInt); }); + kernel_single_task([=]() { + usage(&addInt); // expected-note 5{{called by 'operator()'}} + a_type *p; + use2(ab, p); // expected-note 2{{called by 'operator()'}} + }); return 0; } diff --git a/clang/test/SemaTemplate/dependent-typos-recovery.cpp b/clang/test/SemaTemplate/dependent-typos-recovery.cpp new file mode 100644 index 0000000000000..d05b7144d908b --- /dev/null +++ b/clang/test/SemaTemplate/dependent-typos-recovery.cpp @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +// There should be no extra errors about missing 'template' keywords. +struct B { + template + int f(){}; +} builder; // expected-note 2{{'builder' declared here}} + +auto a = bilder.f(); // expected-error{{undeclared identifier 'bilder'; did you mean}} +auto b = (*(&bilder+0)).f(); // expected-error{{undeclared identifier 'bilder'; did you mean}} diff --git a/clang/test/SemaTemplate/instantiate-init.cpp b/clang/test/SemaTemplate/instantiate-init.cpp index 99b29c77d55af..69f4d5fc4cc46 100644 --- a/clang/test/SemaTemplate/instantiate-init.cpp +++ b/clang/test/SemaTemplate/instantiate-init.cpp @@ -100,7 +100,7 @@ namespace PR7985 { integral_c<1> ic1 = array_lengthof(Description::data); (void)sizeof(array_lengthof(Description::data)); - sizeof(array_lengthof( // expected-error{{no matching function for call to 'array_lengthof'}} + (void)sizeof(array_lengthof( // expected-error{{no matching function for call to 'array_lengthof'}} Description::data // expected-note{{in instantiation of static data member 'PR7985::Description::data' requested here}} )); diff --git a/clang/test/SemaTemplate/recovery-tree-transform.cpp b/clang/test/SemaTemplate/recovery-tree-transform.cpp new file mode 100644 index 0000000000000..bf882db3ec2cc --- /dev/null +++ b/clang/test/SemaTemplate/recovery-tree-transform.cpp @@ -0,0 +1,4 @@ +// RUN: %clang_cc1 -verify -frecovery-ast %s + +template int *p = &void(T::error); // expected-error{{cannot take the address of an rvalue}} expected-error{{type 'int' cannot be used prior to '::'}} +int *q = p; // expected-note{{in instantiation of variable template specialization 'p' requested here}} diff --git a/clang/test/SemaTemplate/subst-into-subst.cpp b/clang/test/SemaTemplate/subst-into-subst.cpp new file mode 100644 index 0000000000000..69c4a837864dc --- /dev/null +++ b/clang/test/SemaTemplate/subst-into-subst.cpp @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -std=c++2a -verify %s + +// When forming and checking satisfaction of atomic constraints, we will +// substitute still-dependent template arguments into an expression, and later +// substitute into the result. This creates some unique situations; check that +// they work. 
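// Illustration only: a small hypothetical C++20 sketch, separate from the test
// below, of one such situation. The two concepts here name the same
// specialization with differently written template arguments, so they
// normalize to equivalent atomic constraints and neither overload of pick()
// ends up more constrained than the other.
template <int N> struct Box {};
template <typename T> concept NonEmpty = sizeof(T) != 0;
template <typename T> concept ViaTwo = NonEmpty<Box<2>>;
template <typename T> concept ViaOnePlus = NonEmpty<Box<1 + 1>>;
int pick(ViaTwo auto);     // #1
int pick(ViaOnePlus auto); // #2
// int k = pick(0);  // would be rejected as ambiguous: #1 and #2 are equally constrained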
+ +namespace SubstIntoResolvedTypeTemplateArg { + template struct X {}; + + template concept A = true; + template concept B = sizeof(T) != 0; + template concept C = B>; + + int f(A auto); // expected-note {{candidate}} + int f(C auto); // expected-note {{candidate}} + int k1 = f(0); // expected-error {{ambiguous}} + + template concept D = A && B>; + int f(D auto); + int k2 = f(0); // ok + + // The atomic constraint formed from B> is identical to the + // one formed from C, even though the template arguments are written as + // different expressions; the "equivalent" rules are used rather than the + // "identical" rules when matching template arguments in concept-ids. + template concept E = A && B>; + int g(C auto); + int g(E auto); // expected-note {{candidate}} + int k3 = g(0); + + int g(D auto); // expected-note {{candidate}} + int k4 = g(0); // expected-error {{ambiguous}} +} diff --git a/clang/test/lit.cfg.py b/clang/test/lit.cfg.py index bdf3b1179225d..18b5a2991f7ee 100644 --- a/clang/test/lit.cfg.py +++ b/clang/test/lit.cfg.py @@ -25,7 +25,7 @@ config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell) # suffixes: A list of file extensions to treat as test files. -config.suffixes = ['.c', '.cpp', '.cppm', '.m', '.mm', '.cu', +config.suffixes = ['.c', '.cpp', '.i', '.cppm', '.m', '.mm', '.cu', '.ll', '.cl', '.s', '.S', '.modulemap', '.test', '.rs', '.ifs'] # excludes: A list of directories to exclude from the testsuite. The 'Inputs' diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt index cc5712726169b..8d4a6a597a208 100644 --- a/clang/tools/driver/CMakeLists.txt +++ b/clang/tools/driver/CMakeLists.txt @@ -58,7 +58,7 @@ endif() # Support plugins. if(CLANG_PLUGIN_SUPPORT) - export_executable_symbols(clang) + export_executable_symbols_for_plugins(clang) endif() add_dependencies(clang clang-resource-headers) diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index df5530c73c6d6..ae7fd4271799f 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -2048,6 +2048,7 @@ class EnqueueVisitor : public ConstStmtVisitor { void VisitOMPCancelDirective(const OMPCancelDirective *D); void VisitOMPFlushDirective(const OMPFlushDirective *D); void VisitOMPDepobjDirective(const OMPDepobjDirective *D); + void VisitOMPScanDirective(const OMPScanDirective *D); void VisitOMPOrderedDirective(const OMPOrderedDirective *D); void VisitOMPAtomicDirective(const OMPAtomicDirective *D); void VisitOMPTargetDirective(const OMPTargetDirective *D); @@ -2308,6 +2309,12 @@ void OMPClauseEnqueue::VisitOMPClauseList(T *Node) { } } +void OMPClauseEnqueue::VisitOMPInclusiveClause(const OMPInclusiveClause *C) { + VisitOMPClauseList(C); +} +void OMPClauseEnqueue::VisitOMPExclusiveClause(const OMPExclusiveClause *C) { + VisitOMPClauseList(C); +} void OMPClauseEnqueue::VisitOMPAllocateClause(const OMPAllocateClause *C) { VisitOMPClauseList(C); Visitor->AddStmt(C->getAllocator()); @@ -2885,6 +2892,10 @@ void EnqueueVisitor::VisitOMPDepobjDirective(const OMPDepobjDirective *D) { VisitOMPExecutableDirective(D); } +void EnqueueVisitor::VisitOMPScanDirective(const OMPScanDirective *D) { + VisitOMPExecutableDirective(D); +} + void EnqueueVisitor::VisitOMPOrderedDirective(const OMPOrderedDirective *D) { VisitOMPExecutableDirective(D); } @@ -5519,6 +5530,8 @@ CXString clang_getCursorKindSpelling(enum CXCursorKind Kind) { return cxstring::createRef("OMPFlushDirective"); case CXCursor_OMPDepobjDirective: return 
cxstring::createRef("OMPDepobjDirective"); + case CXCursor_OMPScanDirective: + return cxstring::createRef("OMPScanDirective"); case CXCursor_OMPOrderedDirective: return cxstring::createRef("OMPOrderedDirective"); case CXCursor_OMPAtomicDirective: diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp index e10c742c65eae..147e3eaf4762c 100644 --- a/clang/tools/libclang/CXCursor.cpp +++ b/clang/tools/libclang/CXCursor.cpp @@ -292,6 +292,7 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent, case Stmt::ObjCDictionaryLiteralClass: case Stmt::ObjCBoxedExprClass: case Stmt::ObjCSubscriptRefExprClass: + case Stmt::RecoveryExprClass: K = CXCursor_UnexposedExpr; break; @@ -638,6 +639,9 @@ CXCursor cxcursor::MakeCXCursor(const Stmt *S, const Decl *Parent, case Stmt::OMPDepobjDirectiveClass: K = CXCursor_OMPDepobjDirective; break; + case Stmt::OMPScanDirectiveClass: + K = CXCursor_OMPScanDirective; + break; case Stmt::OMPOrderedDirectiveClass: K = CXCursor_OMPOrderedDirective; break; diff --git a/clang/unittests/Format/FormatTestCSharp.cpp b/clang/unittests/Format/FormatTestCSharp.cpp index 03ebe337e76c8..17b8e070c36a5 100644 --- a/clang/unittests/Format/FormatTestCSharp.cpp +++ b/clang/unittests/Format/FormatTestCSharp.cpp @@ -562,6 +562,17 @@ var myDict = new Dictionary { Style); } +TEST_F(FormatTestCSharp, CSharpArrayInitializers) { + FormatStyle Style = getGoogleStyle(FormatStyle::LK_CSharp); + + verifyFormat(R"(// +private MySet[] setPoints = { + new Point(), + new Point(), +};)", + Style); +} + TEST_F(FormatTestCSharp, CSharpNamedArguments) { FormatStyle Style = getGoogleStyle(FormatStyle::LK_CSharp); @@ -628,11 +639,13 @@ TEST_F(FormatTestCSharp, CSharpSpaces) { verifyFormat(R"(catch (TestException) when (innerFinallyExecuted))", Style); verifyFormat(R"(private float[,] Values;)", Style); verifyFormat(R"(Result this[Index x] => Foo(x);)", Style); - verifyFormat(R"(class ItemFactory where T : new() {})", Style); + + verifyFormat(R"(char[,,] rawCharArray = MakeCharacterGrid();)", Style); Style.SpacesInSquareBrackets = true; verifyFormat(R"(private float[ , ] Values;)", Style); verifyFormat(R"(string dirPath = args?[ 0 ];)", Style); + verifyFormat(R"(char[ ,, ] rawCharArray = MakeCharacterGrid();)", Style); } TEST_F(FormatTestCSharp, CSharpNullableTypes) { @@ -673,5 +686,30 @@ if (someThings[i][j][k].Contains(myThing)) { Style); } +TEST_F(FormatTestCSharp, CSharpGenericTypeConstraints) { + FormatStyle Style = getGoogleStyle(FormatStyle::LK_CSharp); + + verifyFormat(R"(// +class ItemFactory + where T : new() {})", Style); + + verifyFormat(R"(// +class Dictionary + where TKey : IComparable + where TVal : IMyInterface { + public void MyMethod(T t) + where T : IMyInterface { doThing(); } +})", + Style); + + verifyFormat(R"(// +class ItemFactory + where T : new(), + IAnInterface, + IAnotherInterface, + IAnotherInterfaceStill {})", + Style); +} + } // namespace format } // end namespace clang diff --git a/clang/unittests/Sema/CodeCompleteTest.cpp b/clang/unittests/Sema/CodeCompleteTest.cpp index a9441a679cac7..5c8940cf6ebae 100644 --- a/clang/unittests/Sema/CodeCompleteTest.cpp +++ b/clang/unittests/Sema/CodeCompleteTest.cpp @@ -487,6 +487,7 @@ TEST(PreferredTypeTest, NoCrashOnInvalidTypes) { auto x = decltype(&1)(^); auto y = new decltype(&1)(^); )cpp"; - EXPECT_THAT(collectPreferredTypes(Code), Each("NULL TYPE")); + EXPECT_THAT(collectPreferredTypes(Code), + Each("decltype((1))")); } } // namespace diff --git 
a/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp b/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp index d0cf291eb2b8e..8027d3338b699 100644 --- a/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp +++ b/clang/unittests/StaticAnalyzer/RegisterCustomCheckersTest.cpp @@ -81,6 +81,66 @@ TEST(RegisterCustomCheckers, CheckLocationIncDec) { runCheckerOnCode("void f() { int *p; (*p)++; }")); } +//===----------------------------------------------------------------------===// +// Unsatisfied checker dependency +//===----------------------------------------------------------------------===// + +class PrerequisiteChecker : public Checker { +public: + void checkASTCodeBody(const Decl *D, AnalysisManager &Mgr, + BugReporter &BR) const { + BR.EmitBasicReport(D, this, "Prerequisite", categories::LogicError, + "This is the prerequisite checker", + PathDiagnosticLocation(D, Mgr.getSourceManager()), {}); + } +}; + +void registerPrerequisiteChecker(CheckerManager &mgr) { + mgr.registerChecker(); +} + +bool shouldRegisterPrerequisiteChecker(const LangOptions &LO) { + return false; +} + +class DependentChecker : public Checker { +public: + void checkASTCodeBody(const Decl *D, AnalysisManager &Mgr, + BugReporter &BR) const { + BR.EmitBasicReport(D, this, "Dependent", categories::LogicError, + "This is the Dependent Checker", + PathDiagnosticLocation(D, Mgr.getSourceManager()), {}); + } +}; + +void registerDependentChecker(CheckerManager &mgr) { + mgr.registerChecker(); +} + +bool shouldRegisterDependentChecker(const LangOptions &LO) { + return true; +} + +void addDependentChecker(AnalysisASTConsumer &AnalysisConsumer, + AnalyzerOptions &AnOpts) { + AnOpts.CheckersAndPackages = {{"custom.Dependent", true}}; + AnalysisConsumer.AddCheckerRegistrationFn([](CheckerRegistry &Registry) { + Registry.addChecker(registerPrerequisiteChecker, + shouldRegisterPrerequisiteChecker, + "custom.Prerequisite", "Description", "", false); + Registry.addChecker(registerDependentChecker, + shouldRegisterDependentChecker, + "custom.Dependent", "Description", "", false); + Registry.addDependency("custom.Dependent", "custom.Prerequisite"); + }); +} + +TEST(RegisterDependentCheckers, RegisterChecker) { + std::string Diags; + EXPECT_TRUE(runCheckerOnCode("void f() {;}", Diags)); + EXPECT_EQ(Diags, ""); +} + } // namespace } // namespace ento } // namespace clang diff --git a/clang/unittests/Tooling/Syntax/TreeTest.cpp b/clang/unittests/Tooling/Syntax/TreeTest.cpp index 6e914b6378c86..ddd9092d8a76d 100644 --- a/clang/unittests/Tooling/Syntax/TreeTest.cpp +++ b/clang/unittests/Tooling/Syntax/TreeTest.cpp @@ -47,7 +47,9 @@ static llvm::ArrayRef tokens(syntax::Node *N) { class SyntaxTreeTest : public ::testing::Test { protected: // Build a syntax tree for the code. - syntax::TranslationUnit *buildTree(llvm::StringRef Code) { + syntax::TranslationUnit * + buildTree(llvm::StringRef Code, + const std::string &Target = "x86_64-pc-linux-gnu") { // FIXME: this code is almost the identical to the one in TokensTest. Share // it. class BuildSyntaxTree : public ASTConsumer { @@ -98,8 +100,10 @@ class SyntaxTreeTest : public ::testing::Test { if (!Diags->getClient()) Diags->setClient(new IgnoringDiagConsumer); // Prepare to run a compiler. 
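For context on the RegisterCustomCheckersTest addition above: because shouldRegisterPrerequisiteChecker returns false, custom.Prerequisite is never registered, and CheckerRegistry::addDependency then makes custom.Dependent drop out silently rather than run with an unsatisfied dependency, which is why the test expects an empty diagnostic string. The same pattern applies to out-of-tree checkers; below is a minimal sketch of a plugin registry hook under stated assumptions (the example.Base/example.Derived names and the register*/shouldRegister* hooks are hypothetical, and the header path may differ between LLVM versions):

#include "clang/StaticAnalyzer/Frontend/CheckerRegistry.h"

using namespace clang;
using namespace ento;

// Hypothetical registration hooks, written the same way as the ones in the
// test above (registerPrerequisiteChecker and friends).
void registerBaseChecker(CheckerManager &Mgr);
bool shouldRegisterBaseChecker(const LangOptions &LO);
void registerDerivedChecker(CheckerManager &Mgr);
bool shouldRegisterDerivedChecker(const LangOptions &LO);

extern "C" void clang_registerCheckers(CheckerRegistry &Registry) {
  Registry.addChecker(registerBaseChecker, shouldRegisterBaseChecker,
                      "example.Base", "Description", /*DocsUri=*/"",
                      /*IsHidden=*/false);
  Registry.addChecker(registerDerivedChecker, shouldRegisterDerivedChecker,
                      "example.Derived", "Description", /*DocsUri=*/"",
                      /*IsHidden=*/false);
  // If example.Base ends up unregistered, example.Derived is disabled too
  // instead of running with an unsatisfied dependency.
  Registry.addDependency("example.Derived", "example.Base");
}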
- std::vector Args = {"syntax-test", "-std=c++11", - "-fsyntax-only", FileName}; + std::vector Args = { + "syntax-test", "-target", Target.c_str(), + FileName, "-fsyntax-only", "-std=c++17", + }; Invocation = createInvocationFromCommandLine(Args, Diags, FS); assert(Invocation); Invocation->getFrontendOpts().DisableFree = false; @@ -120,6 +124,32 @@ class SyntaxTreeTest : public ::testing::Test { return Root; } + void expectTreeDumpEqual(StringRef Code, StringRef Tree, + bool RunWithDelayedTemplateParsing = true) { + SCOPED_TRACE(Code); + + std::string Expected = Tree.trim().str(); + + // We want to run the test with -fdelayed-template-parsing enabled and + // disabled, therefore we use these representative targets that differ in + // the default value. + // We are not passing -fdelayed-template-parsing directly but we are using + // the `-target` to improve coverage and discover differences in behavior + // early. + for (const std::string Target : + {"x86_64-pc-linux-gnu", "x86_64-pc-win32-msvc"}) { + if (!RunWithDelayedTemplateParsing && + Target == "x86_64-pc-win32-msvc") { + continue; + } + auto *Root = buildTree(Code, Target); + std::string Actual = std::string(StringRef(Root->dump(*Arena)).trim()); + EXPECT_EQ(Expected, Actual) + << "for target " << Target << " the resulting dump is:\n" + << Actual; + } + } + // Adds a file to the test VFS. void addFile(llvm::StringRef Path, llvm::StringRef Contents) { if (!FS->addFile(Path, time_t(), @@ -163,14 +193,13 @@ class SyntaxTreeTest : public ::testing::Test { std::unique_ptr Arena; }; -TEST_F(SyntaxTreeTest, Basic) { - std::pair Cases[] = { - { - R"cpp( +TEST_F(SyntaxTreeTest, Simple) { + expectTreeDumpEqual( + R"cpp( int main() {} void foo() {} )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -192,16 +221,18 @@ void foo() {} `-CompoundStatement |-{ `-} -)txt"}, - // if. - { - R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, If) { + expectTreeDumpEqual( + R"cpp( int main() { if (true) {} if (true) {} else if (false) {} } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-int @@ -241,14 +272,17 @@ int main() { | |-{ | `-} `-} - )txt"}, - // for. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, For) { + expectTreeDumpEqual( + R"cpp( void test() { for (;;) {} } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -269,10 +303,18 @@ void test() { | |-{ | `-} `-} - )txt"}, - // declaration statement. 
- {"void test() { int a = 10; }", - R"txt( + )txt"); +} + +TEST_F(SyntaxTreeTest, RangeBasedFor) { + expectTreeDumpEqual( + R"cpp( +void test() { + int a[3]; + for (int x : a) ; +} + )cpp", + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -288,13 +330,32 @@ void test() { | | |-int | | `-SimpleDeclarator | | |-a - | | |-= - | | `-UnknownExpression - | | `-10 + | | `-ArraySubscript + | | |-[ + | | |-UnknownExpression + | | | `-3 + | | `-] | `-; + |-RangeBasedForStatement + | |-for + | |-( + | |-SimpleDeclaration + | | |-int + | | |-SimpleDeclarator + | | | `-x + | | `-: + | |-UnknownExpression + | | `-a + | |-) + | `-EmptyStatement + | `-; `-} -)txt"}, - {"void test() { ; }", R"txt( + )txt"); +} + +TEST_F(SyntaxTreeTest, DeclarationStatement) { + expectTreeDumpEqual("void test() { int a = 10; }", + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -305,12 +366,22 @@ void test() { | `-) `-CompoundStatement |-{ - |-EmptyStatement + |-DeclarationStatement + | |-SimpleDeclaration + | | |-int + | | `-SimpleDeclarator + | | |-a + | | |-= + | | `-UnknownExpression + | | `-10 | `-; `-} -)txt"}, - // switch, case and default. - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, Switch) { + expectTreeDumpEqual( + R"cpp( void test() { switch (true) { case 0: @@ -318,7 +389,7 @@ void test() { } } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -349,14 +420,17 @@ void test() { | | `-; | `-} `-} -)txt"}, - // while. - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, While) { + expectTreeDumpEqual( + R"cpp( void test() { while (true) { continue; break; } } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -383,77 +457,15 @@ void test() { | | `-; | `-} `-} -)txt"}, - // return. - {R"cpp( -int test() { return 1; } - )cpp", - R"txt( -*: TranslationUnit -`-SimpleDeclaration - |-int - |-SimpleDeclarator - | |-test - | `-ParametersAndQualifiers - | |-( - | `-) - `-CompoundStatement - |-{ - |-ReturnStatement - | |-return - | |-UnknownExpression - | | `-1 - | `-; - `-} -)txt"}, - // Range-based for. - {R"cpp( -void test() { - int a[3]; - for (int x : a) ; +)txt"); } - )cpp", - R"txt( -*: TranslationUnit -`-SimpleDeclaration - |-void - |-SimpleDeclarator - | |-test - | `-ParametersAndQualifiers - | |-( - | `-) - `-CompoundStatement - |-{ - |-DeclarationStatement - | |-SimpleDeclaration - | | |-int - | | `-SimpleDeclarator - | | |-a - | | `-ArraySubscript - | | |-[ - | | |-UnknownExpression - | | | `-3 - | | `-] - | `-; - |-RangeBasedForStatement - | |-for - | |-( - | |-SimpleDeclaration - | | |-int - | | |-SimpleDeclarator - | | | `-x - | | `-: - | |-UnknownExpression - | | `-a - | |-) - | `-EmptyStatement - | `-; - `-} - )txt"}, - // Unhandled statements should end up as 'unknown statement'. - // This example uses a 'label statement', which does not yet have a syntax - // counterpart. - {"void main() { foo: return 100; }", R"txt( + +TEST_F(SyntaxTreeTest, UnhandledStatement) { + // Unhandled statements should end up as 'unknown statement'. + // This example uses a 'label statement', which does not yet have a syntax + // counterpart. + expectTreeDumpEqual("void main() { foo: return 100; }", + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -473,16 +485,20 @@ void test() { | | `-100 | `-; `-} -)txt"}, - // expressions should be wrapped in 'ExpressionStatement' when they appear - // in a statement position. 
- {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, Expressions) { + // expressions should be wrapped in 'ExpressionStatement' when they appear + // in a statement position. + expectTreeDumpEqual( + R"cpp( void test() { test(); if (true) test(); else test(); } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -519,12 +535,15 @@ void test() { | | `-) | `-; `-} -)txt"}, - // Multiple declarators group into a single SimpleDeclaration. - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, MultipleDeclaratorsGrouping) { + expectTreeDumpEqual( + R"cpp( int *a, b; )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-int @@ -535,11 +554,12 @@ void test() { |-SimpleDeclarator | `-b `-; - )txt"}, - {R"cpp( + )txt"); + expectTreeDumpEqual( + R"cpp( typedef int *a, b; )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-typedef @@ -551,15 +571,18 @@ void test() { |-SimpleDeclarator | `-b `-; - )txt"}, - // Multiple declarators inside a statement. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, MultipleDeclaratorsInsideStatement) { + expectTreeDumpEqual( + R"cpp( void foo() { int *a, b; typedef int *ta, tb; } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -592,15 +615,19 @@ void foo() { | | `-tb | `-; `-} - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, Namespaces) { + expectTreeDumpEqual( + R"cpp( namespace a { namespace b {} } namespace a::b {} namespace {} namespace foo = a; )cpp", - R"txt( + R"txt( *: TranslationUnit |-NamespaceDefinition | |-namespace @@ -629,9 +656,62 @@ namespace foo = a; |-= |-a `-; -)txt"}, - // Free-standing classes, must live inside a SimpleDeclaration. - {R"cpp( +)txt"); +} + +TEST_F(SyntaxTreeTest, UsingDirective) { + expectTreeDumpEqual( + R"cpp( +namespace ns {} +using namespace ::ns; + )cpp", + R"txt( +*: TranslationUnit +|-NamespaceDefinition +| |-namespace +| |-ns +| |-{ +| `-} +`-UsingNamespaceDirective + |-using + |-namespace + |-:: + |-ns + `-; + )txt"); +} + +TEST_F(SyntaxTreeTest, UsingDeclaration) { + expectTreeDumpEqual( + R"cpp( +namespace ns { int a; } +using ns::a; + )cpp", + R"txt( +*: TranslationUnit +|-NamespaceDefinition +| |-namespace +| |-ns +| |-{ +| |-SimpleDeclaration +| | |-int +| | |-SimpleDeclarator +| | | `-a +| | `-; +| `-} +`-UsingDeclaration + |-using + |-ns + |-:: + |-a + `-; + )txt"); +} + +TEST_F(SyntaxTreeTest, FreeStandingClasses) { + // Free-standing classes, must live inside a SimpleDeclaration. 
+ expectTreeDumpEqual( + R"cpp( sturct X; struct X {}; @@ -640,7 +720,7 @@ struct Y {} *y2; struct {} *a1; )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-sturct @@ -676,57 +756,168 @@ struct {} *a1; | |-* | `-a1 `-; -)txt"}, - {R"cpp( -namespace ns {} -using namespace ::ns; +)txt"); +} + +TEST_F(SyntaxTreeTest, Templates) { + expectTreeDumpEqual( + R"cpp( +template struct cls {}; +template int var = 10; +template int fun() {} )cpp", - R"txt( + R"txt( *: TranslationUnit -|-NamespaceDefinition -| |-namespace -| |-ns -| |-{ -| `-} -`-UsingNamespaceDirective - |-using - |-namespace - |-:: - |-ns - `-; - )txt"}, - {R"cpp( -namespace ns { int a; } -using ns::a; +|-TemplateDeclaration +| |-template +| |-< +| |-UnknownDeclaration +| | |-class +| | `-T +| |-> +| `-SimpleDeclaration +| |-struct +| |-cls +| |-{ +| |-} +| `-; +|-TemplateDeclaration +| |-template +| |-< +| |-UnknownDeclaration +| | |-class +| | `-T +| |-> +| `-SimpleDeclaration +| |-int +| |-SimpleDeclarator +| | |-var +| | |-= +| | `-UnknownExpression +| | `-10 +| `-; +`-TemplateDeclaration + |-template + |-< + |-UnknownDeclaration + | |-class + | `-T + |-> + `-SimpleDeclaration + |-int + |-SimpleDeclarator + | |-fun + | `-ParametersAndQualifiers + | |-( + | `-) + `-CompoundStatement + |-{ + `-} +)txt", + // FIXME: Make this test work on windows by generating the expected Syntax + // tree when -fdelayed-template-parsing is active. + /*RunWithDelayedTemplateParsing=*/false); +} + +TEST_F(SyntaxTreeTest, NestedTemplates) { + expectTreeDumpEqual( + R"cpp( +template +struct X { + template + U foo(); +}; )cpp", - R"txt( + R"txt( *: TranslationUnit -|-NamespaceDefinition -| |-namespace -| |-ns -| |-{ -| |-SimpleDeclaration -| | |-int -| | |-SimpleDeclarator -| | | `-a -| | `-; -| `-} -`-UsingDeclaration - |-using - |-ns - |-:: - |-a - `-; - )txt"}, - {R"cpp( +`-TemplateDeclaration + |-template + |-< + |-UnknownDeclaration + | |-class + | `-T + |-> + `-SimpleDeclaration + |-struct + |-X + |-{ + |-TemplateDeclaration + | |-template + | |-< + | |-UnknownDeclaration + | | |-class + | | `-U + | |-> + | `-SimpleDeclaration + | |-U + | |-SimpleDeclarator + | | |-foo + | | `-ParametersAndQualifiers + | | |-( + | | `-) + | `-; + |-} + `-; +)txt"); +} + +TEST_F(SyntaxTreeTest, Templates2) { + expectTreeDumpEqual( + R"cpp( +template struct X { struct Y; }; +template struct X::Y {}; + )cpp", + R"txt( +*: TranslationUnit +|-TemplateDeclaration +| |-template +| |-< +| |-UnknownDeclaration +| | |-class +| | `-T +| |-> +| `-SimpleDeclaration +| |-struct +| |-X +| |-{ +| |-SimpleDeclaration +| | |-struct +| | |-Y +| | `-; +| |-} +| `-; +`-TemplateDeclaration + |-template + |-< + |-UnknownDeclaration + | |-class + | `-T + |-> + `-SimpleDeclaration + |-struct + |-X + |-< + |-T + |-> + |-:: + |-Y + |-{ + |-} + `-; + )txt"); +} + +TEST_F(SyntaxTreeTest, TemplatesUsingUsing) { + expectTreeDumpEqual( + R"cpp( template struct X { using T::foo; using typename T::bar; }; )cpp", - R"txt( + R"txt( *: TranslationUnit -`-UnknownDeclaration +`-TemplateDeclaration |-template |-< |-UnknownDeclaration @@ -752,11 +943,92 @@ template struct X { | `-; |-} `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ExplicitTemplateInstantations) { + expectTreeDumpEqual( + R"cpp( +template struct X {}; +template struct X {}; +template <> struct X {}; + +template struct X; +extern template struct X; +)cpp", + R"txt( +*: TranslationUnit +|-TemplateDeclaration +| |-template +| |-< +| |-UnknownDeclaration +| | |-class +| | `-T +| |-> +| `-SimpleDeclaration +| 
|-struct +| |-X +| |-{ +| |-} +| `-; +|-TemplateDeclaration +| |-template +| |-< +| |-UnknownDeclaration +| | |-class +| | `-T +| |-> +| `-SimpleDeclaration +| |-struct +| |-X +| |-< +| |-T +| |-* +| |-> +| |-{ +| |-} +| `-; +|-TemplateDeclaration +| |-template +| |-< +| |-> +| `-SimpleDeclaration +| |-struct +| |-X +| |-< +| |-int +| |-> +| |-{ +| |-} +| `-; +|-ExplicitTemplateInstantiation +| |-template +| `-SimpleDeclaration +| |-struct +| |-X +| |-< +| |-double +| |-> +| `-; +`-ExplicitTemplateInstantiation + |-extern + |-template + `-SimpleDeclaration + |-struct + |-X + |-< + |-float + |-> + `-; +)txt"); +} + +TEST_F(SyntaxTreeTest, UsingType) { + expectTreeDumpEqual( + R"cpp( using type = int; )cpp", - R"txt( + R"txt( *: TranslationUnit `-TypeAliasDeclaration |-using @@ -764,20 +1036,28 @@ using type = int; |-= |-int `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, EmptyDeclaration) { + expectTreeDumpEqual( + R"cpp( ; )cpp", - R"txt( + R"txt( *: TranslationUnit `-EmptyDeclaration `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, StaticAssert) { + expectTreeDumpEqual( + R"cpp( static_assert(true, "message"); static_assert(true); )cpp", - R"txt( + R"txt( *: TranslationUnit |-StaticAssertDeclaration | |-static_assert @@ -796,12 +1076,16 @@ static_assert(true); | `-true |-) `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ExternC) { + expectTreeDumpEqual( + R"cpp( extern "C" int a; extern "C" { int b; int c; } )cpp", - R"txt( + R"txt( *: TranslationUnit |-LinkageSpecificationDeclaration | |-extern @@ -826,15 +1110,19 @@ extern "C" { int b; int c; } | | `-c | `-; `-} - )txt"}, - // Some nodes are non-modifiable, they are marked with 'I:'. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, NonModifiableNodes) { + // Some nodes are non-modifiable, they are marked with 'I:'. + expectTreeDumpEqual( + R"cpp( #define HALF_IF if (1+ #define HALF_IF_2 1) {} void test() { HALF_IF HALF_IF_2 else {} })cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -861,9 +1149,10 @@ void test() { | |-{ | `-} `-} - )txt"}, - // All nodes can be mutated. - {R"cpp( + )txt"); + // All nodes can be mutated. + expectTreeDumpEqual( + R"cpp( #define OPEN { #define CLOSE } @@ -877,7 +1166,7 @@ void test() { } } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -903,15 +1192,18 @@ void test() { | | `-; | `-} `-} - )txt"}, - // Array subscripts in declarators. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ArraySubscriptsInDeclarators) { + expectTreeDumpEqual( + R"cpp( int a[10]; int b[1][2][3]; int c[] = {1,2,3}; void f(int xs[static 10]); )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -978,9 +1270,12 @@ void f(int xs[static 10]); | | `-] | `-) `-; - )txt"}, - // Parameter lists in declarators. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ParameterListsInDeclarators) { + expectTreeDumpEqual( + R"cpp( int a() const; int b() volatile; int c() &; @@ -995,7 +1290,7 @@ int foo( int&& f ); )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -1094,14 +1389,17 @@ int foo( | | `-f | `-) `-; - )txt"}, - // Trailing const qualifier. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, TrailingConst) { + expectTreeDumpEqual( + R"cpp( struct X { int foo() const; } )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-struct @@ -1117,12 +1415,15 @@ struct X { | | `-const | `-; `-} - )txt"}, - // Trailing return type in parameter lists. 
- {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, TrailingReturn) { + expectTreeDumpEqual( + R"cpp( auto foo() -> int; )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-auto @@ -1135,14 +1436,17 @@ auto foo() -> int; | |--> | `-int `-; - )txt"}, - // Exception specification in parameter lists. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ExceptionSpecification) { + expectTreeDumpEqual( + R"cpp( int a() noexcept; int b() noexcept(true); int c() throw(); )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -1177,15 +1481,18 @@ int c() throw(); | |-( | `-) `-; - )txt"}, - // Declarators in parentheses. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, DeclaratorsInParentheses) { + expectTreeDumpEqual( + R"cpp( int (a); int *(b); int (*c)(int); int *(d)(int); )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-int @@ -1232,15 +1539,18 @@ int *(d)(int); | | `-int | `-) `-; - )txt"}, - // CV qualifiers. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ConstVolatileQualifiers) { + expectTreeDumpEqual( + R"cpp( const int west = -1; int const east = 1; const int const universal = 0; const int const *const *volatile b; )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-const @@ -1282,12 +1592,15 @@ const int const *const *volatile b; | |-volatile | `-b `-; - )txt"}, - // Ranges of declarators with trailing return types. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, RangesOfDeclaratorsWithTrailingReturnTypes) { + expectTreeDumpEqual( + R"cpp( auto foo() -> auto(*)(int) -> double*; )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-auto @@ -1315,14 +1628,17 @@ auto foo() -> auto(*)(int) -> double*; | `-SimpleDeclarator | `-* `-; - )txt"}, - // Member pointers. - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, MemberPointers) { + expectTreeDumpEqual( + R"cpp( struct X {}; int X::* a; const int X::* b; )cpp", - R"txt( + R"txt( *: TranslationUnit |-SimpleDeclaration | |-struct @@ -1349,12 +1665,15 @@ const int X::* b; | | `-* | `-b `-; - )txt"}, - // All-in-one tests. 
- {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ComplexDeclarator) { + expectTreeDumpEqual( + R"cpp( void x(char a, short (*b)(int)); )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -1382,11 +1701,15 @@ void x(char a, short (*b)(int)); | | `-) | `-) `-; - )txt"}, - {R"cpp( + )txt"); +} + +TEST_F(SyntaxTreeTest, ComplexDeclarator2) { + expectTreeDumpEqual( + R"cpp( void x(char a, short (*b)(int), long (**c)(long long)); )cpp", - R"txt( + R"txt( *: TranslationUnit `-SimpleDeclaration |-void @@ -1430,18 +1753,7 @@ void x(char a, short (*b)(int), long (**c)(long long)); | | `-) | `-) `-; - )txt"}, - }; - - for (const auto &T : Cases) { - SCOPED_TRACE(T.first); - - auto *Root = buildTree(T.first); - std::string Expected = llvm::StringRef(T.second).trim().str(); - std::string Actual = - std::string(llvm::StringRef(Root->dump(*Arena)).trim()); - EXPECT_EQ(Expected, Actual) << "the resulting dump is:\n" << Actual; - } + )txt"); } TEST_F(SyntaxTreeTest, Mutations) { diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index 37594e7b679c9..da29a1c744284 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -107,7 +107,7 @@ static std::string ReadPCHRecord(StringRef type) { .Case("IdentifierInfo *", "Record.readIdentifier()") .Case("StringRef", "Record.readString()") .Case("ParamIdx", "ParamIdx::deserialize(Record.readInt())") - .Case("OMPTraitInfo", "Record.readOMPTraitInfo()") + .Case("OMPTraitInfo *", "Record.readOMPTraitInfo()") .Default("Record.readInt()"); } @@ -131,7 +131,7 @@ static std::string WritePCHRecord(StringRef type, StringRef name) { .Case("StringRef", "AddString(" + std::string(name) + ");\n") .Case("ParamIdx", "push_back(" + std::string(name) + ".serialize());\n") - .Case("OMPTraitInfo", + .Case("OMPTraitInfo *", "writeOMPTraitInfo(" + std::string(name) + ");\n") .Default("push_back(" + std::string(name) + ");\n"); } @@ -363,7 +363,7 @@ namespace { OS << " if (SA->get" << getUpperName() << "().isValid())\n "; OS << " OS << \" \" << SA->get" << getUpperName() << "().getSourceIndex();\n"; - } else if (type == "OMPTraitInfo") { + } else if (type == "OMPTraitInfo *") { OS << " OS << \" \" << SA->get" << getUpperName() << "();\n"; } else { llvm_unreachable("Unknown SimpleArgument type!"); @@ -1334,7 +1334,7 @@ createArgument(const Record &Arg, StringRef Attr, else if (ArgName == "VersionArgument") Ptr = std::make_unique(Arg, Attr); else if (ArgName == "OMPTraitInfoArgument") - Ptr = std::make_unique(Arg, Attr, "OMPTraitInfo"); + Ptr = std::make_unique(Arg, Attr, "OMPTraitInfo *"); if (!Ptr) { // Search in reverse order so that the most-derived type is handled first. @@ -3520,7 +3520,7 @@ static void GenerateAppertainsTo(const Record &Attr, raw_ostream &OS) { // of the declaration). OS << "virtual bool diagAppertainsToDecl(Sema &S, "; OS << "const ParsedAttr &Attr, const Decl *D) const {\n"; - OS << " if (!D || ("; + OS << " if ("; for (auto I = Subjects.begin(), E = Subjects.end(); I != E; ++I) { // If the subject has custom code associated with it, use the generated // function for it. The function cannot be inlined into this check (yet) @@ -3536,7 +3536,7 @@ static void GenerateAppertainsTo(const Record &Attr, raw_ostream &OS) { if (I + 1 != E) OS << " && "; } - OS << ")) {\n"; + OS << ") {\n"; OS << " S.Diag(Attr.getLoc(), diag::"; OS << (Warn ? 
"warn_attribute_wrong_decl_type_str" : "err_attribute_wrong_decl_type_str"); @@ -3671,6 +3671,20 @@ static void GenerateSpellingIndexToSemanticSpelling(const Record &Attr, OS << "}\n\n"; } +static void GenerateHandleDeclAttribute(const Record &Attr, raw_ostream &OS) { + // Only generate if Attr can be handled simply. + if (!Attr.getValueAsBit("SimpleHandler")) + return; + + // Generate a function which just converts from ParsedAttr to the Attr type. + OS << "virtual AttrHandling handleDeclAttribute(Sema &S, Decl *D,"; + OS << "const ParsedAttr &Attr) const {\n"; + OS << " D->addAttr(::new (S.Context) " << Attr.getName(); + OS << "Attr(S.Context, Attr));\n"; + OS << " return AttributeApplied;\n"; + OS << "}\n\n"; +} + static bool IsKnownToGCC(const Record &Attr) { // Look at the spellings for this subject; if there are any spellings which // claim to be known to GCC, the attribute is known to GCC. @@ -3752,6 +3766,7 @@ void EmitClangAttrParsedAttrImpl(RecordKeeper &Records, raw_ostream &OS) { GenerateTargetRequirements(Attr, Dupes, OS); GenerateSpellingIndexToSemanticSpelling(Attr, OS); PragmaAttributeSupport.generateStrictConformsTo(*I->second, OS); + GenerateHandleDeclAttribute(Attr, OS); OS << "static const ParsedAttrInfo" << I->first << " Instance;\n"; OS << "};\n"; OS << "const ParsedAttrInfo" << I->first << " ParsedAttrInfo" << I->first diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp index f75f5000f0f6c..f53c6036766ab 100644 --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -1962,18 +1962,48 @@ void MveEmitter::EmitBuiltinSema(raw_ostream &OS) { } } +// ----------------------------------------------------------------------------- +// Class that describes an ACLE intrinsic implemented as a macro. +// +// This class is used when the intrinsic is polymorphic in 2 or 3 types, but we +// want to avoid a combinatorial explosion by reinterpreting the arguments to +// fixed types. + +class FunctionMacro { + std::vector Params; + StringRef Definition; + +public: + FunctionMacro(const Record &R); + + const std::vector &getParams() const { return Params; } + StringRef getDefinition() const { return Definition; } +}; + +FunctionMacro::FunctionMacro(const Record &R) { + Params = R.getValueAsListOfStrings("params"); + Definition = R.getValueAsString("definition"); +} + // ----------------------------------------------------------------------------- // The class used for generating arm_cde.h and related Clang bits // class CdeEmitter : public EmitterBase { + std::map FunctionMacros; + public: - CdeEmitter(RecordKeeper &Records) : EmitterBase(Records){}; + CdeEmitter(RecordKeeper &Records); void EmitHeader(raw_ostream &OS) override; void EmitBuiltinDef(raw_ostream &OS) override; void EmitBuiltinSema(raw_ostream &OS) override; }; +CdeEmitter::CdeEmitter(RecordKeeper &Records) : EmitterBase(Records) { + for (Record *R : Records.getAllDerivedDefinitions("FunctionMacro")) + FunctionMacros.emplace(R->getName(), FunctionMacro(*R)); +} + void CdeEmitter::EmitHeader(raw_ostream &OS) { // Accumulate pieces of the header file that will be enabled under various // different combinations of #ifdef. The index into parts[] is one of the @@ -1995,6 +2025,9 @@ void CdeEmitter::EmitHeader(raw_ostream &OS) { const ScalarType *ST = kv.second.get(); if (ST->hasNonstandardName()) continue; + // We don't have float64x2_t + if (ST->kind() == ScalarTypeKind::Float && ST->sizeInBits() == 64) + continue; raw_ostream &OS = parts[ST->requiresFloat() ? 
MVEFloat : MVE]; const VectorType *VT = getVectorType(ST); @@ -2048,6 +2081,16 @@ void CdeEmitter::EmitHeader(raw_ostream &OS) { } } + for (const auto &kv : FunctionMacros) { + StringRef Name = kv.first; + const FunctionMacro &FM = kv.second; + + raw_ostream &OS = parts[MVE]; + OS << "#define " + << "__arm_" << Name << "(" << join(FM.getParams(), ", ") << ") " + << FM.getDefinition() << "\n"; + } + for (auto &part : parts) part << "\n"; diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 59ea15493f037..9166e7a718ec5 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -27,8 +27,9 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/None.h" -#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" @@ -238,6 +239,11 @@ class Type { NumVectors = 1; } + void make32BitElement() { + assert_with_loc(Bitwidth > 32, "Not enough bits to make it 32!"); + ElementBitwidth = 32; + } + void doubleLanes() { assert_with_loc(Bitwidth != 128, "Can't get bigger than 128!"); Bitwidth = 128; @@ -518,7 +524,8 @@ class Intrinsic { std::pair emitDagDupTyped(DagInit *DI); std::pair emitDagShuffle(DagInit *DI); std::pair emitDagCast(DagInit *DI, bool IsBitCast); - std::pair emitDagCall(DagInit *DI); + std::pair emitDagCall(DagInit *DI, + bool MatchMangledName); std::pair emitDagNameReplace(DagInit *DI); std::pair emitDagLiteral(DagInit *DI); std::pair emitDagOp(DagInit *DI); @@ -546,7 +553,8 @@ class NeonEmitter { public: /// Called by Intrinsic - this attempts to get an intrinsic that takes /// the given types as arguments. - Intrinsic &getIntrinsic(StringRef Name, ArrayRef Types); + Intrinsic &getIntrinsic(StringRef Name, ArrayRef Types, + Optional MangledName); /// Called by Intrinsic - returns a globally-unique number. unsigned getUniqueNumber() { return UniqueNumber++; } @@ -1383,8 +1391,8 @@ std::pair Intrinsic::DagEmitter::emitDag(DagInit *DI) { return emitDagSaveTemp(DI); if (Op == "op") return emitDagOp(DI); - if (Op == "call") - return emitDagCall(DI); + if (Op == "call" || Op == "call_mangled") + return emitDagCall(DI, Op == "call_mangled"); if (Op == "name_replace") return emitDagNameReplace(DI); if (Op == "literal") @@ -1411,7 +1419,8 @@ std::pair Intrinsic::DagEmitter::emitDagOp(DagInit *DI) { } } -std::pair Intrinsic::DagEmitter::emitDagCall(DagInit *DI) { +std::pair +Intrinsic::DagEmitter::emitDagCall(DagInit *DI, bool MatchMangledName) { std::vector Types; std::vector Values; for (unsigned I = 0; I < DI->getNumArgs() - 1; ++I) { @@ -1427,7 +1436,13 @@ std::pair Intrinsic::DagEmitter::emitDagCall(DagInit *DI) { N = SI->getAsUnquotedString(); else N = emitDagArg(DI->getArg(0), "").second; - Intrinsic &Callee = Intr.Emitter.getIntrinsic(N, Types); + Optional MangledName; + if (MatchMangledName) { + if (Intr.getRecord()->getValueAsBit("isLaneQ")) + N += "q"; + MangledName = Intr.mangleName(N, ClassS); + } + Intrinsic &Callee = Intr.Emitter.getIntrinsic(N, Types, MangledName); // Make sure the callee is known as an early def. 
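The FunctionMacro support added to CdeEmitter above turns every FunctionMacro record into a function-like macro in the generated arm_cde.h, emitted into the MVE part of the header as "#define __arm_<name>(<params>) <definition>". As a sketch, a hypothetical record named vfooq with params ["a", "b"], whose definition string dispatches to an invented fixed-type helper, would produce roughly:

// Hypothetical line generated into arm_cde.h by CdeEmitter::EmitHeader:
#define __arm_vfooq(a, b) __arm_vfooq_u8((a), (b))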
Callee.setNeededEarly(); @@ -1486,6 +1501,8 @@ std::pair Intrinsic::DagEmitter::emitDagCast(DagInit *DI, castToType.doubleLanes(); } else if (SI->getAsUnquotedString() == "8") { castToType.makeInteger(8, true); + } else if (SI->getAsUnquotedString() == "32") { + castToType.make32BitElement(); } else { castToType = Type::fromTypedefName(SI->getAsUnquotedString()); assert_with_loc(!castToType.isVoid(), "Unknown typedef"); @@ -1832,7 +1849,8 @@ void Intrinsic::indexBody() { // NeonEmitter implementation //===----------------------------------------------------------------------===// -Intrinsic &NeonEmitter::getIntrinsic(StringRef Name, ArrayRef Types) { +Intrinsic &NeonEmitter::getIntrinsic(StringRef Name, ArrayRef Types, + Optional MangledName) { // First, look up the name in the intrinsic map. assert_with_loc(IntrinsicMap.find(Name.str()) != IntrinsicMap.end(), ("Intrinsic '" + Name + "' not found!").str()); @@ -1861,17 +1879,19 @@ Intrinsic &NeonEmitter::getIntrinsic(StringRef Name, ArrayRef Types) { } ErrMsg += ")\n"; + if (MangledName && MangledName != I.getMangledName(true)) + continue; + if (I.getNumParams() != Types.size()) continue; - bool Good = true; - for (unsigned Arg = 0; Arg < Types.size(); ++Arg) { - if (I.getParamType(Arg) != Types[Arg]) { - Good = false; - break; - } - } - if (Good) + unsigned ArgNum = 0; + bool MatchingArgumentTypes = + std::all_of(Types.begin(), Types.end(), [&](const auto &Type) { + return Type == I.getParamType(ArgNum++); + }); + + if (MatchingArgumentTypes) GoodVec.push_back(&I); } diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 8b53e376cb0d7..e02e94dd98ae6 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -100,6 +100,10 @@ class SVEType { /// string for passing to the BUILTIN() macro in Builtins.def. std::string builtin_str() const; + /// Return the C/C++ string representation of a type for use in the + /// arm_sve.h header file. + std::string str() const; + private: /// Creates the type based on the typespec string in TS. void applyTypespec(); @@ -335,6 +339,45 @@ std::string SVEType::builtin_str() const { return "q" + utostr(getNumElements() * NumVectors) + S; } +std::string SVEType::str() const { + if (isPredicatePattern()) + return "sv_pattern"; + + if (isPrefetchOp()) + return "sv_prfop"; + + std::string S; + if (Void) + S += "void"; + else { + if (isScalableVector()) + S += "sv"; + if (!Signed && !Float) + S += "u"; + + if (Float) + S += "float"; + else if (isScalarPredicate()) + S += "bool"; + else + S += "int"; + + if (!isScalarPredicate()) + S += utostr(ElementBitwidth); + if (!isScalableVector() && isVector()) + S += "x" + utostr(getNumElements()); + if (NumVectors > 1) + S += "x" + utostr(NumVectors); + S += "_t"; + } + + if (Constant) + S += " const"; + if (Pointer) + S += " *"; + + return S; +} void SVEType::applyTypespec() { for (char I : TS) { switch (I) { @@ -515,8 +558,19 @@ void Intrinsic::emitIntrinsic(raw_ostream &OS) const { << "(...) __builtin_sve_" << mangleName(ClassS) << "(__VA_ARGS__)\n"; } else { - llvm_unreachable("Not yet implemented. 
Overloaded intrinsics will follow " - "in a future patch"); + std::string FullName = mangleName(ClassS); + std::string ProtoName = mangleName(ClassG); + + OS << "__aio __attribute__((__clang_arm_builtin_alias(" + << "__builtin_sve_" << FullName << ")))\n"; + + OS << getTypes()[0].str() << " " << ProtoName << "("; + for (unsigned I = 0; I < getTypes().size() - 1; ++I) { + if (I != 0) + OS << ", "; + OS << getTypes()[I + 1].str(); + } + OS << ");\n"; } } @@ -559,6 +613,11 @@ void SVEEmitter::createIntrinsic( Out.push_back(std::make_unique(Name, Proto, Merge, LLVMName, Flags, TS, ClassS, *this, Guard)); + + // Also generate the short-form (e.g. svadd_m) for the given type-spec. + if (Intrinsic::isOverloadedIntrinsic(Name)) + Out.push_back(std::make_unique( + Name, Proto, Merge, LLVMName, Flags, TS, ClassG, *this, Guard)); } } @@ -608,6 +667,10 @@ void SVEEmitter::createHeader(raw_ostream &OS) { OS << "typedef __SVFloat64_t svfloat64_t;\n"; OS << "typedef __SVBool_t svbool_t;\n\n"; + OS << "/* Function attributes */\n"; + OS << "#define __aio static inline __attribute__((__always_inline__, " + "__nodebug__, __overloadable__))\n\n"; + SmallVector, 128> Defs; std::vector RV = Records.getAllDerivedDefinitions("Inst"); for (auto *R : RV) diff --git a/compiler-rt/include/fuzzer/FuzzedDataProvider.h b/compiler-rt/include/fuzzer/FuzzedDataProvider.h index bdeff21b12942..21d5fac457d62 100644 --- a/compiler-rt/include/fuzzer/FuzzedDataProvider.h +++ b/compiler-rt/include/fuzzer/FuzzedDataProvider.h @@ -34,288 +34,351 @@ class FuzzedDataProvider { : data_ptr_(data), remaining_bytes_(size) {} ~FuzzedDataProvider() = default; - // Returns a std::vector containing |num_bytes| of input data. If fewer than - // |num_bytes| of data remain, returns a shorter std::vector containing all - // of the data that's left. Can be used with any byte sized type, such as - // char, unsigned char, uint8_t, etc. - template std::vector ConsumeBytes(size_t num_bytes) { - num_bytes = std::min(num_bytes, remaining_bytes_); - return ConsumeBytes(num_bytes, num_bytes); - } - - // Similar to |ConsumeBytes|, but also appends the terminator value at the end - // of the resulting vector. Useful, when a mutable null-terminated C-string is - // needed, for example. But that is a rare case. Better avoid it, if possible, - // and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods. + // See the implementation below (after the class definition) for more verbose + // comments for each of the methods. + + // Methods returning std::vector of bytes. These are the most popular choice + // when splitting fuzzing input into pieces, as every piece is put into a + // separate buffer (i.e. ASan would catch any under-/overflow) and the memory + // will be released automatically. + template std::vector ConsumeBytes(size_t num_bytes); template - std::vector ConsumeBytesWithTerminator(size_t num_bytes, - T terminator = 0) { - num_bytes = std::min(num_bytes, remaining_bytes_); - std::vector result = ConsumeBytes(num_bytes + 1, num_bytes); - result.back() = terminator; - return result; - } + std::vector ConsumeBytesWithTerminator(size_t num_bytes, T terminator = 0); + template std::vector ConsumeRemainingBytes(); - // Returns a std::string containing |num_bytes| of input data. Using this and - // |.c_str()| on the resulting string is the best way to get an immutable - // null-terminated C string. If fewer than |num_bytes| of data remain, returns - // a shorter std::string containing all of the data that's left. 
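For the overloaded (ClassG) SVE intrinsics that SveEmitter::emitIntrinsic now handles above, the emitted header entry is an __aio-annotated declaration whose __clang_arm_builtin_alias attribute names the fully mangled builtin, with the return type taken from getTypes()[0] and the parameter types from the rest. A sketch of one such overload, using the svadd_m short form mentioned in the patch with unsigned 8-bit element types (the concrete suffix and signature are illustrative; the real overload set comes from the type specs in the .td file):

// One generated overload of the short form (illustrative types):
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svadd_u8_m)))
svuint8_t svadd_m(svbool_t, svuint8_t, svuint8_t);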
- std::string ConsumeBytesAsString(size_t num_bytes) { - static_assert(sizeof(std::string::value_type) == sizeof(uint8_t), - "ConsumeBytesAsString cannot convert the data to a string."); - - num_bytes = std::min(num_bytes, remaining_bytes_); - std::string result( - reinterpret_cast(data_ptr_), - num_bytes); - Advance(num_bytes); - return result; - } + // Methods returning strings. Use only when you need a std::string or a null + // terminated C-string. Otherwise, prefer the methods returning std::vector. + std::string ConsumeBytesAsString(size_t num_bytes); + std::string ConsumeRandomLengthString(size_t max_length); + std::string ConsumeRandomLengthString(); + std::string ConsumeRemainingBytesAsString(); - // Returns a number in the range [min, max] by consuming bytes from the - // input data. The value might not be uniformly distributed in the given - // range. If there's no input data left, always returns |min|. |min| must - // be less than or equal to |max|. - template T ConsumeIntegralInRange(T min, T max) { - static_assert(std::is_integral::value, "An integral type is required."); - static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type."); + // Methods returning integer values. + template T ConsumeIntegral(); + template T ConsumeIntegralInRange(T min, T max); - if (min > max) - abort(); + // Methods returning floating point values. + template T ConsumeFloatingPoint(); + template T ConsumeFloatingPointInRange(T min, T max); - // Use the biggest type possible to hold the range and the result. - uint64_t range = static_cast(max) - min; - uint64_t result = 0; - size_t offset = 0; - - while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 && - remaining_bytes_ != 0) { - // Pull bytes off the end of the seed data. Experimentally, this seems to - // allow the fuzzer to more easily explore the input space. This makes - // sense, since it works by modifying inputs that caused new code to run, - // and this data is often used to encode length of data read by - // |ConsumeBytes|. Separating out read lengths makes it easier modify the - // contents of the data that is actually read. - --remaining_bytes_; - result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_]; - offset += CHAR_BIT; - } + // 0 <= return value <= 1. + template T ConsumeProbability(); - // Avoid division by 0, in case |range + 1| results in overflow. - if (range != std::numeric_limits::max()) - result = result % (range + 1); + bool ConsumeBool(); - return static_cast(min + result); - } + // Returns a value chosen from the given enum. + template T ConsumeEnum(); - // Returns a std::string of length from 0 to |max_length|. When it runs out of - // input data, returns what remains of the input. Designed to be more stable - // with respect to a fuzzer inserting characters than just picking a random - // length and then consuming that many bytes with |ConsumeBytes|. - std::string ConsumeRandomLengthString(size_t max_length) { - // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\" - // followed by anything else to the end of the string. As a result of this - // logic, a fuzzer can insert characters into the string, and the string - // will be lengthened to include those new characters, resulting in a more - // stable fuzzer than picking the length of a string independently from - // picking its contents. - std::string result; - - // Reserve the anticipated capaticity to prevent several reallocations. 
- result.reserve(std::min(max_length, remaining_bytes_)); - for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) { - char next = ConvertUnsignedToSigned(data_ptr_[0]); - Advance(1); - if (next == '\\' && remaining_bytes_ != 0) { - next = ConvertUnsignedToSigned(data_ptr_[0]); - Advance(1); - if (next != '\\') - break; - } - result += next; - } + // Returns a value from the given array. + template T PickValueInArray(const T (&array)[size]); + template T PickValueInArray(std::initializer_list list); - result.shrink_to_fit(); - return result; - } - - // Returns a std::vector containing all remaining bytes of the input data. - template std::vector ConsumeRemainingBytes() { - return ConsumeBytes(remaining_bytes_); - } + // Writes data to the given destination and returns number of bytes written. + size_t ConsumeData(void *destination, size_t num_bytes); - // Returns a std::string containing all remaining bytes of the input data. - // Prefer using |ConsumeRemainingBytes| unless you actually need a std::string - // object. - std::string ConsumeRemainingBytesAsString() { - return ConsumeBytesAsString(remaining_bytes_); - } + // Reports the remaining bytes available for fuzzed input. + size_t remaining_bytes() { return remaining_bytes_; } - // Returns a number in the range [Type's min, Type's max]. The value might - // not be uniformly distributed in the given range. If there's no input data - // left, always returns |min|. - template T ConsumeIntegral() { - return ConsumeIntegralInRange(std::numeric_limits::min(), - std::numeric_limits::max()); - } + private: + FuzzedDataProvider(const FuzzedDataProvider &) = delete; + FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete; - // Reads one byte and returns a bool, or false when no data remains. - bool ConsumeBool() { return 1 & ConsumeIntegral(); } + void CopyAndAdvance(void *destination, size_t num_bytes); - // Returns a copy of the value selected from the given fixed-size |array|. - template - T PickValueInArray(const T (&array)[size]) { - static_assert(size > 0, "The array must be non empty."); - return array[ConsumeIntegralInRange(0, size - 1)]; - } + void Advance(size_t num_bytes); template - T PickValueInArray(std::initializer_list list) { - // TODO(Dor1s): switch to static_assert once C++14 is allowed. - if (!list.size()) - abort(); - - return *(list.begin() + ConsumeIntegralInRange(0, list.size() - 1)); - } - - // Returns an enum value. The enum must start at 0 and be contiguous. It must - // also contain |kMaxValue| aliased to its largest (inclusive) value. Such as: - // enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue }; - template T ConsumeEnum() { - static_assert(std::is_enum::value, "|T| must be an enum type."); - return static_cast(ConsumeIntegralInRange( - 0, static_cast(T::kMaxValue))); - } - - // Returns a floating point number in the range [0.0, 1.0]. If there's no - // input data left, always returns 0. - template T ConsumeProbability() { - static_assert(std::is_floating_point::value, - "A floating point type is required."); - - // Use different integral types for different floating point types in order - // to provide better density of the resulting values. 
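As a short usage sketch for the byte-vector methods documented in the restructured FuzzedDataProvider header (it assumes a FuzzedDataProvider named provider; the variable names are illustrative):

// Every piece lands in its own heap allocation, so ASan can flag
// out-of-bounds accesses in the code under test.
std::vector<uint8_t> header = provider.ConsumeBytes<uint8_t>(16);
// Mutable, null-terminated buffer; rarely needed, since ConsumeBytesAsString
// plus .c_str() is usually the better way to get a C string.
std::vector<char> name = provider.ConsumeBytesWithTerminator<char>(8, '\0');
// Whatever is left over becomes the payload.
std::vector<uint8_t> payload = provider.ConsumeRemainingBytes<uint8_t>();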
- using IntegralType = - typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t, - uint64_t>::type; + std::vector ConsumeBytes(size_t size, size_t num_bytes); - T result = static_cast(ConsumeIntegral()); - result /= static_cast(std::numeric_limits::max()); - return result; - } - - // Returns a floating point value in the range [Type's lowest, Type's max] by - // consuming bytes from the input data. If there's no input data left, always - // returns approximately 0. - template T ConsumeFloatingPoint() { - return ConsumeFloatingPointInRange(std::numeric_limits::lowest(), - std::numeric_limits::max()); - } + template TS ConvertUnsignedToSigned(TU value); - // Returns a floating point value in the given range by consuming bytes from - // the input data. If there's no input data left, returns |min|. Note that - // |min| must be less than or equal to |max|. - template T ConsumeFloatingPointInRange(T min, T max) { - if (min > max) - abort(); + const uint8_t *data_ptr_; + size_t remaining_bytes_; +}; - T range = .0; - T result = min; - constexpr T zero(.0); - if (max > zero && min < zero && max > min + std::numeric_limits::max()) { - // The diff |max - min| would overflow the given floating point type. Use - // the half of the diff as the range and consume a bool to decide whether - // the result is in the first of the second part of the diff. - range = (max / 2.0) - (min / 2.0); - if (ConsumeBool()) { - result += range; - } - } else { - range = max - min; +// Returns a std::vector containing |num_bytes| of input data. If fewer than +// |num_bytes| of data remain, returns a shorter std::vector containing all +// of the data that's left. Can be used with any byte sized type, such as +// char, unsigned char, uint8_t, etc. +template +std::vector FuzzedDataProvider::ConsumeBytes(size_t num_bytes) { + num_bytes = std::min(num_bytes, remaining_bytes_); + return ConsumeBytes(num_bytes, num_bytes); +} + +// Similar to |ConsumeBytes|, but also appends the terminator value at the end +// of the resulting vector. Useful, when a mutable null-terminated C-string is +// needed, for example. But that is a rare case. Better avoid it, if possible, +// and prefer using |ConsumeBytes| or |ConsumeBytesAsString| methods. +template +std::vector FuzzedDataProvider::ConsumeBytesWithTerminator(size_t num_bytes, + T terminator) { + num_bytes = std::min(num_bytes, remaining_bytes_); + std::vector result = ConsumeBytes(num_bytes + 1, num_bytes); + result.back() = terminator; + return result; +} + +// Returns a std::vector containing all remaining bytes of the input data. +template +std::vector FuzzedDataProvider::ConsumeRemainingBytes() { + return ConsumeBytes(remaining_bytes_); +} + +// Returns a std::string containing |num_bytes| of input data. Using this and +// |.c_str()| on the resulting string is the best way to get an immutable +// null-terminated C string. If fewer than |num_bytes| of data remain, returns +// a shorter std::string containing all of the data that's left. +std::string FuzzedDataProvider::ConsumeBytesAsString(size_t num_bytes) { + static_assert(sizeof(std::string::value_type) == sizeof(uint8_t), + "ConsumeBytesAsString cannot convert the data to a string."); + + num_bytes = std::min(num_bytes, remaining_bytes_); + std::string result( + reinterpret_cast(data_ptr_), num_bytes); + Advance(num_bytes); + return result; +} + +// Returns a std::string of length from 0 to |max_length|. When it runs out of +// input data, returns what remains of the input. 
Designed to be more stable +// with respect to a fuzzer inserting characters than just picking a random +// length and then consuming that many bytes with |ConsumeBytes|. +std::string FuzzedDataProvider::ConsumeRandomLengthString(size_t max_length) { + // Reads bytes from the start of |data_ptr_|. Maps "\\" to "\", and maps "\" + // followed by anything else to the end of the string. As a result of this + // logic, a fuzzer can insert characters into the string, and the string + // will be lengthened to include those new characters, resulting in a more + // stable fuzzer than picking the length of a string independently from + // picking its contents. + std::string result; + + // Reserve the anticipated capaticity to prevent several reallocations. + result.reserve(std::min(max_length, remaining_bytes_)); + for (size_t i = 0; i < max_length && remaining_bytes_ != 0; ++i) { + char next = ConvertUnsignedToSigned(data_ptr_[0]); + Advance(1); + if (next == '\\' && remaining_bytes_ != 0) { + next = ConvertUnsignedToSigned(data_ptr_[0]); + Advance(1); + if (next != '\\') + break; } - - return result + range * ConsumeProbability(); + result += next; } - // Writes |num_bytes| of input data to the given destination pointer. If there - // is not enough data left, writes all remaining bytes. Return value is the - // number of bytes written. - // In general, it's better to avoid using this function, but it may be useful - // in cases when it's necessary to fill a certain buffer or object with - // fuzzing data. - size_t ConsumeData(void *destination, size_t num_bytes) { - num_bytes = std::min(num_bytes, remaining_bytes_); - CopyAndAdvance(destination, num_bytes); - return num_bytes; + result.shrink_to_fit(); + return result; +} + +// Returns a std::string of length from 0 to |remaining_bytes_|. +std::string FuzzedDataProvider::ConsumeRandomLengthString() { + return ConsumeRandomLengthString(remaining_bytes_); +} + +// Returns a std::string containing all remaining bytes of the input data. +// Prefer using |ConsumeRemainingBytes| unless you actually need a std::string +// object. +std::string FuzzedDataProvider::ConsumeRemainingBytesAsString() { + return ConsumeBytesAsString(remaining_bytes_); +} + +// Returns a number in the range [Type's min, Type's max]. The value might +// not be uniformly distributed in the given range. If there's no input data +// left, always returns |min|. +template T FuzzedDataProvider::ConsumeIntegral() { + return ConsumeIntegralInRange(std::numeric_limits::min(), + std::numeric_limits::max()); +} + +// Returns a number in the range [min, max] by consuming bytes from the +// input data. The value might not be uniformly distributed in the given +// range. If there's no input data left, always returns |min|. |min| must +// be less than or equal to |max|. +template +T FuzzedDataProvider::ConsumeIntegralInRange(T min, T max) { + static_assert(std::is_integral::value, "An integral type is required."); + static_assert(sizeof(T) <= sizeof(uint64_t), "Unsupported integral type."); + + if (min > max) + abort(); + + // Use the biggest type possible to hold the range and the result. + uint64_t range = static_cast(max) - min; + uint64_t result = 0; + size_t offset = 0; + + while (offset < sizeof(T) * CHAR_BIT && (range >> offset) > 0 && + remaining_bytes_ != 0) { + // Pull bytes off the end of the seed data. Experimentally, this seems to + // allow the fuzzer to more easily explore the input space. 
This makes + // sense, since it works by modifying inputs that caused new code to run, + // and this data is often used to encode length of data read by + // |ConsumeBytes|. Separating out read lengths makes it easier modify the + // contents of the data that is actually read. + --remaining_bytes_; + result = (result << CHAR_BIT) | data_ptr_[remaining_bytes_]; + offset += CHAR_BIT; } - // Reports the remaining bytes available for fuzzed input. - size_t remaining_bytes() { return remaining_bytes_; } - - private: - FuzzedDataProvider(const FuzzedDataProvider &) = delete; - FuzzedDataProvider &operator=(const FuzzedDataProvider &) = delete; - - void CopyAndAdvance(void *destination, size_t num_bytes) { - std::memcpy(destination, data_ptr_, num_bytes); - Advance(num_bytes); + // Avoid division by 0, in case |range + 1| results in overflow. + if (range != std::numeric_limits::max()) + result = result % (range + 1); + + return static_cast(min + result); +} + +// Returns a floating point value in the range [Type's lowest, Type's max] by +// consuming bytes from the input data. If there's no input data left, always +// returns approximately 0. +template T FuzzedDataProvider::ConsumeFloatingPoint() { + return ConsumeFloatingPointInRange(std::numeric_limits::lowest(), + std::numeric_limits::max()); +} + +// Returns a floating point value in the given range by consuming bytes from +// the input data. If there's no input data left, returns |min|. Note that +// |min| must be less than or equal to |max|. +template +T FuzzedDataProvider::ConsumeFloatingPointInRange(T min, T max) { + if (min > max) + abort(); + + T range = .0; + T result = min; + constexpr T zero(.0); + if (max > zero && min < zero && max > min + std::numeric_limits::max()) { + // The diff |max - min| would overflow the given floating point type. Use + // the half of the diff as the range and consume a bool to decide whether + // the result is in the first of the second part of the diff. + range = (max / 2.0) - (min / 2.0); + if (ConsumeBool()) { + result += range; + } + } else { + range = max - min; } - void Advance(size_t num_bytes) { - if (num_bytes > remaining_bytes_) + return result + range * ConsumeProbability(); +} + +// Returns a floating point number in the range [0.0, 1.0]. If there's no +// input data left, always returns 0. +template T FuzzedDataProvider::ConsumeProbability() { + static_assert(std::is_floating_point::value, + "A floating point type is required."); + + // Use different integral types for different floating point types in order + // to provide better density of the resulting values. + using IntegralType = + typename std::conditional<(sizeof(T) <= sizeof(uint32_t)), uint32_t, + uint64_t>::type; + + T result = static_cast(ConsumeIntegral()); + result /= static_cast(std::numeric_limits::max()); + return result; +} + +// Reads one byte and returns a bool, or false when no data remains. +bool FuzzedDataProvider::ConsumeBool() { + return 1 & ConsumeIntegral(); +} + +// Returns an enum value. The enum must start at 0 and be contiguous. It must +// also contain |kMaxValue| aliased to its largest (inclusive) value. Such as: +// enum class Foo { SomeValue, OtherValue, kMaxValue = OtherValue }; +template T FuzzedDataProvider::ConsumeEnum() { + static_assert(std::is_enum::value, "|T| must be an enum type."); + return static_cast( + ConsumeIntegralInRange(0, static_cast(T::kMaxValue))); +} + +// Returns a copy of the value selected from the given fixed-size |array|. 
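Two of the methods above have usage requirements that are easy to get wrong, so a brief sketch (the enum and array contents are illustrative): ConsumeEnum needs a contiguous enum that starts at 0 and aliases kMaxValue to its largest enumerator, and PickValueInArray returns a copy of one element of a fixed-size array.

enum class Color { kRed, kGreen, kBlue, kMaxValue = kBlue };

void EnumAndArrayExample(FuzzedDataProvider &provider) {
  // Valid because Color is contiguous, starts at 0, and defines kMaxValue.
  Color c = provider.ConsumeEnum<Color>();
  // Returns a copy of one of the listed values.
  static constexpr int kBufferSizes[] = {1, 2, 4, 8};
  int size = provider.PickValueInArray(kBufferSizes);
  (void)c;
  (void)size;
}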
+template +T FuzzedDataProvider::PickValueInArray(const T (&array)[size]) { + static_assert(size > 0, "The array must be non empty."); + return array[ConsumeIntegralInRange(0, size - 1)]; +} + +template +T FuzzedDataProvider::PickValueInArray(std::initializer_list list) { + // TODO(Dor1s): switch to static_assert once C++14 is allowed. + if (!list.size()) + abort(); + + return *(list.begin() + ConsumeIntegralInRange(0, list.size() - 1)); +} + +// Writes |num_bytes| of input data to the given destination pointer. If there +// is not enough data left, writes all remaining bytes. Return value is the +// number of bytes written. +// In general, it's better to avoid using this function, but it may be useful +// in cases when it's necessary to fill a certain buffer or object with +// fuzzing data. +size_t FuzzedDataProvider::ConsumeData(void *destination, size_t num_bytes) { + num_bytes = std::min(num_bytes, remaining_bytes_); + CopyAndAdvance(destination, num_bytes); + return num_bytes; +} + +// Private methods. +void FuzzedDataProvider::CopyAndAdvance(void *destination, size_t num_bytes) { + std::memcpy(destination, data_ptr_, num_bytes); + Advance(num_bytes); +} + +void FuzzedDataProvider::Advance(size_t num_bytes) { + if (num_bytes > remaining_bytes_) + abort(); + + data_ptr_ += num_bytes; + remaining_bytes_ -= num_bytes; +} + +template +std::vector FuzzedDataProvider::ConsumeBytes(size_t size, size_t num_bytes) { + static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type."); + + // The point of using the size-based constructor below is to increase the + // odds of having a vector object with capacity being equal to the length. + // That part is always implementation specific, but at least both libc++ and + // libstdc++ allocate the requested number of bytes in that constructor, + // which seems to be a natural choice for other implementations as well. + // To increase the odds even more, we also call |shrink_to_fit| below. + std::vector result(size); + if (size == 0) { + if (num_bytes != 0) abort(); - - data_ptr_ += num_bytes; - remaining_bytes_ -= num_bytes; - } - - template - std::vector ConsumeBytes(size_t size, size_t num_bytes) { - static_assert(sizeof(T) == sizeof(uint8_t), "Incompatible data type."); - - // The point of using the size-based constructor below is to increase the - // odds of having a vector object with capacity being equal to the length. - // That part is always implementation specific, but at least both libc++ and - // libstdc++ allocate the requested number of bytes in that constructor, - // which seems to be a natural choice for other implementations as well. - // To increase the odds even more, we also call |shrink_to_fit| below. - std::vector result(size); - if (size == 0) { - if (num_bytes != 0) - abort(); - return result; - } - - CopyAndAdvance(result.data(), num_bytes); - - // Even though |shrink_to_fit| is also implementation specific, we expect it - // to provide an additional assurance in case vector's constructor allocated - // a buffer which is larger than the actual amount of data we put inside it. - result.shrink_to_fit(); return result; } - template TS ConvertUnsignedToSigned(TU value) { - static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types."); - static_assert(!std::numeric_limits::is_signed, - "Source type must be unsigned."); - - // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream. 
- if (std::numeric_limits::is_modulo) - return static_cast(value); - - // Avoid using implementation-defined unsigned to signed conversions. - // To learn more, see https://stackoverflow.com/questions/13150449. - if (value <= std::numeric_limits::max()) { - return static_cast(value); - } else { - constexpr auto TS_min = std::numeric_limits::min(); - return TS_min + static_cast(value - TS_min); - } + CopyAndAdvance(result.data(), num_bytes); + + // Even though |shrink_to_fit| is also implementation specific, we expect it + // to provide an additional assurance in case vector's constructor allocated + // a buffer which is larger than the actual amount of data we put inside it. + result.shrink_to_fit(); + return result; +} + +template +TS FuzzedDataProvider::ConvertUnsignedToSigned(TU value) { + static_assert(sizeof(TS) == sizeof(TU), "Incompatible data types."); + static_assert(!std::numeric_limits::is_signed, + "Source type must be unsigned."); + + // TODO(Dor1s): change to `if constexpr` once C++17 becomes mainstream. + if (std::numeric_limits::is_modulo) + return static_cast(value); + + // Avoid using implementation-defined unsigned to signed conversions. + // To learn more, see https://stackoverflow.com/questions/13150449. + if (value <= std::numeric_limits::max()) { + return static_cast(value); + } else { + constexpr auto TS_min = std::numeric_limits::min(); + return TS_min + static_cast(value - TS_min); } - - const uint8_t *data_ptr_; - size_t remaining_bytes_; -}; +} #endif // LLVM_FUZZER_FUZZED_DATA_PROVIDER_H_ diff --git a/compiler-rt/lib/fuzzer/tests/FuzzedDataProviderUnittest.cpp b/compiler-rt/lib/fuzzer/tests/FuzzedDataProviderUnittest.cpp index 5eb46533d98a0..99d9d8ecbe9bb 100644 --- a/compiler-rt/lib/fuzzer/tests/FuzzedDataProviderUnittest.cpp +++ b/compiler-rt/lib/fuzzer/tests/FuzzedDataProviderUnittest.cpp @@ -190,14 +190,26 @@ TEST(FuzzedDataProvider, ConsumeRandomLengthString) { "\x1D\xBD\x4E\x17\x04\x1E\xBA\x26\xAC\x1F\xE3\x37\x1C\x15\x43" "\x60\x41\x2A\x7C\xCA\x70\xCE\xAB\x20\x24\xF8\xD9\x1F\x14\x7C"), DataProv.ConsumeRandomLengthString(31337)); - EXPECT_EQ(std::string(Data + 141, Data + 141 + 5), + size_t Offset = 141; + EXPECT_EQ(std::string(Data + Offset, Data + Offset + 5), DataProv.ConsumeRandomLengthString(5)); - EXPECT_EQ(std::string(Data + 141 + 5, Data + 141 + 5 + 2), + Offset += 5; + EXPECT_EQ(std::string(Data + Offset, Data + Offset + 2), DataProv.ConsumeRandomLengthString(2)); + Offset += 2; + + // Call the overloaded method without arguments (uses max length available). + EXPECT_EQ(std::string(Data + Offset, Data + Offset + 664), + DataProv.ConsumeRandomLengthString()); + Offset += 664 + 2; // +2 because of '\' character followed by any other byte. + + EXPECT_EQ(std::string(Data + Offset, Data + Offset + 92), + DataProv.ConsumeRandomLengthString()); + Offset += 92 + 2; // Exhaust the buffer. 
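// Illustrative only, not part of the patch: a worked example of the backslash
// escaping implemented in ConsumeRandomLengthString above and exercised by the
// surrounding unit test. "\\" collapses to a single '\', while '\' followed by
// any other byte terminates the string (consuming both bytes).
#include <cassert>
#include <cstdint>

#include <fuzzer/FuzzedDataProvider.h>

int main() {
  const uint8_t Input[] = {'a', 'b', '\\', '\\', 'c', '\\', 'x', 'y'};
  FuzzedDataProvider Provider(Input, sizeof(Input));
  // 'a', 'b', then "\\" -> '\', then 'c', then "\x" ends the string.
  assert(Provider.ConsumeRandomLengthString(100) == "ab\\c");
  assert(Provider.remaining_bytes() == 1);  // Only the trailing 'y' is left.
  return 0;
}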
auto String = DataProv.ConsumeBytesAsString(31337); - EXPECT_EQ(size_t(876), String.length()); + EXPECT_EQ(size_t(116), String.length()); EXPECT_EQ(std::string(), DataProv.ConsumeRandomLengthString(1)); } diff --git a/compiler-rt/lib/gwp_asan/common.cpp b/compiler-rt/lib/gwp_asan/common.cpp index 44935817f8e91..3438c4b91893b 100644 --- a/compiler-rt/lib/gwp_asan/common.cpp +++ b/compiler-rt/lib/gwp_asan/common.cpp @@ -59,6 +59,11 @@ void AllocationMetadata::CallSiteInfo::RecordBacktrace( uintptr_t UncompressedBuffer[kMaxTraceLengthToCollect]; size_t BacktraceLength = Backtrace(UncompressedBuffer, kMaxTraceLengthToCollect); + // Backtrace() returns the number of available frames, which may be greater + // than the number of frames in the buffer. In this case, we need to only pack + // the number of frames that are in the buffer. + if (BacktraceLength > kMaxTraceLengthToCollect) + BacktraceLength = kMaxTraceLengthToCollect; TraceSize = compression::pack(UncompressedBuffer, BacktraceLength, CompressedTrace, AllocationMetadata::kStackFrameStorageBytes); diff --git a/compiler-rt/lib/gwp_asan/tests/backtrace.cpp b/compiler-rt/lib/gwp_asan/tests/backtrace.cpp index bc81f35cb379f..6c9a9309ed8b4 100644 --- a/compiler-rt/lib/gwp_asan/tests/backtrace.cpp +++ b/compiler-rt/lib/gwp_asan/tests/backtrace.cpp @@ -8,6 +8,7 @@ #include +#include "gwp_asan/crash_handler.h" #include "gwp_asan/tests/harness.h" TEST_F(BacktraceGuardedPoolAllocator, DoubleFree) { @@ -15,13 +16,13 @@ TEST_F(BacktraceGuardedPoolAllocator, DoubleFree) { GPA.deallocate(Ptr); std::string DeathRegex = "Double Free.*"; - DeathRegex.append("backtrace\\.cpp:25.*"); + DeathRegex.append("backtrace\\.cpp:26.*"); DeathRegex.append("was deallocated.*"); - DeathRegex.append("backtrace\\.cpp:15.*"); + DeathRegex.append("backtrace\\.cpp:16.*"); DeathRegex.append("was allocated.*"); - DeathRegex.append("backtrace\\.cpp:14.*"); + DeathRegex.append("backtrace\\.cpp:15.*"); ASSERT_DEATH(GPA.deallocate(Ptr), DeathRegex); } @@ -30,12 +31,36 @@ TEST_F(BacktraceGuardedPoolAllocator, UseAfterFree) { GPA.deallocate(Ptr); std::string DeathRegex = "Use After Free.*"; - DeathRegex.append("backtrace\\.cpp:40.*"); + DeathRegex.append("backtrace\\.cpp:41.*"); DeathRegex.append("was deallocated.*"); - DeathRegex.append("backtrace\\.cpp:30.*"); + DeathRegex.append("backtrace\\.cpp:31.*"); DeathRegex.append("was allocated.*"); - DeathRegex.append("backtrace\\.cpp:29.*"); + DeathRegex.append("backtrace\\.cpp:30.*"); ASSERT_DEATH({ *Ptr = 7; }, DeathRegex); } + +TEST(Backtrace, Short) { + gwp_asan::AllocationMetadata Meta; + Meta.AllocationTrace.RecordBacktrace( + [](uintptr_t *TraceBuffer, size_t /* Size */) -> size_t { + TraceBuffer[0] = 123u; + TraceBuffer[1] = 321u; + return 2u; + }); + uintptr_t TraceOutput[2] = {}; + EXPECT_EQ(2u, __gwp_asan_get_allocation_trace(&Meta, TraceOutput, 2)); + EXPECT_EQ(TraceOutput[0], 123u); + EXPECT_EQ(TraceOutput[1], 321u); +} + +TEST(Backtrace, ExceedsStorableLength) { + gwp_asan::AllocationMetadata Meta; + Meta.AllocationTrace.RecordBacktrace( + [](uintptr_t * /* TraceBuffer */, size_t /* Size */) -> size_t { + return SIZE_MAX; // Wow, that's big! 
+ }); + uintptr_t TraceOutput; + EXPECT_EQ(1u, __gwp_asan_get_allocation_trace(&Meta, &TraceOutput, 1)); +} diff --git a/compiler-rt/lib/msan/msan_interceptors.cpp b/compiler-rt/lib/msan/msan_interceptors.cpp index 1c6956eca0f65..9b1340b0104e2 100644 --- a/compiler-rt/lib/msan/msan_interceptors.cpp +++ b/compiler-rt/lib/msan/msan_interceptors.cpp @@ -824,30 +824,6 @@ INTERCEPTOR(int, prlimit64, int pid, int resource, void *new_rlimit, #define MSAN_MAYBE_INTERCEPT_PRLIMIT64 #endif -#if SANITIZER_FREEBSD -// FreeBSD's define uname() as -// static __inline int uname(struct utsname *name) { -// return __xuname(SYS_NMLN, (void*)name); -// } -INTERCEPTOR(int, __xuname, int size, void *utsname) { - ENSURE_MSAN_INITED(); - int res = REAL(__xuname)(size, utsname); - if (!res) - __msan_unpoison(utsname, __sanitizer::struct_utsname_sz); - return res; -} -#define MSAN_INTERCEPT_UNAME INTERCEPT_FUNCTION(__xuname) -#else -INTERCEPTOR(int, uname, struct utsname *utsname) { - ENSURE_MSAN_INITED(); - int res = REAL(uname)(utsname); - if (!res) - __msan_unpoison(utsname, __sanitizer::struct_utsname_sz); - return res; -} -#define MSAN_INTERCEPT_UNAME INTERCEPT_FUNCTION(uname) -#endif - INTERCEPTOR(int, gethostname, char *name, SIZE_T len) { ENSURE_MSAN_INITED(); int res = REAL(gethostname)(name, len); @@ -953,7 +929,9 @@ void __sanitizer_dtor_callback(const void *data, uptr size) { template static void *mmap_interceptor(Mmap real_mmap, void *addr, SIZE_T length, int prot, int flags, int fd, OFF64_T offset) { - if (addr && !MEM_IS_APP(addr)) { + SIZE_T rounded_length = RoundUpTo(length, GetPageSize()); + void *end_addr = (char *)addr + (rounded_length - 1); + if (addr && (!MEM_IS_APP(addr) || !MEM_IS_APP(end_addr))) { if (flags & map_fixed) { errno = errno_EINVAL; return (void *)-1; @@ -962,7 +940,18 @@ static void *mmap_interceptor(Mmap real_mmap, void *addr, SIZE_T length, } } void *res = real_mmap(addr, length, prot, flags, fd, offset); - if (res != (void *)-1) __msan_unpoison(res, RoundUpTo(length, GetPageSize())); + if (res != (void *)-1) { + void *end_res = (char *)res + (rounded_length - 1); + if (MEM_IS_APP(res) && MEM_IS_APP(end_res)) { + __msan_unpoison(res, rounded_length); + } else { + // Application has attempted to map more memory than is supported by + // MSAN. Act as if we ran out of memory. 
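// Illustrative only, not part of the patch: why the mmap interceptor above must
// check both the first and the last byte of the requested mapping. The bounds
// below are made up; MSan's real MEM_IS_APP() is defined per platform.
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr uintptr_t kAppStart = 0x700000000000ULL;  // hypothetical app range
constexpr uintptr_t kAppEnd = 0x800000000000ULL;    // exclusive, hypothetical

static bool IsApp(uintptr_t P) { return P >= kAppStart && P < kAppEnd; }

int main() {
  uintptr_t Addr = kAppEnd - 0x1000;  // last page of the app range
  size_t Length = 0x10000;            // 16 pages: spills past the app range
  uintptr_t End = Addr + Length - 1;
  // Checking only Addr would wrongly accept this mapping; checking End as well
  // rejects it, mirroring the rounded_length/end_addr logic in the interceptor.
  std::printf("start in app: %d, end in app: %d\n", IsApp(Addr), IsApp(End));
  return 0;
}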
+ internal_munmap(res, length); + errno = errno_ENOMEM; + return (void *)-1; + } + } return res; } @@ -1692,7 +1681,6 @@ void InitializeInterceptors() { MSAN_MAYBE_INTERCEPT_GETRLIMIT64; MSAN_MAYBE_INTERCEPT_PRLIMIT; MSAN_MAYBE_INTERCEPT_PRLIMIT64; - MSAN_INTERCEPT_UNAME; INTERCEPT_FUNCTION(gethostname); MSAN_MAYBE_INTERCEPT_EPOLL_WAIT; MSAN_MAYBE_INTERCEPT_EPOLL_PWAIT; diff --git a/compiler-rt/lib/profile/CMakeLists.txt b/compiler-rt/lib/profile/CMakeLists.txt index 8cedbcf4f806c..ece674b2daa14 100644 --- a/compiler-rt/lib/profile/CMakeLists.txt +++ b/compiler-rt/lib/profile/CMakeLists.txt @@ -51,6 +51,7 @@ add_compiler_rt_component(profile) set(PROFILE_SOURCES GCDAProfiling.c InstrProfiling.c + InstrProfilingInternal.c InstrProfilingValue.c InstrProfilingBiasVar.c InstrProfilingBuffer.c diff --git a/compiler-rt/lib/profile/InstrProfiling.c b/compiler-rt/lib/profile/InstrProfiling.c index 087d1cdd2efe1..31a9fe9962931 100644 --- a/compiler-rt/lib/profile/InstrProfiling.c +++ b/compiler-rt/lib/profile/InstrProfiling.c @@ -25,18 +25,8 @@ COMPILER_RT_VISIBILITY uint64_t __llvm_profile_get_magic(void) { : (INSTR_PROF_RAW_MAGIC_32); } -static unsigned ProfileDumped = 0; - -COMPILER_RT_VISIBILITY unsigned lprofProfileDumped() { - return ProfileDumped; -} - -COMPILER_RT_VISIBILITY void lprofSetProfileDumped() { - ProfileDumped = 1; -} - COMPILER_RT_VISIBILITY void __llvm_profile_set_dumped() { - lprofSetProfileDumped(); + lprofSetProfileDumped(1); } /* Return the number of bytes needed to add to SizeInBytes to make it @@ -80,5 +70,5 @@ COMPILER_RT_VISIBILITY void __llvm_profile_reset_counters(void) { } } } - ProfileDumped = 0; + lprofSetProfileDumped(0); } diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index a51cf38285ba3..d7a7c32332c1b 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -218,6 +218,9 @@ int __llvm_profile_register_write_file_atexit(void); /*! \brief Initialize file handling. */ void __llvm_profile_initialize_file(void); +/*! \brief Initialize the profile runtime. */ +void __llvm_profile_initialize(void); + /*! * \brief Return path prefix (excluding the base filename) of the profile data. * This is useful for users using \c -fprofile-generate=./path_prefix who do diff --git a/compiler-rt/lib/profile/InstrProfilingBuffer.c b/compiler-rt/lib/profile/InstrProfilingBuffer.c index 489ba2929ccbc..5ee44785a7ab9 100644 --- a/compiler-rt/lib/profile/InstrProfilingBuffer.c +++ b/compiler-rt/lib/profile/InstrProfilingBuffer.c @@ -10,9 +10,6 @@ #include "InstrProfilingInternal.h" #include "InstrProfilingPort.h" -/* When counters are being relocated at runtime, this parameter is set to 1. */ -COMPILER_RT_VISIBILITY int RuntimeCounterRelocation = 0; - /* When continuous mode is enabled (%c), this parameter is set to 1. 
* * This parameter is defined here in InstrProfilingBuffer.o, instead of in @@ -66,7 +63,7 @@ void __llvm_profile_get_padding_sizes_for_counters( uint64_t *PaddingBytesBeforeCounters, uint64_t *PaddingBytesAfterCounters, uint64_t *PaddingBytesAfterNames) { if (!__llvm_profile_is_continuous_mode_enabled() || - RuntimeCounterRelocation) { + lprofRuntimeCounterRelocation()) { *PaddingBytesBeforeCounters = 0; *PaddingBytesAfterCounters = 0; *PaddingBytesAfterNames = __llvm_profile_get_num_padding_bytes(NamesSize); diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index e0a29119e692b..9e1a54a0c3737 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -477,7 +477,8 @@ static int writeMMappedFile(FILE *OutputFile, char **Profile) { } static void relocateCounters(void) { - if (!__llvm_profile_is_continuous_mode_enabled() || !RuntimeCounterRelocation) + if (!__llvm_profile_is_continuous_mode_enabled() || + !lprofRuntimeCounterRelocation()) return; /* Get the sizes of various profile data sections. Taken from @@ -808,7 +809,7 @@ static void parseAndSetFilename(const char *FilenamePat, truncateCurrentFile(); if (__llvm_profile_is_continuous_mode_enabled()) { - if (RuntimeCounterRelocation) + if (lprofRuntimeCounterRelocation()) relocateCounters(); else initializeProfileForContinuousMode(); @@ -951,10 +952,10 @@ const char *__llvm_profile_get_filename(void) { return FilenameBuf; } -/* This method is invoked by the runtime initialization hook - * InstrProfilingRuntime.o if it is linked in. Both user specified +/* This API initializes the file handling, both user specified * profile path via -fprofile-instr-generate= and LLVM_PROFILE_FILE - * environment variable can override this default value. */ + * environment variable can override this default value. + */ COMPILER_RT_VISIBILITY void __llvm_profile_initialize_file(void) { const char *EnvFilenamePat; @@ -963,7 +964,7 @@ void __llvm_profile_initialize_file(void) { int hasCommandLineOverrider = (INSTR_PROF_PROFILE_NAME_VAR[0] != 0); if (__llvm_profile_counter_bias != -1) - RuntimeCounterRelocation = 1; + lprofSetRuntimeCounterRelocation(1); EnvFilenamePat = getFilenamePatFromEnv(); if (EnvFilenamePat) { @@ -982,6 +983,16 @@ void __llvm_profile_initialize_file(void) { parseAndSetFilename(SelectedPat, PNS, 0); } +/* This method is invoked by the runtime initialization hook + * InstrProfilingRuntime.o if it is linked in. + */ +COMPILER_RT_VISIBILITY +void __llvm_profile_initialize(void) { + __llvm_profile_initialize_file(); + if (!__llvm_profile_is_continuous_mode_enabled()) + __llvm_profile_register_write_file_atexit(); +} + /* This API is directly called by the user application code. It has the * highest precedence compared with LLVM_PROFILE_FILE environment variable * and command line option -fprofile-instr-generate=. 
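// Illustrative only, not part of the patch: how an instrumented application
// typically drives the dump state that lprofProfileDumped()/lprofSetProfileDumped()
// now track. Build with -fprofile-instr-generate; these declarations normally
// come from the profile runtime and are repeated here only to keep the sketch
// self-contained.
extern "C" {
int __llvm_profile_write_file(void);
void __llvm_profile_reset_counters(void);
void __llvm_profile_set_dumped(void);
}

static void RunPhase(int Phase) { (void)Phase; /* hypothetical workload */ }

int main() {
  RunPhase(1);
  // Dump counters for phase 1, then clear them; after this patch the reset also
  // clears the "already dumped" flag via lprofSetProfileDumped(0).
  if (__llvm_profile_write_file() != 0)
    return 1;
  __llvm_profile_reset_counters();

  RunPhase(2);
  // Dump phase 2 manually and mark the profile as dumped so the runtime's
  // atexit writer does not write it again.
  if (__llvm_profile_write_file() != 0)
    return 1;
  __llvm_profile_set_dumped();
  return 0;
}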
@@ -1051,7 +1062,7 @@ int __llvm_profile_dump(void) { "in profile name or change profile name before dumping.\n", "online profile merging is not on"); int rc = __llvm_profile_write_file(); - lprofSetProfileDumped(); + lprofSetProfileDumped(1); return rc; } diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.c b/compiler-rt/lib/profile/InstrProfilingInternal.c new file mode 100644 index 0000000000000..d58bc19ad11e6 --- /dev/null +++ b/compiler-rt/lib/profile/InstrProfilingInternal.c @@ -0,0 +1,33 @@ +/*===- InstrProfilingInternal.c - Support library for PGO instrumentation -===*\ +|* +|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +|* See https://llvm.org/LICENSE.txt for license information. +|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +|* +\*===----------------------------------------------------------------------===*/ + +#if !defined(__Fuchsia__) + +#include "InstrProfilingInternal.h" + +static unsigned ProfileDumped = 0; + +COMPILER_RT_VISIBILITY unsigned lprofProfileDumped() { + return ProfileDumped; +} + +COMPILER_RT_VISIBILITY void lprofSetProfileDumped(unsigned Value) { + ProfileDumped = Value; +} + +static unsigned RuntimeCounterRelocation = 0; + +COMPILER_RT_VISIBILITY unsigned lprofRuntimeCounterRelocation(void) { + return RuntimeCounterRelocation; +} + +COMPILER_RT_VISIBILITY void lprofSetRuntimeCounterRelocation(unsigned Value) { + RuntimeCounterRelocation = Value; +} + +#endif diff --git a/compiler-rt/lib/profile/InstrProfilingInternal.h b/compiler-rt/lib/profile/InstrProfilingInternal.h index 6a8dffad50db9..904bd39459281 100644 --- a/compiler-rt/lib/profile/InstrProfilingInternal.h +++ b/compiler-rt/lib/profile/InstrProfilingInternal.h @@ -181,10 +181,13 @@ uint64_t lprofGetLoadModuleSignature(); * Return non zero value if the profile data has already been * dumped to the file. */ -unsigned lprofProfileDumped(); -void lprofSetProfileDumped(); +unsigned lprofProfileDumped(void); +void lprofSetProfileDumped(unsigned); + +/* Return non zero value if counters are being relocated at runtime. */ +unsigned lprofRuntimeCounterRelocation(void); +void lprofSetRuntimeCounterRelocation(unsigned); -COMPILER_RT_VISIBILITY extern int RuntimeCounterRelocation; COMPILER_RT_VISIBILITY extern void (*FreeHook)(void *); COMPILER_RT_VISIBILITY extern uint8_t *DynamicBufferIOBuffer; COMPILER_RT_VISIBILITY extern uint32_t VPBufferSize; diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformFuchsia.c b/compiler-rt/lib/profile/InstrProfilingPlatformFuchsia.c index bf78adbbca15d..828f742213936 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformFuchsia.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformFuchsia.c @@ -34,7 +34,17 @@ #include "InstrProfilingInternal.h" #include "InstrProfilingUtil.h" -/* VMO that contains the coverage data shared across all modules. */ +COMPILER_RT_VISIBILITY unsigned lprofProfileDumped() { + return 1; +} +COMPILER_RT_VISIBILITY void lprofSetProfileDumped(unsigned Value) {} + +COMPILER_RT_VISIBILITY unsigned lprofRuntimeCounterRelocation(void) { + return 1; +} +COMPILER_RT_VISIBILITY void lprofSetRuntimeCounterRelocation(unsigned Value) {} + +/* VMO that contains the profile data for this module. */ static zx_handle_t __llvm_profile_vmo; /* Current offset within the VMO where data should be written next. 
*/ static uint64_t __llvm_profile_offset; @@ -79,6 +89,11 @@ static uint32_t lprofVMOWriter(ProfDataWriter *This, ProfDataIOVec *IOVecs, __llvm_profile_offset += Length; } + /* Record the profile size as a property of the VMO. */ + _zx_object_set_property(__llvm_profile_vmo, ZX_PROP_VMO_CONTENT_SIZE, + &__llvm_profile_offset, + sizeof(__llvm_profile_offset)); + return 0; } @@ -87,43 +102,23 @@ static void initVMOWriter(ProfDataWriter *This) { This->WriterCtx = NULL; } -static int dump(void) { - if (lprofProfileDumped()) { - lprofWrite("LLVM Profile: data not published: already written.\n"); - return 0; - } - - /* Check if there is llvm/runtime version mismatch. */ - if (GET_VERSION(__llvm_profile_get_version()) != INSTR_PROF_RAW_VERSION) { - lprofWrite("LLVM Profile: runtime and instrumentation version mismatch: " - "expected %d, but got %d\n", - INSTR_PROF_RAW_VERSION, - (int)GET_VERSION(__llvm_profile_get_version())); - return -1; - } - - /* Write the profile data into the mapped region. */ - ProfDataWriter VMOWriter; - initVMOWriter(&VMOWriter); - if (lprofWriteData(&VMOWriter, lprofGetVPDataReader(), 0) != 0) - return -1; - - return 0; -} - +/* This method is invoked by the runtime initialization hook + * InstrProfilingRuntime.o if it is linked in. */ COMPILER_RT_VISIBILITY -int __llvm_profile_dump(void) { - int rc = dump(); - lprofSetProfileDumped(); - return rc; -} - -static void dumpWithoutReturn(void) { dump(); } +void __llvm_profile_initialize(void) { + /* This symbol is defined as weak and initialized to -1 by the runtime, but + * the compiler will generate a strong definition initialized to 0 when runtime + * counter relocation is used. */ + if (__llvm_profile_counter_bias == -1) { + lprofWrite("LLVM Profile: counter relocation at runtime is required\n"); + return; + } -static void createVMO(void) { /* Don't create VMO if it has already been created. */ - if (__llvm_profile_vmo != ZX_HANDLE_INVALID) + if (__llvm_profile_vmo != ZX_HANDLE_INVALID) { + lprofWrite("LLVM Profile: VMO has already been created\n"); return; + } const __llvm_profile_data *DataBegin = __llvm_profile_begin_data(); const __llvm_profile_data *DataEnd = __llvm_profile_end_data(); @@ -206,23 +201,4 @@ static void createVMO(void) { (uintptr_t)__llvm_profile_begin_counters() + CountersOffset; } -/* This method is invoked by the runtime initialization hook - * InstrProfilingRuntime.o if it is linked in.
- */ -COMPILER_RT_VISIBILITY -void __llvm_profile_initialize_file(void) { createVMO(); } - -COMPILER_RT_VISIBILITY -int __llvm_profile_register_write_file_atexit(void) { - static bool HasBeenRegistered = false; - - if (HasBeenRegistered) - return 0; - - lprofSetupValueProfiler(); - - HasBeenRegistered = true; - return atexit(dumpWithoutReturn); -} - #endif diff --git a/compiler-rt/lib/profile/InstrProfilingRuntime.cpp b/compiler-rt/lib/profile/InstrProfilingRuntime.cpp index 5dff09d706329..4ea2bb263f5ac 100644 --- a/compiler-rt/lib/profile/InstrProfilingRuntime.cpp +++ b/compiler-rt/lib/profile/InstrProfilingRuntime.cpp @@ -19,9 +19,7 @@ namespace { class RegisterRuntime { public: RegisterRuntime() { - __llvm_profile_initialize_file(); - if (!__llvm_profile_is_continuous_mode_enabled()) - __llvm_profile_register_write_file_atexit(); + __llvm_profile_initialize(); } }; diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index 6daed4a6736c8..1671273174de1 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -9735,9 +9735,6 @@ INTERCEPTOR(void, qsort_r, void *base, SIZE_T nmemb, SIZE_T size, INTERCEPTOR(int, sigaltstack, void *ss, void *oss) { void *ctx; COMMON_INTERCEPTOR_ENTER(ctx, sigaltstack, ss, oss); - if (ss != nullptr) { - COMMON_INTERCEPTOR_READ_RANGE(ctx, ss, struct_stack_t_sz); - } int r = REAL(sigaltstack)(ss, oss); if (r == 0 && oss != nullptr) { COMMON_INTERCEPTOR_WRITE_RANGE(ctx, oss, struct_stack_t_sz); @@ -9749,6 +9746,40 @@ INTERCEPTOR(int, sigaltstack, void *ss, void *oss) { #define INIT_SIGALTSTACK #endif +#if SANITIZER_INTERCEPT_UNAME +INTERCEPTOR(int, uname, struct utsname *utsname) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, uname, utsname); + int res = REAL(uname)(utsname); + if (!res) + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, utsname, + __sanitizer::struct_utsname_sz); + return res; +} +#define INIT_UNAME COMMON_INTERCEPT_FUNCTION(uname) +#else +#define INIT_UNAME +#endif + +#if SANITIZER_INTERCEPT___XUNAME +// FreeBSD's define uname() as +// static __inline int uname(struct utsname *name) { +// return __xuname(SYS_NMLN, (void*)name); +// } +INTERCEPTOR(int, __xuname, int size, void *utsname) { + void *ctx; + COMMON_INTERCEPTOR_ENTER(ctx, __xuname, size, utsname); + int res = REAL(__xuname)(size, utsname); + if (!res) + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, utsname, + __sanitizer::struct_utsname_sz); + return res; +} +#define INIT___XUNAME COMMON_INTERCEPT_FUNCTION(__xuname) +#else +#define INIT___XUNAME +#endif + #include "sanitizer_common_interceptors_netbsd_compat.inc" static void InitializeCommonInterceptors() { @@ -10055,6 +10086,8 @@ static void InitializeCommonInterceptors() { INIT_QSORT; INIT_QSORT_R; INIT_SIGALTSTACK; + INIT_UNAME; + INIT___XUNAME; INIT___PRINTF_CHK; } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_coverage_fuchsia.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_coverage_fuchsia.cpp index f18cee66b8431..a52db08433e3b 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_coverage_fuchsia.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_coverage_fuchsia.cpp @@ -27,15 +27,15 @@ #include "sanitizer_platform.h" #if SANITIZER_FUCHSIA +#include +#include +#include + #include "sanitizer_atomic.h" #include "sanitizer_common.h" #include "sanitizer_internal_defs.h" #include "sanitizer_symbolizer_fuchsia.h" -#include -#include -#include - using 
namespace __sanitizer; namespace __sancov { @@ -82,7 +82,8 @@ class TracePcGuardController final { void TracePcGuard(u32 *guard, uptr pc) { atomic_uint32_t *guard_ptr = reinterpret_cast(guard); u32 idx = atomic_exchange(guard_ptr, 0, memory_order_relaxed); - if (idx > 0) array_[idx] = pc; + if (idx > 0) + array_[idx] = pc; } void Dump() { @@ -140,6 +141,10 @@ class TracePcGuardController final { internal_getpid()); _zx_object_set_property(vmo_, ZX_PROP_NAME, vmo_name_, internal_strlen(vmo_name_)); + uint64_t size = DataSize(); + status = _zx_object_set_property(vmo_, ZX_PROP_VMO_CONTENT_SIZE, &size, + sizeof(size)); + CHECK_EQ(status, ZX_OK); // Map the largest possible view we might need into the VMO. Later // we might need to increase the VMO's size before we can use larger @@ -172,6 +177,10 @@ class TracePcGuardController final { zx_status_t status = _zx_vmo_set_size(vmo_, DataSize()); CHECK_EQ(status, ZX_OK); + uint64_t size = DataSize(); + status = _zx_object_set_property(vmo_, ZX_PROP_VMO_CONTENT_SIZE, &size, + sizeof(size)); + CHECK_EQ(status, ZX_OK); return first_index; } @@ -204,13 +213,15 @@ SANITIZER_INTERFACE_ATTRIBUTE void __sanitizer_dump_coverage(const uptr *pcs, } SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_cov_trace_pc_guard, u32 *guard) { - if (!*guard) return; + if (!*guard) + return; __sancov::pc_guard_controller.TracePcGuard(guard, GET_CALLER_PC() - 1); } SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_cov_trace_pc_guard_init, u32 *start, u32 *end) { - if (start == end || *start) return; + if (start == end || *start) + return; __sancov::pc_guard_controller.InitTracePcGuard(start, end); } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_file.h b/compiler-rt/lib/sanitizer_common/sanitizer_file.h index 4a78a0e0ac881..26681f0493d73 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_file.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_file.h @@ -87,8 +87,8 @@ bool IsAbsolutePath(const char *path); // The child process will close all fds after STDERR_FILENO // before passing control to a program. pid_t StartSubprocess(const char *filename, const char *const argv[], - fd_t stdin_fd = kInvalidFd, fd_t stdout_fd = kInvalidFd, - fd_t stderr_fd = kInvalidFd); + const char *const envp[], fd_t stdin_fd = kInvalidFd, + fd_t stdout_fd = kInvalidFd, fd_t stderr_fd = kInvalidFd); // Checks if specified process is still running bool IsProcessRunning(pid_t pid); // Waits for the process to finish and returns its exit code. diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_s390.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_s390.cpp index 9e3b4f13a4365..16b4c9b633d8f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_s390.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_s390.cpp @@ -15,14 +15,15 @@ #if SANITIZER_LINUX && SANITIZER_S390 -#include "sanitizer_libc.h" -#include "sanitizer_linux.h" - +#include #include #include #include #include +#include "sanitizer_libc.h" +#include "sanitizer_linux.h" + namespace __sanitizer { // --------------- sanitizer_libc.h @@ -122,8 +123,12 @@ static bool FixedCVE_2016_2143() { // adjust this for their own kernels. struct utsname buf; unsigned int major, minor, patch = 0; + // Depending on the concrete sanitizer being used, uname may or may not + // be intercepted. Make sure we use the libc version in either case. + using Uname = int (*)(struct utsname *); + Uname uname = reinterpret_cast(dlsym(RTLD_NEXT, "uname")); // This should never fail, but just in case... 
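// Illustrative only, not part of the patch: the general RTLD_NEXT pattern used
// just above to reach the libc uname() even when a sanitizer interceptor with
// the same name is linked in front of it.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE  // RTLD_NEXT is a GNU extension on glibc.
#endif
#include <dlfcn.h>
#include <sys/utsname.h>

static int CallRealUname(struct utsname *Buf) {
  using UnameFn = int (*)(struct utsname *);
  // Resolve the next definition of "uname" after the current module, i.e. the
  // real libc symbol rather than an interposed one.
  UnameFn RealUname = reinterpret_cast<UnameFn>(dlsym(RTLD_NEXT, "uname"));
  return RealUname ? RealUname(Buf) : -1;
}

int main() {
  utsname Buf;
  return CallRealUname(&Buf) == 0 ? 0 : 1;
}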
- if (uname(&buf)) + if (uname == nullptr || uname(&buf)) return false; const char *ptr = buf.release; major = internal_simple_strtoll(ptr, &ptr, 10); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp index c025f70df028a..618f902a3098c 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp @@ -246,7 +246,8 @@ int internal_sysctlbyname(const char *sname, void *oldp, uptr *oldlenp, (size_t)newlen); } -static fd_t internal_spawn_impl(const char *argv[], pid_t *pid) { +static fd_t internal_spawn_impl(const char *argv[], const char *envp[], + pid_t *pid) { fd_t master_fd = kInvalidFd; fd_t slave_fd = kInvalidFd; @@ -302,8 +303,8 @@ static fd_t internal_spawn_impl(const char *argv[], pid_t *pid) { // posix_spawn char **argv_casted = const_cast(argv); - char **env = GetEnviron(); - res = posix_spawn(pid, argv[0], &acts, &attrs, argv_casted, env); + char **envp_casted = const_cast(envp); + res = posix_spawn(pid, argv[0], &acts, &attrs, argv_casted, envp_casted); if (res != 0) return kInvalidFd; // Disable echo in the new terminal, disable CR. @@ -320,7 +321,7 @@ static fd_t internal_spawn_impl(const char *argv[], pid_t *pid) { return fd; } -fd_t internal_spawn(const char *argv[], pid_t *pid) { +fd_t internal_spawn(const char *argv[], const char *envp[], pid_t *pid) { // The client program may close its stdin and/or stdout and/or stderr thus // allowing open/posix_openpt to reuse file descriptors 0, 1 or 2. In this // case the communication is broken if either the parent or the child tries to @@ -335,7 +336,7 @@ fd_t internal_spawn(const char *argv[], pid_t *pid) { break; } - fd_t fd = internal_spawn_impl(argv, pid); + fd_t fd = internal_spawn_impl(argv, envp, pid); for (; count > 0; count--) { internal_close(low_fds[count]); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h index 8d08c0de1770a..9dd6d285f5940 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h @@ -597,5 +597,7 @@ (SI_POSIX && !SI_IOSSIM && !SI_WATCHOS && !SI_TVOS && !SI_ANDROID) #define SANITIZER_INTERCEPT_QSORT_R (SI_LINUX && !SI_ANDROID) #define SANITIZER_INTERCEPT_SIGALTSTACK SI_POSIX +#define SANITIZER_INTERCEPT_UNAME (SI_POSIX && !SI_FREEBSD) +#define SANITIZER_INTERCEPT___XUNAME SI_FREEBSD #endif // #ifndef SANITIZER_PLATFORM_INTERCEPTORS_H diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h index 70c71f04d2d34..a1b49702da23b 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix.h @@ -63,7 +63,7 @@ uptr internal_ptrace(int request, int pid, void *addr, void *data); uptr internal_waitpid(int pid, int *status, int options); int internal_fork(); -fd_t internal_spawn(const char *argv[], pid_t *pid); +fd_t internal_spawn(const char *argv[], const char *envp[], pid_t *pid); int internal_sysctl(const int *name, unsigned int namelen, void *oldp, uptr *oldlenp, const void *newp, uptr newlen); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp index 304b3a01a08b6..f920172c06d63 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp +++ 
b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp @@ -426,7 +426,8 @@ void AdjustStackSize(void *attr_) { #endif // !SANITIZER_GO pid_t StartSubprocess(const char *program, const char *const argv[], - fd_t stdin_fd, fd_t stdout_fd, fd_t stderr_fd) { + const char *const envp[], fd_t stdin_fd, fd_t stdout_fd, + fd_t stderr_fd) { auto file_closer = at_scope_exit([&] { if (stdin_fd != kInvalidFd) { internal_close(stdin_fd); @@ -469,7 +470,8 @@ pid_t StartSubprocess(const char *program, const char *const argv[], for (int fd = sysconf(_SC_OPEN_MAX); fd > 2; fd--) internal_close(fd); - execv(program, const_cast(&argv[0])); + internal_execve(program, const_cast(&argv[0]), + const_cast(envp)); internal__exit(1); } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h index c04797dd61b8b..0639543308422 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_internal.h @@ -86,6 +86,8 @@ class SymbolizerProcess { // Customizable by subclasses. virtual bool StartSymbolizerSubprocess(); virtual bool ReadFromSymbolizer(char *buffer, uptr max_length); + // Return the environment to run the symbolizer in. + virtual char **GetEnvP() { return GetEnviron(); } private: virtual bool ReachedEndOfOutput(const char *buffer, uptr length) const { diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp index c123ecb11206c..4c3cd966dd5a0 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_posix_libcdep.cpp @@ -153,7 +153,7 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() { if (use_posix_spawn_) { #if SANITIZER_MAC - fd_t fd = internal_spawn(argv, &pid); + fd_t fd = internal_spawn(argv, const_cast(GetEnvP()), &pid); if (fd == kInvalidFd) { Report("WARNING: failed to spawn external symbolizer (errno: %d)\n", errno); @@ -173,7 +173,7 @@ bool SymbolizerProcess::StartSymbolizerSubprocess() { return false; } - pid = StartSubprocess(path_, argv, /* stdin */ outfd[0], + pid = StartSubprocess(path_, argv, GetEnvP(), /* stdin */ outfd[0], /* stdout */ infd[1]); if (pid < 0) { internal_close(infd[0]); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp index 73dc042b69f17..fca15beb61612 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp @@ -1064,7 +1064,8 @@ char **GetEnviron() { } pid_t StartSubprocess(const char *program, const char *const argv[], - fd_t stdin_fd, fd_t stdout_fd, fd_t stderr_fd) { + const char *const envp[], fd_t stdin_fd, fd_t stdout_fd, + fd_t stderr_fd) { // FIXME: implement on this platform // Should be implemented based on // SymbolizerProcess::StarAtSymbolizerSubprocess diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp index 1d8e7e8af26ca..cb6c0724ac884 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp @@ -264,7 +264,7 @@ TEST(SanitizerCommon, StartSubprocessTest) { const char *shell = "/bin/sh"; #endif const char *argv[] = {shell, "-c", "echo -n 'hello'", (char *)NULL}; - int pid = StartSubprocess(shell, 
argv, + int pid = StartSubprocess(shell, argv, GetEnviron(), /* stdin */ kInvalidFd, /* stdout */ pipe_fds[1]); ASSERT_GT(pid, 0); diff --git a/compiler-rt/lib/scudo/standalone/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/CMakeLists.txt index 91b48b7064d32..bdaeb569efdd1 100644 --- a/compiler-rt/lib/scudo/standalone/CMakeLists.txt +++ b/compiler-rt/lib/scudo/standalone/CMakeLists.txt @@ -3,7 +3,7 @@ if (COMPILER_RT_HAS_GWP_ASAN) add_dependencies(scudo_standalone gwp_asan) endif() -include_directories(../..) +include_directories(../.. include) set(SCUDO_CFLAGS) @@ -56,7 +56,6 @@ set(SCUDO_HEADERS flags.h flags_parser.h fuchsia.h - interface.h internal_defs.h linux.h list.h @@ -78,6 +77,8 @@ set(SCUDO_HEADERS vector.h wrappers_c_checks.h wrappers_c.h + + include/scudo/interface.h ) set(SCUDO_SOURCES diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h index 8456dc82d20ec..1fffea4dc5c21 100644 --- a/compiler-rt/lib/scudo/standalone/combined.h +++ b/compiler-rt/lib/scudo/standalone/combined.h @@ -13,7 +13,6 @@ #include "common.h" #include "flags.h" #include "flags_parser.h" -#include "interface.h" #include "local_cache.h" #include "memtag.h" #include "quarantine.h" @@ -22,6 +21,8 @@ #include "string_utils.h" #include "tsd.h" +#include "scudo/interface.h" + #ifdef GWP_ASAN_HOOKS #include "gwp_asan/guarded_pool_allocator.h" #include "gwp_asan/optional/backtrace.h" @@ -260,8 +261,8 @@ class Allocator { } DCHECK_LE(Size, NeededSize); - void *Block; - uptr ClassId; + void *Block = nullptr; + uptr ClassId = 0; uptr SecondaryBlockEnd; if (LIKELY(PrimaryT::canAllocate(NeededSize))) { ClassId = SizeClassMap::getClassIdBySize(NeededSize); @@ -273,20 +274,19 @@ class Allocator { // is the region being full. In that event, retry once using the // immediately larger class (except if the failing class was already the // largest). This will waste some memory but will allow the application to - // not fail. - if (SCUDO_ANDROID) { - if (UNLIKELY(!Block)) { - if (ClassId < SizeClassMap::LargestClassId) - Block = TSD->Cache.allocate(++ClassId); - } + // not fail. If dealing with the largest class, fallback to the Secondary. 
+ if (UNLIKELY(!Block)) { + if (ClassId < SizeClassMap::LargestClassId) + Block = TSD->Cache.allocate(++ClassId); + else + ClassId = 0; } if (UnlockRequired) TSD->unlock(); - } else { - ClassId = 0; + } + if (UNLIKELY(ClassId == 0)) Block = Secondary.allocate(NeededSize, Alignment, &SecondaryBlockEnd, ZeroContents); - } if (UNLIKELY(!Block)) { if (Options.MayReturnNull) diff --git a/compiler-rt/lib/scudo/standalone/flags.cpp b/compiler-rt/lib/scudo/standalone/flags.cpp index dd9f050a2d20c..de5153b288b14 100644 --- a/compiler-rt/lib/scudo/standalone/flags.cpp +++ b/compiler-rt/lib/scudo/standalone/flags.cpp @@ -9,7 +9,8 @@ #include "flags.h" #include "common.h" #include "flags_parser.h" -#include "interface.h" + +#include "scudo/interface.h" namespace scudo { diff --git a/compiler-rt/lib/scudo/standalone/interface.h b/compiler-rt/lib/scudo/standalone/include/scudo/interface.h similarity index 65% rename from compiler-rt/lib/scudo/standalone/interface.h rename to compiler-rt/lib/scudo/standalone/include/scudo/interface.h index e2639823f426c..e527d0a5d3043 100644 --- a/compiler-rt/lib/scudo/standalone/interface.h +++ b/compiler-rt/lib/scudo/standalone/include/scudo/interface.h @@ -1,4 +1,4 @@ -//===-- interface.h ---------------------------------------------*- C++ -*-===// +//===-- scudo/interface.h ---------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -9,18 +9,16 @@ #ifndef SCUDO_INTERFACE_H_ #define SCUDO_INTERFACE_H_ -#include "internal_defs.h" - extern "C" { -WEAK INTERFACE const char *__scudo_default_options(); +__attribute__((weak)) const char *__scudo_default_options(); // Post-allocation & pre-deallocation hooks. // They must be thread-safe and not use heap related functions. -WEAK INTERFACE void __scudo_allocate_hook(void *ptr, size_t size); -WEAK INTERFACE void __scudo_deallocate_hook(void *ptr); +__attribute__((weak)) void __scudo_allocate_hook(void *ptr, size_t size); +__attribute__((weak)) void __scudo_deallocate_hook(void *ptr); -WEAK INTERFACE void __scudo_print_stats(void); +void __scudo_print_stats(void); typedef void (*iterate_callback)(uintptr_t base, size_t size, void *arg); diff --git a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt index e29f158d61f80..78c297ae7e80b 100644 --- a/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt +++ b/compiler-rt/lib/scudo/standalone/tests/CMakeLists.txt @@ -10,6 +10,7 @@ set(SCUDO_UNITTEST_CFLAGS -I${COMPILER_RT_SOURCE_DIR}/include -I${COMPILER_RT_SOURCE_DIR}/lib -I${COMPILER_RT_SOURCE_DIR}/lib/scudo/standalone + -I${COMPILER_RT_SOURCE_DIR}/lib/scudo/standalone/include -DGTEST_HAS_RTTI=0 -DSCUDO_DEBUG=1 # Extra flags for the C++ tests diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp index ce1b2824788da..a6f29a2610ed1 100644 --- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp @@ -417,7 +417,7 @@ TEST(ScudoCombinedTest, ReleaseToOS) { Allocator->releaseToOS(); } -// Verify that when a region gets full, Android will still manage to +// Verify that when a region gets full, the allocator will still manage to // fulfill the allocation through a larger size class. 
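// Illustrative only, not part of the patch: a toy model of the allocation policy
// the combined.h change above generalizes beyond Android: if a size class is
// exhausted, retry once with the next class, and fall back to the secondary
// allocator only for the largest class. All names here are hypothetical.
#include <cassert>
#include <cstddef>
#include <new>

constexpr size_t kClassSizes[] = {32, 64, 128};
constexpr size_t kLargestClassId = 2;

// Toy primary allocator: pretend the region backing class 0 is exhausted.
static void *ClassAllocate(size_t ClassId) {
  return ClassId == 0 ? nullptr : ::operator new(kClassSizes[ClassId]);
}
static void *SecondaryAllocate(size_t Size) { return ::operator new(Size); }

static void *Allocate(size_t Size) {
  size_t ClassId = Size <= 32 ? 0 : (Size <= 64 ? 1 : 2);
  void *Block = ClassAllocate(ClassId);
  if (!Block) {
    if (ClassId < kLargestClassId)
      Block = ClassAllocate(++ClassId);  // Waste a little memory, don't fail.
    else
      ClassId = 0;                       // Largest class failed: use the secondary.
  }
  if (!Block && ClassId == 0)
    Block = SecondaryAllocate(Size);
  return Block;
}

int main() {
  assert(Allocate(16) != nullptr);  // Class 0 is "full"; class 1 serves it instead.
  return 0;
}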
TEST(ScudoCombinedTest, FullRegion) { using AllocatorT = scudo::Allocator; @@ -429,26 +429,25 @@ TEST(ScudoCombinedTest, FullRegion) { Deleter); Allocator->reset(); - const scudo::uptr Size = 1000U; - const scudo::uptr MaxNumberOfChunks = - (1U << DeathRegionSizeLog) / - DeathConfig::DeathSizeClassMap::getSizeByClassId(1U); - void *P; std::vector V; scudo::uptr FailedAllocationsCount = 0; - for (scudo::uptr I = 0; I <= MaxNumberOfChunks; I++) { - P = Allocator->allocate(Size, Origin); - if (!P) - FailedAllocationsCount++; - else - V.push_back(P); + for (scudo::uptr ClassId = 1U; + ClassId <= DeathConfig::DeathSizeClassMap::LargestClassId; ClassId++) { + const scudo::uptr Size = + DeathConfig::DeathSizeClassMap::getSizeByClassId(ClassId); + const scudo::uptr MaxNumberOfChunks = (1U << DeathRegionSizeLog) / Size; + void *P; + for (scudo::uptr I = 0; I <= MaxNumberOfChunks; I++) { + P = Allocator->allocate(Size - 64U, Origin); + if (!P) + FailedAllocationsCount++; + else + V.push_back(P); + } } while (!V.empty()) { Allocator->deallocate(V.back(), Origin); V.pop_back(); } - if (SCUDO_ANDROID) - EXPECT_EQ(FailedAllocationsCount, 0U); - else - EXPECT_GT(FailedAllocationsCount, 0U); + EXPECT_EQ(FailedAllocationsCount, 0U); } diff --git a/compiler-rt/lib/tsan/go/test.c b/compiler-rt/lib/tsan/go/test.c index 61be48442c80c..787b4c5b7dc19 100644 --- a/compiler-rt/lib/tsan/go/test.c +++ b/compiler-rt/lib/tsan/go/test.c @@ -32,6 +32,7 @@ void __tsan_malloc(void *thr, void *pc, void *p, unsigned long sz); void __tsan_free(void *p, unsigned long sz); void __tsan_acquire(void *thr, void *addr); void __tsan_release(void *thr, void *addr); +void __tsan_release_acquire(void *thr, void *addr); void __tsan_release_merge(void *thr, void *addr); void *current_proc; @@ -77,6 +78,7 @@ int main(void) { __tsan_func_enter(thr0, (char*)&main + 1); __tsan_malloc(thr0, (char*)&barfoo + 1, buf, 10); __tsan_release(thr0, buf); + __tsan_release_acquire(thr0, buf); __tsan_release_merge(thr0, buf); void *thr1 = 0; __tsan_go_start(thr0, &thr1, (char*)&barfoo + 1); diff --git a/compiler-rt/lib/tsan/go/tsan_go.cpp b/compiler-rt/lib/tsan/go/tsan_go.cpp index f5998c0c78166..77987f43bf54c 100644 --- a/compiler-rt/lib/tsan/go/tsan_go.cpp +++ b/compiler-rt/lib/tsan/go/tsan_go.cpp @@ -244,6 +244,10 @@ void __tsan_acquire(ThreadState *thr, void *addr) { Acquire(thr, 0, (uptr)addr); } +void __tsan_release_acquire(ThreadState *thr, void *addr) { + ReleaseStoreAcquire(thr, 0, (uptr)addr); +} + void __tsan_release(ThreadState *thr, void *addr) { ReleaseStore(thr, 0, (uptr)addr); } diff --git a/compiler-rt/lib/tsan/rtl/tsan_clock.cpp b/compiler-rt/lib/tsan/rtl/tsan_clock.cpp index 2aeb18df1ed20..7989a90837cd5 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_clock.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_clock.cpp @@ -30,6 +30,14 @@ // dst->clock[i] = max(dst->clock[i], clock[i]); // } // +// void ThreadClock::releaseStoreAcquire(SyncClock *sc) const { +// for (int i = 0; i < kMaxThreads; i++) { +// tmp = clock[i]; +// clock[i] = max(clock[i], sc->clock[i]); +// sc->clock[i] = tmp; +// } +// } +// // void ThreadClock::ReleaseStore(SyncClock *dst) const { // for (int i = 0; i < kMaxThreads; i++) // dst->clock[i] = clock[i]; @@ -177,6 +185,36 @@ void ThreadClock::acquire(ClockCache *c, SyncClock *src) { } } +void ThreadClock::releaseStoreAcquire(ClockCache *c, SyncClock *sc) { + DCHECK_LE(nclk_, kMaxTid); + DCHECK_LE(dst->size_, kMaxTid); + + if (sc->size_ == 0) { + // ReleaseStore will correctly set release_store_tid_, + // which can be important 
for future operations. + ReleaseStore(c, sc); + return; + } + + // Check if we need to resize dst. + if (sc->size_ < nclk_) + sc->Resize(c, nclk_); + + sc->Unshare(c); + // Update sc->clk_. + sc->FlushDirty(); + uptr i = 0; + for (ClockElem &ce : *sc) { + u64 tmp = clk_[i]; + clk_[i] = max(ce.epoch, clk_[i]); + ce.epoch = tmp; + ce.reused = 0; + i++; + } + sc->release_store_tid_ = kInvalidTid; + sc->release_store_reused_ = 0; +} + void ThreadClock::release(ClockCache *c, SyncClock *dst) { DCHECK_LE(nclk_, kMaxTid); DCHECK_LE(dst->size_, kMaxTid); diff --git a/compiler-rt/lib/tsan/rtl/tsan_clock.h b/compiler-rt/lib/tsan/rtl/tsan_clock.h index 6a1d15a2a16de..c66431f54ee4c 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_clock.h +++ b/compiler-rt/lib/tsan/rtl/tsan_clock.h @@ -134,6 +134,7 @@ class ThreadClock { uptr size() const; void acquire(ClockCache *c, SyncClock *src); + void releaseStoreAcquire(ClockCache *c, SyncClock *src); void release(ClockCache *c, SyncClock *dst); void acq_rel(ClockCache *c, SyncClock *dst); void ReleaseStore(ClockCache *c, SyncClock *dst); diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp index fe469faad2a2d..13c9b770f50a3 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.cpp @@ -495,14 +495,23 @@ int Finalize(ThreadState *thr) { void ForkBefore(ThreadState *thr, uptr pc) { ctx->thread_registry->Lock(); ctx->report_mtx.Lock(); + // Ignore memory accesses in the pthread_atfork callbacks. + // If any of them triggers a data race we will deadlock + // on the report_mtx. + // We could ignore interceptors and sync operations as well, + // but so far it's unclear if it will do more good or harm. + // Unnecessarily ignoring things can lead to false positives later. + ThreadIgnoreBegin(thr, pc); } void ForkParentAfter(ThreadState *thr, uptr pc) { + ThreadIgnoreEnd(thr, pc); // Begin is in ForkBefore. ctx->report_mtx.Unlock(); ctx->thread_registry->Unlock(); } void ForkChildAfter(ThreadState *thr, uptr pc) { + ThreadIgnoreEnd(thr, pc); // Begin is in ForkBefore. ctx->report_mtx.Unlock(); ctx->thread_registry->Unlock(); diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl.h b/compiler-rt/lib/tsan/rtl/tsan_rtl.h index 20f7a99157aba..d3bb61ff87d3f 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl.h +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl.h @@ -813,10 +813,12 @@ void Acquire(ThreadState *thr, uptr pc, uptr addr); // approximation of the actual required synchronization. 
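// Illustrative only, not part of the patch: the per-element effect of the new
// releaseStoreAcquire() primitive, following the pseudocode comment added to
// tsan_clock.cpp above. The thread clock acquires the sync clock while the sync
// clock is overwritten with the thread's previous clock (toy 3-thread example).
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Thread[3] = {5, 1, 0};  // current thread's vector clock
  uint64_t Sync[3] = {2, 4, 7};    // clock stored in the sync variable
  for (int I = 0; I < 3; I++) {
    uint64_t Tmp = Thread[I];
    Thread[I] = std::max(Thread[I], Sync[I]);  // acquire side
    Sync[I] = Tmp;                             // release-store side
  }
  const uint64_t ExpectThread[3] = {5, 4, 7};
  const uint64_t ExpectSync[3] = {5, 1, 0};
  for (int I = 0; I < 3; I++) {
    assert(Thread[I] == ExpectThread[I]);
    assert(Sync[I] == ExpectSync[I]);
  }
  return 0;
}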
void AcquireGlobal(ThreadState *thr, uptr pc); void Release(ThreadState *thr, uptr pc, uptr addr); +void ReleaseStoreAcquire(ThreadState *thr, uptr pc, uptr addr); void ReleaseStore(ThreadState *thr, uptr pc, uptr addr); void AfterSleep(ThreadState *thr, uptr pc); void AcquireImpl(ThreadState *thr, uptr pc, SyncClock *c); void ReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c); +void ReleaseStoreAcquireImpl(ThreadState *thr, uptr pc, SyncClock *c); void ReleaseStoreImpl(ThreadState *thr, uptr pc, SyncClock *c); void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c); diff --git a/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp b/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp index ce6e7cb2c4ef0..bca194f064b40 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp @@ -429,6 +429,18 @@ void AcquireGlobal(ThreadState *thr, uptr pc) { UpdateClockCallback, thr); } +void ReleaseStoreAcquire(ThreadState *thr, uptr pc, uptr addr) { + DPrintf("#%d: ReleaseStoreAcquire %zx\n", thr->tid, addr); + if (thr->ignore_sync) + return; + SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, addr, true); + thr->fast_state.IncrementEpoch(); + // Can't increment epoch w/o writing to the trace as well. + TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0); + ReleaseStoreAcquireImpl(thr, pc, &s->clock); + s->mtx.Unlock(); +} + void Release(ThreadState *thr, uptr pc, uptr addr) { DPrintf("#%d: Release %zx\n", thr->tid, addr); if (thr->ignore_sync) @@ -482,6 +494,15 @@ void AcquireImpl(ThreadState *thr, uptr pc, SyncClock *c) { StatInc(thr, StatSyncAcquire); } +void ReleaseStoreAcquireImpl(ThreadState *thr, uptr pc, SyncClock *c) { + if (thr->ignore_sync) + return; + thr->clock.set(thr->fast_state.epoch()); + thr->fast_synch_epoch = thr->fast_state.epoch(); + thr->clock.releaseStoreAcquire(&thr->proc()->clock_cache, c); + StatInc(thr, StatSyncReleaseStoreAcquire); +} + void ReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) { if (thr->ignore_sync) return; diff --git a/compiler-rt/lib/tsan/rtl/tsan_stat.h b/compiler-rt/lib/tsan/rtl/tsan_stat.h index 94e18bc66df94..8b26a59bb2ed7 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_stat.h +++ b/compiler-rt/lib/tsan/rtl/tsan_stat.h @@ -68,6 +68,7 @@ enum StatType { StatSyncDestroyed, StatSyncAcquire, StatSyncRelease, + StatSyncReleaseStoreAcquire, // Clocks - acquire. StatClockAcquire, diff --git a/compiler-rt/lib/xray/xray_utils.cpp b/compiler-rt/lib/xray/xray_utils.cpp index 1036d17a7725b..4c8ad5b92be73 100644 --- a/compiler-rt/lib/xray/xray_utils.cpp +++ b/compiler-rt/lib/xray/xray_utils.cpp @@ -69,6 +69,10 @@ void LogWriter::WriteAll(const char *Begin, const char *End) XRAY_NEVER_INSTRUME return; } Offset += TotalBytes; + + // Record the data size as a property of the VMO. 
+ _zx_object_set_property(Vmo, ZX_PROP_VMO_CONTENT_SIZE, + &Offset, sizeof(Offset)); } void LogWriter::Flush() XRAY_NEVER_INSTRUMENT { diff --git a/compiler-rt/test/msan/sigaltstack.cpp b/compiler-rt/test/msan/sigaltstack.cpp index 4b97bb461d47c..c1b8b7eefee16 100644 --- a/compiler-rt/test/msan/sigaltstack.cpp +++ b/compiler-rt/test/msan/sigaltstack.cpp @@ -1,4 +1,4 @@ -// RUN: %clangxx_msan -O0 -g %s -o %t && not %run %t +// RUN: %clangxx_msan -O0 -g %s -o %t && %run %t // #include #include @@ -11,10 +11,5 @@ int main(void) { assert(sigaltstack(nullptr, &old_ss) == 0); __msan_check_mem_is_initialized(&old_ss, sizeof(stack_t)); - stack_t ss; - sigaltstack(&ss, nullptr); -// CHECK: WARNING: MemorySanitizer: use-of-uninitialized-value -// CHECK: in main {{.*}}sigaltstack.cpp:15 - return 0; } diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/uname.c b/compiler-rt/test/sanitizer_common/TestCases/Posix/uname.c new file mode 100644 index 0000000000000..0bf7e0fd98e37 --- /dev/null +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/uname.c @@ -0,0 +1,13 @@ +// RUN: %clang %s -o %t && %run %t + +#include +#include +#include + +int main() { + struct utsname buf; + int err = uname(&buf); + assert(err == 0); + printf("%s %s %s %s %s\n", buf.sysname, buf.nodename, buf.release, + buf.version, buf.machine); +} diff --git a/compiler-rt/test/tsan/pthread_atfork_deadlock2.c b/compiler-rt/test/tsan/pthread_atfork_deadlock2.c new file mode 100644 index 0000000000000..700507c1e637c --- /dev/null +++ b/compiler-rt/test/tsan/pthread_atfork_deadlock2.c @@ -0,0 +1,49 @@ +// RUN: %clang_tsan -O1 %s -o %t && %run %t 2>&1 | FileCheck %s +// Regression test for +// https://groups.google.com/d/msg/thread-sanitizer/e_zB9gYqFHM/DmAiTsrLAwAJ +// pthread_atfork() callback triggers a data race and we deadlocked +// on the report_mtx as we lock it around fork. +#include "test.h" +#include +#include +#include + +int glob = 0; + +void *worker(void *unused) { + glob++; + barrier_wait(&barrier); + return NULL; +} + +void atfork() { + glob++; +} + +int main() { + barrier_init(&barrier, 2); + pthread_atfork(atfork, NULL, NULL); + pthread_t t; + pthread_create(&t, NULL, worker, NULL); + barrier_wait(&barrier); + pid_t pid = fork(); + if (pid < 0) { + fprintf(stderr, "fork failed: %d\n", errno); + return 1; + } + if (pid == 0) { + fprintf(stderr, "CHILD\n"); + return 0; + } + if (pid != waitpid(pid, NULL, 0)) { + fprintf(stderr, "waitpid failed: %d\n", errno); + return 1; + } + pthread_join(t, NULL); + fprintf(stderr, "PARENT\n"); + return 0; +} + +// CHECK-NOT: ThreadSanitizer: data race +// CHECK: CHILD +// CHECK: PARENT diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 4ee4d7dca5733..40c7f707872ba 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -32,5 +32,7 @@ add_subdirectory(utils) # and libraries potentially draw from the components present in all # of the other directories. 
add_subdirectory(lib) -add_subdirectory(test) -add_subdirectory(fuzzing) +if(LLVM_INCLUDE_TESTS) + add_subdirectory(test) + add_subdirectory(fuzzing) +endif() diff --git a/libc/config/linux/api.td b/libc/config/linux/api.td index e9391cb52febb..f520947b998ab 100644 --- a/libc/config/linux/api.td +++ b/libc/config/linux/api.td @@ -71,10 +71,53 @@ def AssertAPI : PublicAPI<"assert.h"> { ]; } +def MathErrHandlingMacro : MacroDef<"math_errhandling"> { + let Defn = [{ + #ifndef math_errhandling + #ifdef __FAST_MATH__ + #define math_errhandling 0 + #elif defined __NO_MATH_ERRNO__ + #define math_errhandling (MATH_ERREXCEPT) + #else + #define math_errhandling (MATH_ERRNO | MATH_ERREXCEPT) + #endif + #endif // math_errhandling not defined + }]; +} + +def IsFiniteMacro : MacroDef<"isfinite"> { + let Defn = [{ + #define isfinite(x) __builtin_isfinite(x) + }]; +} + +def IsInfMacro : MacroDef<"isinf"> { + let Defn = [{ + #define isinf(x) __builtin_isinf(x) + }]; +} + +def IsNanMacro : MacroDef<"isnan"> { + let Defn = [{ + #define isnan(x) __builtin_isnan(x) + }]; +} + def MathAPI : PublicAPI<"math.h"> { + let Macros = [ + SimpleMacroDef<"MATH_ERRNO", "1">, + SimpleMacroDef<"MATH_ERREXCEPT", "2">, + MathErrHandlingMacro, + + SimpleMacroDef<"INFINITY", "__builtin_inff()">, + SimpleMacroDef<"NAN", "__builtin_nanf(\"\")">, + + IsFiniteMacro, + IsInfMacro, + IsNanMacro, + ]; let Functions = [ - "acos", - "acosl", + "round", ]; } @@ -194,9 +237,16 @@ def StructSigactionDefn : TypeDecl<"struct sigaction"> { }]; } +def SighandlerTDefn : TypeDecl<"__sighandler_t"> { + let Decl = [{ + typedef void(*__sighandler_t)(int); + }]; +} + def SignalAPI : PublicAPI<"signal.h"> { let TypeDeclarations = [ StructSigactionDefn, + SighandlerTDefn, ]; let Functions = [ @@ -205,6 +255,7 @@ def SignalAPI : PublicAPI<"signal.h"> { "sigprocmask", "sigemptyset", "sigaddset", + "signal", ]; } diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 2ff04231e4060..8e48a636a8ea4 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -19,10 +19,10 @@ add_header( llvm_libc_common_h ) -add_header( +add_gen_header( math_h - HDR - math.h + DEF_FILE math.h.def + GEN_HDR math.h DEPENDS llvm_libc_common_h ) diff --git a/libc/include/math.h b/libc/include/math.h deleted file mode 100644 index 6cd258b3c633e..0000000000000 --- a/libc/include/math.h +++ /dev/null @@ -1,360 +0,0 @@ -//===----------------- C standard library header math.h -----------------*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_MATH_H -#define LLVM_LIBC_MATH_H - -#include <__llvm-libc-common.h> - -__BEGIN_C_DECLS - -double acos(double); - -float acosf(float); - -long double acosl(long double); - -double asin(double); - -float asinf(float); - -long double asinl(long double); - -double atan(double); - -float atanf(float); - -long double atanl(long double); - -double atan2(double, double); - -float atan2f(float, float); - -long double atan2l(long double, long double); - -double cos(double); - -float cosf(float); - -long double cosl(long double); - -double sin(double); - -float sinf(float); - -long double sinl(long double); - -double tan(double); - -float tanf(float); - -long double tanl(long double); - -double acosh(double); - -float acoshf(float); - -long double acoshl(long double); - -double asinh(double); - -float asinhf(float); - -long double asinhl(long double); - -double atanh(double); - -float atanhf(float); - -long double atanhl(long double); - -double cosh(double); - -float coshf(float); - -long double coshl(long double); - -double sinh(double); - -float sinhf(float); - -long double sinhl(long double); - -double tanh(double); - -float tanhf(float); - -long double tanhl(long double); - -double exp(double); - -float expf(float); - -long double expl(long double); - -double exp2(double); - -float exp2f(float); - -long double exp2l(long double); - -double expm1(double); - -float expm1f(float); - -long double expm1l(long double); - -double frexp(double, int); - -float frexpf(float, int); - -long double frexpl(long double, int); - -int ilogb(double); - -int ilogbf(float); - -int ilogbl(long double); - -double ldexp(double, int); - -float ldexpf(float, int); - -long double ldexpl(long double, int); - -double log(double); - -float logf(float); - -long double logl(long double); - -double log10(double); - -float log10f(float); - -long double log10l(long double); - -double log1p(double); - -float log1pf(float); - -long double log1pl(long double); - -double log2(double); - -float log2f(float); - -long double log2l(long double); - -double logb(double); - -float logbf(float); - -long double logbl(long double); - -double modf(double, double); - -float modff(float, float); - -long double modfl(long double, long double); - -double scalbn(double, int); - -float scalbnf(float, int); - -long double scalbnl(long double, int); - -double scalbln(double, long int); - -float scalblnf(float, long int); - -long double scalblnl(long double, long int); - -double cbrt(double); - -float cbrtf(float); - -long double cbrtl(long double); - -double fabs(double); - -float fabsf(float); - -long double fabsl(long double); - -double hypot(double, double); - -float hypotf(float, float); - -long double hypotl(long double, long double); - -double pow(double, double); - -float powf(float, float); - -long double powl(long double, long double); - -double sqrt(double); - -float sqrtf(float); - -long double sqrtl(long double); - -double erf(double); - -float erff(float); - -long double erfl(long double); - -double erfc(double); - -float erfcf(float); - -long double erfcl(long double); - -double lgamma(double); - -float lgammaf(float); - -long double lgammal(long double); - -double tgamma(double); - -float tgammaf(float); - -long double tgammal(long double); - -double ceil(double); - -float ceilf(float); - -long double ceill(long double); - -double floor(double); - -float floorf(float); - 
-long double floorl(long double); - -double nearbyint(double); - -float nearbyintf(float); - -long double nearbyintl(long double); - -double rint(double); - -float rintf(float); - -long double rintl(long double); - -long int lrint(double); - -long int lrintf(float); - -long int lrintl(long double); - -long long int llrint(double); - -long long int llrintf(float); - -long long int llrintl(long double); - -double round(double); - -float roundf(float); - -long double roundl(long double); - -long int lround(double); - -long int lroundf(float); - -long int lroundl(long double); - -long long int llround(double); - -long long int llroundf(float); - -long long int llroundl(long double); - -double trunc(double); - -float truncf(float); - -long double truncl(long double); - -double fmod(double, double); - -float fmodf(float, float); - -long double fmodl(long double, long double); - -double remainder(double, double); - -float remainderf(float, float); - -long double remainderl(long double, long double); - -double remquo(double, double, int); - -float remquof(float, float, int); - -long double remquol(long double, long double, int); - -double copysign(double, double); - -float copysignf(float, float); - -long double copysignl(long double, long double); - -double nan(const char); - -float nanf(const char); - -long double nanl(const char); - -double nextafter(double, double); - -float nextafterf(float, float); - -long double nextafterl(long double, long double); - -double nexttoward(double, long double); - -float nexttowardf(float, long double); - -long double nexttowardl(long double, long double); - -double fdim(double, double); - -float fdimf(float, float); - -long double fdiml(long double, long double); - -double fmax(double, double); - -double fmaxf(double, double); - -double fmaxl(double, double); - -double fmin(double, double); - -float fminf(float, float); - -long double fminl(long double, long double); - -double fma(double, double, double); - -float fmaf(float, float, float); - -long double fmal(long double, long double, long double); - -__END_C_DECLS - -#endif // LLVM_LIBC_MATH_H diff --git a/libc/include/math.h.def b/libc/include/math.h.def new file mode 100644 index 0000000000000..7ede8545cd5e6 --- /dev/null +++ b/libc/include/math.h.def @@ -0,0 +1,16 @@ +//===----------------- C standard library header math.h -----------------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_MATH_H +#define LLVM_LIBC_MATH_H + +#include <__llvm-libc-common.h> + +%%public_api() + +#endif // LLVM_LIBC_MATH_H diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt index b234c91704a9d..748dac1610431 100644 --- a/libc/lib/CMakeLists.txt +++ b/libc/lib/CMakeLists.txt @@ -23,6 +23,7 @@ add_entrypoint_library( sigaddset sigemptyset sigprocmask + signal # stdlib.h entrypoints _Exit diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index dfac8ebed3c7f..39067bcdda12d 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -16,6 +16,8 @@ def StdC : StandardSpec<"stdc"> { PtrType IntPtr = PtrType; + NamedType SigHandlerT = NamedType<"__sighandler_t">; + HeaderSpec Assert = HeaderSpec< "assert.h", [ @@ -158,15 +160,28 @@ def StdC : StandardSpec<"stdc"> { HeaderSpec Math = HeaderSpec< "math.h", - [], // Macros + [ + Macro<"MATH_ERRNO">, + Macro<"MATH_ERREXCEPT">, + Macro<"math_errhandling">, + + Macro<"INFINITY">, + Macro<"NAN">, + + Macro<"isfinite">, + Macro<"isinf">, + Macro<"isnan">, + ], [ NamedType<"float_t">, NamedType<"double_t">, ], [], // Enumerations [ - FunctionSpec<"acos", RetValSpec, [ArgSpec]>, - FunctionSpec<"acosl", RetValSpec, [ArgSpec]>, + FunctionSpec<"cosf", RetValSpec, [ArgSpec]>, + FunctionSpec<"sinf", RetValSpec, [ArgSpec]>, + + FunctionSpec<"round", RetValSpec, [ArgSpec]>, ] >; @@ -226,10 +241,16 @@ def StdC : StandardSpec<"stdc"> { ], [ SizeTType, + SigHandlerT, ], [], // Enumerations [ FunctionSpec<"raise", RetValSpec, [ArgSpec]>, + FunctionSpec< + "signal", + RetValSpec, + [ArgSpec, ArgSpec] + >, ] >; diff --git a/libc/src/.clang-tidy b/libc/src/.clang-tidy new file mode 100644 index 0000000000000..6d6043a11a3a6 --- /dev/null +++ b/libc/src/.clang-tidy @@ -0,0 +1,6 @@ +Checks: '-*,llvmlibc-*' +HeaderFilterRegex: '.*' +WarningsAsErrors: 'llvmlibc-*' +CheckOptions: + - key: llvmlibc-restrict-system-libc-headers.Includes + value: '-*, linux/*, asm/unistd.h' diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 9501785fe1f4c..e7ad9d2a73fec 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -1 +1,14 @@ -add_subdirectory(round) +add_entrypoint_object( + round + REDIRECTED + SRCS + round.cpp + HDRS + round.h +) + +add_redirector_object( + round_redirector + SRC + round_redirector.cpp +) diff --git a/libc/src/math/round/round.cpp b/libc/src/math/round.cpp similarity index 94% rename from libc/src/math/round/round.cpp rename to libc/src/math/round.cpp index 26eeadb378f18..6031ed29e6da9 100644 --- a/libc/src/math/round/round.cpp +++ b/libc/src/math/round.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/math/round/round.h" +#include "src/math/round.h" #include "src/__support/common.h" diff --git a/libc/src/math/round/round.h b/libc/src/math/round.h similarity index 100% rename from libc/src/math/round/round.h rename to libc/src/math/round.h diff --git a/libc/src/math/round/CMakeLists.txt b/libc/src/math/round/CMakeLists.txt deleted file mode 100644 index e7ad9d2a73fec..0000000000000 --- a/libc/src/math/round/CMakeLists.txt +++ /dev/null @@ -1,14 +0,0 @@ -add_entrypoint_object( - round - REDIRECTED - SRCS - round.cpp - HDRS - round.h -) - -add_redirector_object( - round_redirector - SRC - round_redirector.cpp -) diff --git a/libc/src/math/round/round_redirector.cpp 
b/libc/src/math/round_redirector.cpp similarity index 84% rename from libc/src/math/round/round_redirector.cpp rename to libc/src/math/round_redirector.cpp index 6ee074456daf3..c2847c8228c35 100644 --- a/libc/src/math/round/round_redirector.cpp +++ b/libc/src/math/round_redirector.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// Include okay for this redirector. +// NOLINTNEXTLINE(llvmlibc-restrict-system-libc-headers) #include namespace __llvm_libc { diff --git a/libc/src/signal/linux/CMakeLists.txt b/libc/src/signal/linux/CMakeLists.txt index 1d59b7502f7b6..447818e0fd208 100644 --- a/libc/src/signal/linux/CMakeLists.txt +++ b/libc/src/signal/linux/CMakeLists.txt @@ -84,3 +84,15 @@ add_entrypoint_object( errno_h signal_h ) + +add_entrypoint_object( + signal + SRCS + signal.cpp + HDRS + signal.h + ../signal.h + DEPENDS + sigaction + signal_h +) diff --git a/libc/src/signal/linux/signal.cpp b/libc/src/signal/linux/signal.cpp new file mode 100644 index 0000000000000..5a7a12d6c842c --- /dev/null +++ b/libc/src/signal/linux/signal.cpp @@ -0,0 +1,26 @@ +//===------------------ Linux implementation of signal --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define __LLVM_LIBC_INTERNAL_SIGACTION +#include "src/signal/signal.h" +#include "src/signal/sigaction.h" + +#include "src/__support/common.h" + +namespace __llvm_libc { + +sighandler_t LLVM_LIBC_ENTRYPOINT(signal)(int signum, sighandler_t handler) { + struct __sigaction action, old; + action.sa_handler = handler; + action.sa_flags = SA_RESTART; + // Errno will already be set so no need to worry about changing errno here. + return __llvm_libc::sigaction(signum, &action, &old) == -1 ? SIG_ERR + : old.sa_handler; +} + +} // namespace __llvm_libc diff --git a/libc/src/signal/signal.h b/libc/src/signal/signal.h new file mode 100644 index 0000000000000..fb4e12d89ec51 --- /dev/null +++ b/libc/src/signal/signal.h @@ -0,0 +1,22 @@ +//===------------- Implementation header for signal ------------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_SIGNAL_SIGNAL_H +#define LLVM_LIBC_SRC_SIGNAL_SIGNAL_H + +#include "include/signal.h" + +namespace __llvm_libc { + +using sighandler_t = __sighandler_t; + +sighandler_t signal(int signum, sighandler_t handler); + +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_SIGNAL_SIGNAL_H diff --git a/libc/test/src/signal/CMakeLists.txt b/libc/test/src/signal/CMakeLists.txt index db919b6f5d01d..3fc33c3dbecc0 100644 --- a/libc/test/src/signal/CMakeLists.txt +++ b/libc/test/src/signal/CMakeLists.txt @@ -52,3 +52,18 @@ add_libc_unittest( signal_h __errno_location ) + +add_libc_unittest( + signal_test + SUITE + libc_signal_unittests + SRCS + signal_test.cpp + DEPENDS + signal + signal_h + sigaction + raise + __errno_location + errno_h +) diff --git a/libc/test/src/signal/signal_test.cpp b/libc/test/src/signal/signal_test.cpp new file mode 100644 index 0000000000000..6be38fc27bdc7 --- /dev/null +++ b/libc/test/src/signal/signal_test.cpp @@ -0,0 +1,41 @@ +//===------------------------ Unittests for signal ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "include/errno.h" +#include "include/signal.h" +#include "src/errno/llvmlibc_errno.h" +#include "src/signal/raise.h" +#include "src/signal/signal.h" + +#include "utils/UnitTest/ErrnoSetterMatcher.h" +#include "utils/UnitTest/Test.h" + +using __llvm_libc::testing::ErrnoSetterMatcher::Fails; +using __llvm_libc::testing::ErrnoSetterMatcher::Succeeds; + +TEST(Signal, Invalid) { + llvmlibc_errno = 0; + __llvm_libc::sighandler_t valid = +[](int) {}; + EXPECT_THAT((void *)__llvm_libc::signal(0, valid), + Fails(EINVAL, (void *)SIG_ERR)); + EXPECT_THAT((void *)__llvm_libc::signal(65, valid), + Fails(EINVAL, (void *)SIG_ERR)); +} + +static int sum; +TEST(Signal, Basic) { + // In case test get run multiple times. 
+ sum = 0; + ASSERT_NE(__llvm_libc::signal(SIGUSR1, +[](int) { sum++; }), + SIG_ERR); + ASSERT_THAT(__llvm_libc::raise(SIGUSR1), Succeeds()); + EXPECT_EQ(sum, 1); + for (int i = 0; i < 10; i++) + ASSERT_THAT(__llvm_libc::raise(SIGUSR1), Succeeds()); + EXPECT_EQ(sum, 11); +} diff --git a/libc/utils/UnitTest/Test.cpp b/libc/utils/UnitTest/Test.cpp index 8add925b760b4..8e85cd066eed3 100644 --- a/libc/utils/UnitTest/Test.cpp +++ b/libc/utils/UnitTest/Test.cpp @@ -261,7 +261,7 @@ bool Test::testProcessKilled(RunContext &Ctx, testutils::FunctionCaller *Func, if (Result.timedOut()) { Ctx.markFail(); llvm::outs() << File << ":" << Line << ": FAILURE\n" - << "Process timed out after " << 500 << " miliseconds.\n"; + << "Process timed out after " << 500 << " milliseconds.\n"; return false; } @@ -305,7 +305,7 @@ bool Test::testProcessExits(RunContext &Ctx, testutils::FunctionCaller *Func, if (Result.timedOut()) { Ctx.markFail(); llvm::outs() << File << ":" << Line << ": FAILURE\n" - << "Process timed out after " << 500 << " miliseconds.\n"; + << "Process timed out after " << 500 << " milliseconds.\n"; return false; } diff --git a/libc/utils/UnitTest/Test.h b/libc/utils/UnitTest/Test.h index 7982cd66a8ccf..3c042de4c12d5 100644 --- a/libc/utils/UnitTest/Test.h +++ b/libc/utils/UnitTest/Test.h @@ -249,13 +249,20 @@ class Test { #define UNIQUE_VAR(prefix) __CAT(prefix, __LINE__) #define EXPECT_THAT(MATCH, MATCHER) \ - auto UNIQUE_VAR(__matcher) = (MATCHER); \ - __llvm_libc::testing::Test::testMatch( \ - Ctx, UNIQUE_VAR(__matcher).match((MATCH)), UNIQUE_VAR(__matcher), \ - #MATCH, #MATCHER, __FILE__, __LINE__) + do { \ + auto UNIQUE_VAR(__matcher) = (MATCHER); \ + __llvm_libc::testing::Test::testMatch( \ + Ctx, UNIQUE_VAR(__matcher).match((MATCH)), UNIQUE_VAR(__matcher), \ + #MATCH, #MATCHER, __FILE__, __LINE__); \ + } while (0) #define ASSERT_THAT(MATCH, MATCHER) \ - if (!EXPECT_THAT(MATCH, MATCHER)) \ - return + do { \ + auto UNIQUE_VAR(__matcher) = (MATCHER); \ + if (!__llvm_libc::testing::Test::testMatch( \ + Ctx, UNIQUE_VAR(__matcher).match((MATCH)), UNIQUE_VAR(__matcher), \ + #MATCH, #MATCHER, __FILE__, __LINE__)) \ + return; \ + } while (0) #endif // LLVM_LIBC_UTILS_UNITTEST_H diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index f6097a37c71e3..8bdb24e5fd2af 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -1,4 +1,5 @@ -# See www/CMake.html for instructions on how to build libcxx with CMake. +# See https://libcxx.llvm.org/docs/BuildingLibcxx.html for instructions on how +# to build libcxx with CMake. #=============================================================================== # Setup Project diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index eaba214390da3..d295d13d26f32 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -19,7 +19,7 @@ test libc++. Please see the `Lit Command Guide`_ for more information about LIT. -.. _LIT Command Guide: http://llvm.org/docs/CommandGuide/lit.html +.. _LIT Command Guide: https://llvm.org/docs/CommandGuide/lit.html Setting up the Environment -------------------------- diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst index 05721bf271a80..4c37ada334b6f 100644 --- a/libcxx/docs/UsingLibcxx.rst +++ b/libcxx/docs/UsingLibcxx.rst @@ -180,7 +180,7 @@ thread safety annotations. Since libc++ 4.0 this extension has been disabled by default. This macro may be defined to re-enable it in order to support existing code that depends on the extension. 
New use of this extension should be discouraged. - See `PR 27374 `_ for more information. + See `PR 27374 `_ for more information. Note: The "reduced-arity-initialization" extension is still offered but only for explicit conversions. Example: diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index 04df9cfcc4a56..f763f3aa0f60b 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -100,7 +100,7 @@ Linux i386, x86_64 Clang, GCC libc++abi The following minimum compiler versions are strongly recommended. -* Clang 3.5 and above +* Clang 4.0 and above * GCC 5.0 and above. The C++03 dialect is only supported for Clang compilers. @@ -161,8 +161,8 @@ Build Bots and Test Coverage Getting Involved ================ -First please review our `Developer's Policy `__ -and `Getting started with LLVM `__. +First please review our `Developer's Policy `__ +and `Getting started with LLVM `__. **Bug Reports** @@ -173,7 +173,7 @@ can post a message to the `libcxx-dev mailing list`_ or on IRC. **Patches** If you want to contribute a patch to libc++, the best place for that is -`Phabricator `_. Please add `libcxx-commits` as a subscriber. +`Phabricator `_. Please add `libcxx-commits` as a subscriber. Also make sure you are subscribed to the `libcxx-commits mailing list `_. **Discussion and Questions** @@ -185,7 +185,7 @@ Send discussions and questions to the Quick Links =========== -* `LLVM Homepage `_ +* `LLVM Homepage `_ * `libc++abi Homepage `_ * `LLVM Bugzilla `_ * `libcxx-commits Mailing List`_ diff --git a/libcxx/include/functional b/libcxx/include/functional index 63e3cbed046af..b13992f94e2b0 100644 --- a/libcxx/include/functional +++ b/libcxx/include/functional @@ -1618,7 +1618,7 @@ public: // __base provides an abstract interface for copyable functors. 
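Editorial note on the EXPECT_THAT/ASSERT_THAT change in libc/utils/UnitTest/Test.h above, not part of the patch: wrapping a multi-statement macro in do { ... } while (0) makes it expand to exactly one statement, so it composes safely with if/else and still requires the usual trailing semicolon. A standalone sketch with hypothetical macro names:

#include <cstdio>

// The brace-only form breaks under `if ... else`: the ';' written after the
// expansion terminates the if-statement, leaving the `else` dangling.
#define LOG_TWICE_BRACES(msg) { std::puts(msg); std::puts(msg); }
// The do/while(0) form is a single statement that consumes that ';'.
#define LOG_TWICE(msg) do { std::puts(msg); std::puts(msg); } while (0)

void report(bool ok) {
  if (ok)
    LOG_TWICE("ok");       // fine; LOG_TWICE_BRACES("ok"); here would not compile
  else
    std::puts("failure");  // still pairs with the outer `if`
}

int main() { report(true); report(false); }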
-template class __base; +template class _LIBCPP_TEMPLATE_VIS __base; template class __base<_Rp(_ArgTypes...)> diff --git a/libcxx/include/type_traits b/libcxx/include/type_traits index 6b8b855afc650..8fdf4a4939d1d 100644 --- a/libcxx/include/type_traits +++ b/libcxx/include/type_traits @@ -544,6 +544,18 @@ template using enable_if_t = typename enable_if<_Bp // is_same +#if __has_keyword(__is_same) + +template +struct _LIBCPP_TEMPLATE_VIS is_same : _BoolConstant<__is_same(_Tp, _Up)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_same_v = __is_same(_Tp, _Up); +#endif + +#else + template struct _LIBCPP_TEMPLATE_VIS is_same : public false_type {}; template struct _LIBCPP_TEMPLATE_VIS is_same<_Tp, _Tp> : public true_type {}; @@ -553,6 +565,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_same_v = is_same<_Tp, _Up>::value; #endif +#endif // __is_same + template using _IsSame = _BoolConstant< #ifdef __clang__ @@ -656,6 +670,18 @@ struct __two {char __lx[2];}; // is_const +#if __has_keyword(__is_const) + +template +struct _LIBCPP_TEMPLATE_VIS is_const : _BoolConstant<__is_const(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_const_v = __is_const(_Tp); +#endif + +#else + template struct _LIBCPP_TEMPLATE_VIS is_const : public false_type {}; template struct _LIBCPP_TEMPLATE_VIS is_const<_Tp const> : public true_type {}; @@ -665,8 +691,22 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_const_v = is_const<_Tp>::value; #endif +#endif // __has_keyword(__is_const) + // is_volatile +#if __has_keyword(__is_volatile) + +template +struct _LIBCPP_TEMPLATE_VIS is_volatile : _BoolConstant<__is_volatile(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_volatile_v = __is_volatile(_Tp); +#endif + +#else + template struct _LIBCPP_TEMPLATE_VIS is_volatile : public false_type {}; template struct _LIBCPP_TEMPLATE_VIS is_volatile<_Tp volatile> : public true_type {}; @@ -676,37 +716,87 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_volatile_v = is_volatile<_Tp>::value; #endif +#endif // __has_keyword(__is_volatile) + // remove_const +#if __has_keyword(__remove_const) + +template +struct _LIBCPP_TEMPLATE_VIS remove_const {typedef __remove_const(_Tp) type;}; + +#if _LIBCPP_STD_VER > 11 +template using remove_const_t = __remove_const(_Tp); +#endif + +#else + template struct _LIBCPP_TEMPLATE_VIS remove_const {typedef _Tp type;}; template struct _LIBCPP_TEMPLATE_VIS remove_const {typedef _Tp type;}; #if _LIBCPP_STD_VER > 11 template using remove_const_t = typename remove_const<_Tp>::type; #endif +#endif // __has_keyword(__remove_const) + // remove_volatile +#if __has_keyword(__remove_volatile) + +template +struct _LIBCPP_TEMPLATE_VIS remove_volatile {typedef __remove_volatile(_Tp) type;}; + +#if _LIBCPP_STD_VER > 11 +template using remove_volatile_t = __remove_volatile(_Tp); +#endif + +#else + template struct _LIBCPP_TEMPLATE_VIS remove_volatile {typedef _Tp type;}; template struct _LIBCPP_TEMPLATE_VIS remove_volatile {typedef _Tp type;}; #if _LIBCPP_STD_VER > 11 template using remove_volatile_t = typename remove_volatile<_Tp>::type; #endif +#endif // __has_keyword(__remove_volatile) + // remove_cv +#if __has_keyword(__remove_cv) + +template +struct _LIBCPP_TEMPLATE_VIS remove_cv {typedef __remove_cv(_Tp) type;}; + +#if _LIBCPP_STD_VER > 11 +template 
using remove_cv_t = __remove_cv(_Tp); +#endif + +#else + template struct _LIBCPP_TEMPLATE_VIS remove_cv {typedef typename remove_volatile::type>::type type;}; #if _LIBCPP_STD_VER > 11 template using remove_cv_t = typename remove_cv<_Tp>::type; #endif +#endif // __has_keyword(__remove_cv) + // is_void -template struct __libcpp_is_void : public false_type {}; -template <> struct __libcpp_is_void : public true_type {}; +#if __has_keyword(__is_void) + +template +struct _LIBCPP_TEMPLATE_VIS is_void : _BoolConstant<__is_void(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_void_v = __is_void(_Tp); +#endif + +#else template struct _LIBCPP_TEMPLATE_VIS is_void - : public __libcpp_is_void::type> {}; + : public is_same::type, void> {}; #if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) template @@ -714,6 +804,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_void_v = is_void<_Tp>::value; #endif +#endif // __has_keyword(__is_void) + // __is_nullptr_t template struct __is_nullptr_t_impl : public false_type {}; @@ -735,6 +827,18 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_null_pointer_v // is_integral +#if __has_keyword(__is_integral) + +template +struct _LIBCPP_TEMPLATE_VIS is_integral : _BoolConstant<__is_integral(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_integral_v = __is_integral(_Tp); +#endif + +#else + template struct _LIBCPP_TEMPLATE_VIS is_integral : public _BoolConstant<__libcpp_is_integral::type>::value> {}; @@ -744,8 +848,22 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_integral_v = is_integral<_Tp>::value; #endif +#endif // __has_keyword(__is_integral) + // is_floating_point +#if __has_keyword(__is_floating_point) + +template +struct _LIBCPP_TEMPLATE_VIS is_floating_point : _BoolConstant<__is_floating_point(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_floating_point_v = __is_floating_point(_Tp); +#endif + +#else + template struct __libcpp_is_floating_point : public false_type {}; template <> struct __libcpp_is_floating_point : public true_type {}; template <> struct __libcpp_is_floating_point : public true_type {}; @@ -760,8 +878,22 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_floating_point_v = is_floating_point<_Tp>::value; #endif +#endif // __has_keyword(__is_floating_point) + // is_array +#if __has_keyword(__is_array) + +template +struct _LIBCPP_TEMPLATE_VIS is_array : _BoolConstant<__is_array(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_array_v = __is_array(_Tp); +#endif + +#else + template struct _LIBCPP_TEMPLATE_VIS is_array : public false_type {}; template struct _LIBCPP_TEMPLATE_VIS is_array<_Tp[]> @@ -775,6 +907,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_array_v = is_array<_Tp>::value; #endif +#endif // __has_keyword(__is_array) + // is_pointer template struct __libcpp_is_pointer : public false_type {}; @@ -788,6 +922,18 @@ template struct __libcpp_remove_objc_qualifiers<_Tp __autoreleasing> template struct __libcpp_remove_objc_qualifiers<_Tp __unsafe_unretained> { typedef _Tp type; }; #endif +#if __has_keyword(__is_pointer) + +template +struct _LIBCPP_TEMPLATE_VIS is_pointer : _BoolConstant<__is_pointer(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_pointer_v = 
__is_pointer(_Tp); +#endif + +#else // __has_keyword(__is_pointer) + template struct _LIBCPP_TEMPLATE_VIS is_pointer : public __libcpp_is_pointer::type>::type> {}; @@ -797,8 +943,36 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_pointer_v = is_pointer<_Tp>::value; #endif +#endif // __has_keyword(__is_pointer) + // is_reference +#if __has_keyword(__is_lvalue_reference) && \ + __has_keyword(__is_rvalue_reference) && \ + __has_keyword(__is_reference) + +template +struct _LIBCPP_TEMPLATE_VIS is_lvalue_reference : _BoolConstant<__is_lvalue_reference(_Tp)> { }; + +template +struct _LIBCPP_TEMPLATE_VIS is_rvalue_reference : _BoolConstant<__is_rvalue_reference(_Tp)> { }; + +template +struct _LIBCPP_TEMPLATE_VIS is_reference : _BoolConstant<__is_reference(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_reference_v = __is_reference(_Tp); + +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_lvalue_reference_v = __is_lvalue_reference(_Tp); + +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_rvalue_reference_v = __is_rvalue_reference(_Tp); +#endif + +#else // __has_keyword(__is_lvalue_reference) && etc... + template struct _LIBCPP_TEMPLATE_VIS is_lvalue_reference : public false_type {}; template struct _LIBCPP_TEMPLATE_VIS is_lvalue_reference<_Tp&> : public true_type {}; @@ -822,6 +996,9 @@ template _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_rvalue_reference_v = is_rvalue_reference<_Tp>::value; #endif + +#endif // __has_keyword(__is_lvalue_reference) && etc... + // is_union #if __has_feature(is_union) || defined(_LIBCPP_COMPILER_GCC) @@ -902,6 +1079,19 @@ template struct __libcpp_is_member_pointer<_Tp _Up::*> { }; }; +#if __has_keyword(__is_member_function_pointer) + +template +struct _LIBCPP_TEMPLATE_VIS is_member_function_pointer + : _BoolConstant<__is_member_function_pointer(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_member_function_pointer_v + = __is_member_function_pointer(_Tp); +#endif + +#else // __has_keyword(__is_member_function_pointer) template struct _LIBCPP_TEMPLATE_VIS is_member_function_pointer : public _BoolConstant< __libcpp_is_member_pointer::type>::__is_func > {}; @@ -912,8 +1102,22 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_member_function_pointer_v = is_member_function_pointer<_Tp>::value; #endif +#endif // __has_keyword(__is_member_function_pointer) + // is_member_pointer +#if __has_keyword(__is_member_pointer) + +template +struct _LIBCPP_TEMPLATE_VIS is_member_pointer : _BoolConstant<__is_member_pointer(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_member_pointer_v = __is_member_pointer(_Tp); +#endif + +#else // __has_keyword(__is_member_pointer) + template struct _LIBCPP_TEMPLATE_VIS is_member_pointer : public _BoolConstant< __libcpp_is_member_pointer::type>::__is_member > {}; @@ -923,8 +1127,24 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_member_pointer_v = is_member_pointer<_Tp>::value; #endif +#endif // __has_keyword(__is_member_pointer) + // is_member_object_pointer +#if __has_keyword(__is_member_object_pointer) + +template +struct _LIBCPP_TEMPLATE_VIS is_member_object_pointer + : _BoolConstant<__is_member_object_pointer(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool 
is_member_object_pointer_v + = __is_member_object_pointer(_Tp); +#endif + +#else // __has_keyword(__is_member_object_pointer) + template struct _LIBCPP_TEMPLATE_VIS is_member_object_pointer : public _BoolConstant< __libcpp_is_member_pointer::type>::__is_obj > {}; @@ -934,6 +1154,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_member_object_pointer_v = is_member_object_pointer<_Tp>::value; #endif +#endif // __has_keyword(__is_member_object_pointer) + // is_enum #if __has_feature(is_enum) || defined(_LIBCPP_COMPILER_GCC) @@ -941,6 +1163,11 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_member_object_pointer_v template struct _LIBCPP_TEMPLATE_VIS is_enum : public integral_constant {}; +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_enum_v = __is_enum(_Tp); +#endif + #else template struct _LIBCPP_TEMPLATE_VIS is_enum @@ -955,16 +1182,28 @@ template struct _LIBCPP_TEMPLATE_VIS is_enum !is_class<_Tp>::value && !is_function<_Tp>::value > {}; -#endif - #if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) template _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_enum_v = is_enum<_Tp>::value; #endif +#endif // __has_feature(is_enum) || defined(_LIBCPP_COMPILER_GCC) + // is_arithmetic +#if __has_keyword(__is_arithmetic) + +template +struct _LIBCPP_TEMPLATE_VIS is_arithmetic : _BoolConstant<__is_arithmetic(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_arithmetic_v = __is_arithmetic(_Tp); +#endif + +#else // __has_keyword(__is_arithmetic) + template struct _LIBCPP_TEMPLATE_VIS is_arithmetic : public integral_constant::value || is_floating_point<_Tp>::value> {}; @@ -975,8 +1214,24 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_arithmetic_v = is_arithmetic<_Tp>::value; #endif +#endif // __has_keyword(__is_arithmetic) + // is_fundamental +// In clang 9 and lower, this builtin did not work for nullptr_t. Additionally, in C++03 mode, +// nullptr isn't defined by the compiler so, this builtin won't work. 
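Editorial note, not part of the patch: the recurring pattern in this <type_traits> change is to probe for a Clang builtin with __has_keyword (which libc++ defines as !__is_identifier(x)) and keep the class-template implementation as a fallback. A standalone sketch of that dispatch, using hypothetical names (SKETCH_HAS_KEYWORD, sketch_is_const_v):

// Clang treats builtins such as __is_const as keywords, so they are not
// ordinary identifiers; compilers without __is_identifier take the fallback.
#ifndef __is_identifier
#define __is_identifier(x) 1
#endif
#define SKETCH_HAS_KEYWORD(x) !(__is_identifier(x))

#if SKETCH_HAS_KEYWORD(__is_const)
// Builtin path: no partial specializations need to be instantiated.
template <class T>
inline constexpr bool sketch_is_const_v = __is_const(T);
#else
// Library fallback with the same behavior.
template <class T>
inline constexpr bool sketch_is_const_v = false;
template <class T>
inline constexpr bool sketch_is_const_v<const T> = true;
#endif

static_assert(sketch_is_const_v<const int> && !sketch_is_const_v<int>, "");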
+#if __has_keyword(__is_fundamental) && _LIBCPP_CLANG_VER > 900 && !defined(_LIBCPP_CXX03_LANG) + +template +struct _LIBCPP_TEMPLATE_VIS is_fundamental : _BoolConstant<__is_fundamental(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_fundamental_v = __is_fundamental(_Tp); +#endif + +#else // __has_keyword(__is_fundamental) + template struct _LIBCPP_TEMPLATE_VIS is_fundamental : public integral_constant::value || __is_nullptr_t<_Tp>::value || @@ -988,8 +1243,23 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_fundamental_v = is_fundamental<_Tp>::value; #endif +#endif // __has_keyword(__is_fundamental) + // is_scalar +// >= 11 because in C++03 nullptr isn't actually nullptr +#if __has_keyword(__is_scalar) && !defined(_LIBCPP_CXX03_LANG) + +template +struct _LIBCPP_TEMPLATE_VIS is_scalar : _BoolConstant<__is_scalar(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_scalar_v = __is_scalar(_Tp); +#endif + +#else // __has_keyword(__is_scalar) + template struct __is_block : false_type {}; #if defined(_LIBCPP_HAS_EXTENSION_BLOCKS) template struct __is_block<_Rp (^)(_Args...)> : true_type {}; @@ -1011,8 +1281,22 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_scalar_v = is_scalar<_Tp>::value; #endif +#endif // __has_keyword(__is_scalar) + // is_object +#if __has_keyword(__is_object) + +template +struct _LIBCPP_TEMPLATE_VIS is_object : _BoolConstant<__is_object(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_object_v = __is_object(_Tp); +#endif + +#else // __has_keyword(__is_object) + template struct _LIBCPP_TEMPLATE_VIS is_object : public integral_constant::value || is_array<_Tp>::value || @@ -1025,8 +1309,23 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_object_v = is_object<_Tp>::value; #endif +#endif // __has_keyword(__is_object) + // is_compound +// >= 11 because in C++03 nullptr isn't actually nullptr +#if __has_keyword(__is_compound) && !defined(_LIBCPP_CXX03_LANG) + +template +struct _LIBCPP_TEMPLATE_VIS is_compound : _BoolConstant<__is_compound(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_compound_v = __is_compound(_Tp); +#endif + +#else // __has_keyword(__is_compound) + template struct _LIBCPP_TEMPLATE_VIS is_compound : public integral_constant::value> {}; @@ -1036,6 +1335,7 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_compound_v = is_compound<_Tp>::value; #endif +#endif // __has_keyword(__is_compound) // __is_referenceable [defns.referenceable] @@ -1080,6 +1380,13 @@ template using add_cv_t = typename add_cv<_Tp>::type; // remove_reference +#if __has_keyword(__remove_reference) + +template +struct _LIBCPP_TEMPLATE_VIS remove_reference { typedef __remove_reference(_Tp) type; }; + +#else // __has_keyword(__remove_reference) + template struct _LIBCPP_TEMPLATE_VIS remove_reference {typedef _LIBCPP_NODEBUG_TYPE _Tp type;}; template struct _LIBCPP_TEMPLATE_VIS remove_reference<_Tp&> {typedef _LIBCPP_NODEBUG_TYPE _Tp type;}; template struct _LIBCPP_TEMPLATE_VIS remove_reference<_Tp&&> {typedef _LIBCPP_NODEBUG_TYPE _Tp type;}; @@ -1088,6 +1395,8 @@ template struct _LIBCPP_TEMPLATE_VIS remove_reference<_Tp&&> {typede template using remove_reference_t = typename remove_reference<_Tp>::type; #endif +#endif // 
__has_keyword(__remove_reference) + // add_lvalue_reference template ::value> struct __add_lvalue_reference_impl { typedef _LIBCPP_NODEBUG_TYPE _Tp type; }; @@ -1195,6 +1504,19 @@ template using type_identity_t = typename type_identity<_Tp>::type; // is_signed +// In clang 9 and earlier, this builtin did not work for floating points or enums +#if __has_keyword(__is_signed) && _LIBCPP_CLANG_VER > 900 + +template +struct _LIBCPP_TEMPLATE_VIS is_signed : _BoolConstant<__is_signed(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_signed_v = __is_signed(_Tp); +#endif + +#else // __has_keyword(__is_signed) + template ::value> struct __libcpp_is_signed_impl : public _LIBCPP_BOOL_CONSTANT(_Tp(-1) < _Tp(0)) {}; @@ -1214,8 +1536,22 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_signed_v = is_signed<_Tp>::value; #endif +#endif // __has_keyword(__is_signed) + // is_unsigned +#if __has_keyword(__is_unsigned) + +template +struct _LIBCPP_TEMPLATE_VIS is_unsigned : _BoolConstant<__is_unsigned(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_unsigned_v = __is_unsigned(_Tp); +#endif + +#else // __has_keyword(__is_unsigned) + template ::value> struct __libcpp_is_unsigned_impl : public _LIBCPP_BOOL_CONSTANT(_Tp(0) < _Tp(-1)) {}; @@ -1235,6 +1571,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_unsigned_v = is_unsigned<_Tp>::value; #endif +#endif // __has_keyword(__is_unsigned) + // rank template struct _LIBCPP_TEMPLATE_VIS rank @@ -1252,6 +1590,19 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR size_t rank_v // extent +#if __has_keyword(__array_extent) + +template +struct _LIBCPP_TEMPLATE_VIS extent + : integral_constant { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR size_t extent_v = __array_extent(_Tp, _Ip); +#endif + +#else // __has_keyword(__array_extent) + template struct _LIBCPP_TEMPLATE_VIS extent : public integral_constant {}; template struct _LIBCPP_TEMPLATE_VIS extent<_Tp[], 0> @@ -1269,6 +1620,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR size_t extent_v = extent<_Tp, _Ip>::value; #endif +#endif // __has_keyword(__array_extent) + // remove_extent template struct _LIBCPP_TEMPLATE_VIS remove_extent @@ -2114,6 +2467,18 @@ template using common_type_t = typename common_type<_Tp...>::type template struct __select_2nd { typedef _LIBCPP_NODEBUG_TYPE _Tp type; }; +#if __has_keyword(__is_assignable) + +template +struct _LIBCPP_TEMPLATE_VIS is_assignable : _BoolConstant<__is_assignable(_Tp, _Up)> { }; + +#if _LIBCPP_STD_VER > 14 && !defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_assignable_v = __is_assignable(_Tp, _Arg); +#endif + +#else // __has_keyword(__is_assignable) + template typename __select_2nd() = _VSTD::declval<_Arg>())), true_type>::type __is_assignable_test(int); @@ -2142,6 +2507,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_assignable_v = is_assignable<_Tp, _Arg>::value; #endif +#endif // __has_keyword(__is_assignable) + // is_copy_assignable template struct _LIBCPP_TEMPLATE_VIS is_copy_assignable @@ -2168,6 +2535,18 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_move_assignable_v // is_destructible +#if __has_keyword(__is_destructible) + +template +struct _LIBCPP_TEMPLATE_VIS is_destructible : _BoolConstant<__is_destructible(_Tp)> { }; + +#if _LIBCPP_STD_VER > 14 && 
!defined(_LIBCPP_HAS_NO_VARIABLE_TEMPLATES) +template +_LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_destructible_v = __is_destructible(_Tp); +#endif + +#else // __has_keyword(__is_destructible) + // if it's a reference, return true // if it's a function, return false // if it's void, return false @@ -2230,6 +2609,8 @@ _LIBCPP_INLINE_VAR _LIBCPP_CONSTEXPR bool is_destructible_v = is_destructible<_Tp>::value; #endif +#endif // __has_keyword(__is_destructible) + // move template @@ -3859,7 +4240,6 @@ struct underlying_type : __underlying_type_impl<_Tp, is_enum<_Tp>::value> {}; template using underlying_type_t = typename underlying_type<_Tp>::type; #endif - template ::value> struct __sfinae_underlying_type { diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index 2a9d05473be6c..63ca34d1ec470 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -349,7 +349,7 @@ if (LIBCXX_INSTALL_LIBRARY) endif() if(LIBCXX_INSTALL_EXPERIMENTAL_LIBRARY) - install(TARGETS ${LIBCXX_INSTALL_TARGETS} ${experimental_lib} + install(TARGETS cxx_experimental LIBRARY DESTINATION ${LIBCXX_INSTALL_PREFIX}${LIBCXX_INSTALL_LIBRARY_DIR} COMPONENT cxx ARCHIVE DESTINATION ${LIBCXX_INSTALL_PREFIX}${LIBCXX_INSTALL_LIBRARY_DIR} COMPONENT cxx RUNTIME DESTINATION ${LIBCXX_INSTALL_PREFIX}bin COMPONENT cxx) diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt index 81474511b58c5..aa0f5da2839f9 100644 --- a/libcxx/test/CMakeLists.txt +++ b/libcxx/test/CMakeLists.txt @@ -26,10 +26,40 @@ if (LIBCXX_ENABLE_ABI_LINKER_SCRIPT) set(LIBCXXABI_USE_LLVM_UNWINDER OFF) endif() +option(LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXXABI + "Whether the libc++ tests should link with the shared libc++abi library" + ${LIBCXXABI_ENABLE_SHARED}) + +option(LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXX + "Whether the libc++ tests should link with the shared libc++ library" + ${LIBCXX_ENABLE_SHARED}) + +if(DEFINED LIBCXXABI_ENABLE_SHARED + AND LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXXABI + AND NOT LIBCXXABI_ENABLE_SHARED) + message(FATAL_ERROR "LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXXABI being ON requires LIBCXXABI_ENABLE_SHARED to be ON") +endif() + +if(DEFINED LIBCXXABI_ENABLE_STATIC + AND NOT LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXXABI + AND NOT LIBCXXABI_ENABLE_STATIC) + message(FATAL_ERROR "LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXXABI being OFF requires LIBCXXABI_ENABLE_STATIC to be ON") +endif() + +if(LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXX AND NOT LIBCXX_ENABLE_SHARED) + message(FATAL_ERROR "LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXX being ON requires LIBCXX_ENABLE_SHARED to be ON") +endif() + +if(NOT LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXX AND NOT LIBCXX_ENABLE_STATIC) + message(FATAL_ERROR "LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXX being OFF requires LIBCXX_ENABLE_STATIC to be ON") +endif() + pythonize_bool(LIBCXX_ENABLE_EXCEPTIONS) pythonize_bool(LIBCXX_ENABLE_EXPERIMENTAL_LIBRARY) pythonize_bool(LIBCXX_ENABLE_RTTI) pythonize_bool(LIBCXX_ENABLE_SHARED) +pythonize_bool(LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXX) +pythonize_bool(LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXXABI) pythonize_bool(LIBCXX_ENABLE_FILESYSTEM) pythonize_bool(LIBCXX_BUILD_32_BITS) pythonize_bool(LIBCXX_GENERATE_COVERAGE) diff --git a/libcxx/test/libcxx/input.output/file.streams/fstreams/ifstream.cons/wchar_pointer.pass.cpp b/libcxx/test/libcxx/input.output/file.streams/fstreams/ifstream.cons/wchar_pointer.pass.cpp index 9dd79a1f87c61..1f6956e7a7e62 100644 --- a/libcxx/test/libcxx/input.output/file.streams/fstreams/ifstream.cons/wchar_pointer.pass.cpp +++ 
b/libcxx/test/libcxx/input.output/file.streams/fstreams/ifstream.cons/wchar_pointer.pass.cpp @@ -13,6 +13,8 @@ // explicit basic_ifstream(const wchar_t* s, ios_base::openmode mode = ios_base::in); +// FILE_DEPENDENCIES: test.dat + #include #include diff --git a/libcxx/test/libcxx/input.output/file.streams/fstreams/ifstream.members/open_wchar_pointer.pass.cpp b/libcxx/test/libcxx/input.output/file.streams/fstreams/ifstream.members/open_wchar_pointer.pass.cpp index 226cf8d832715..7d91b6d6d095c 100644 --- a/libcxx/test/libcxx/input.output/file.streams/fstreams/ifstream.members/open_wchar_pointer.pass.cpp +++ b/libcxx/test/libcxx/input.output/file.streams/fstreams/ifstream.members/open_wchar_pointer.pass.cpp @@ -13,6 +13,8 @@ // void open(const wchar_t* s, ios_base::openmode mode = ios_base::in); +// FILE_DEPENDENCIES: test.dat + #include #include diff --git a/libcxx/test/libcxx/selftest/test.file_dependencies.sh.cpp b/libcxx/test/libcxx/selftest/test.file_dependencies.sh.cpp new file mode 100644 index 0000000000000..b850b44b2bfec --- /dev/null +++ b/libcxx/test/libcxx/selftest/test.file_dependencies.sh.cpp @@ -0,0 +1,12 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// FILE_DEPENDENCIES: test.pass.cpp + +// RUN: echo %file_dependencies | grep 'test.pass.cpp' diff --git a/libcxx/test/libcxx/utilities/charconv/charconv.to.chars/availability.fail.cpp b/libcxx/test/libcxx/utilities/charconv/charconv.to.chars/availability.fail.cpp index b1898e1a50643..2aff2146fbf24 100644 --- a/libcxx/test/libcxx/utilities/charconv/charconv.to.chars/availability.fail.cpp +++ b/libcxx/test/libcxx/utilities/charconv/charconv.to.chars/availability.fail.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: c++98, c++03 // REQUIRES: availability=macosx10.7 || availability=macosx10.8 || availability=macosx10.9 || availability=macosx10.10 || availability=macosx10.11 || availability=macosx10.12 || availability=macosx10.13 || availability=macosx10.14 +// TODO(ldionne): This test is currently broken when testing libc++ trunk on one of the above macOS's +// UNSUPPORTED: macosx + // Test the availability markup on std::to_chars. 
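Editorial note, not part of the patch: for context, the facility whose availability markup the test above exercises is std::to_chars from <charconv>, which formats a number into a caller-provided buffer without locale, allocation, or exceptions. A minimal usage sketch (illustrative only):

#include <charconv>
#include <cstdio>
#include <system_error>

int main() {
  char buf[16];
  std::to_chars_result r = std::to_chars(buf, buf + sizeof(buf), 42);
  if (r.ec == std::errc()) {
    *r.ptr = '\0';             // to_chars does not null-terminate
    std::printf("%s\n", buf);  // prints "42"
  }
}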
#include diff --git a/libcxx/test/lit.site.cfg.in b/libcxx/test/lit.site.cfg.in index a4354525b01e7..7e1cfe51e18c1 100644 --- a/libcxx/test/lit.site.cfg.in +++ b/libcxx/test/lit.site.cfg.in @@ -8,7 +8,7 @@ config.enable_exceptions = @LIBCXX_ENABLE_EXCEPTIONS@ config.enable_experimental = @LIBCXX_ENABLE_EXPERIMENTAL_LIBRARY@ config.enable_filesystem = @LIBCXX_ENABLE_FILESYSTEM@ config.enable_rtti = @LIBCXX_ENABLE_RTTI@ -config.enable_shared = @LIBCXX_ENABLE_SHARED@ +config.enable_shared = @LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXX@ config.enable_32bit = @LIBCXX_BUILD_32_BITS@ config.cxx_abi = "@LIBCXX_CXX_ABI_LIBNAME@" config.use_sanitizer = "@LLVM_USE_SANITIZER@" @@ -31,7 +31,7 @@ config.builtins_library = "@LIBCXX_BUILTINS_LIBRARY@" config.has_libatomic = @LIBCXX_HAS_ATOMIC_LIB@ config.use_libatomic = @LIBCXX_HAVE_CXX_ATOMICS_WITH_LIB@ config.debug_build = @LIBCXX_DEBUG_BUILD@ -config.libcxxabi_shared = @LIBCXXABI_ENABLE_SHARED@ +config.libcxxabi_shared = @LIBCXX_LINK_TESTS_WITH_SHARED_LIBCXXABI@ config.cxx_ext_threads = @LIBCXX_BUILD_EXTERNAL_THREAD_LIBRARY@ config.pstl_src_root = "@ParallelSTL_SOURCE_DIR@" if @LIBCXX_ENABLE_PARALLEL_ALGORITHMS@ else None config.pstl_obj_root = "@ParallelSTL_BINARY_DIR@" if @LIBCXX_ENABLE_PARALLEL_ALGORITHMS@ else None diff --git a/libcxx/test/std/containers/sequences/array/array.creation/to_array.fail.cpp b/libcxx/test/std/containers/sequences/array/array.creation/to_array.fail.cpp index 7d8e134044c9c..9e194cf2254f5 100644 --- a/libcxx/test/std/containers/sequences/array/array.creation/to_array.fail.cpp +++ b/libcxx/test/std/containers/sequences/array/array.creation/to_array.fail.cpp @@ -20,7 +20,7 @@ int main(int, char**) { // expected-error@array:* {{to_array does not accept multidimensional arrays}} // expected-error@array:* {{to_array requires copy constructible elements}} // expected-error@array:* 3 {{cannot initialize}} - // expected-error@array:* {{suggest braces}} + // expected-error@array:* 0+ {{suggest braces}} std::to_array(source); // expected-note {{requested here}} } diff --git a/libcxx/test/std/containers/sequences/deque/types.pass.cpp b/libcxx/test/std/containers/sequences/deque/types.pass.cpp index 7fd998921e617..7e777e0c237f9 100644 --- a/libcxx/test/std/containers/sequences/deque/types.pass.cpp +++ b/libcxx/test/std/containers/sequences/deque/types.pass.cpp @@ -28,6 +28,8 @@ // typedef std::reverse_iterator const_reverse_iterator; // }; +// MODULES_DEFINES: _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS +// MODULES_DEFINES: _LIBCPP_DISABLE_DEPRECATION_WARNINGS #define _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS #define _LIBCPP_DISABLE_DEPRECATION_WARNINGS diff --git a/libcxx/test/std/containers/sequences/list/types.pass.cpp b/libcxx/test/std/containers/sequences/list/types.pass.cpp index 611eec4d99f7b..644e5526dfa71 100644 --- a/libcxx/test/std/containers/sequences/list/types.pass.cpp +++ b/libcxx/test/std/containers/sequences/list/types.pass.cpp @@ -21,6 +21,8 @@ // typedef typename allocator_type::pointer pointer; // typedef typename allocator_type::const_pointer const_pointer; +// MODULES_DEFINES: _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS +// MODULES_DEFINES: _LIBCPP_DISABLE_DEPRECATION_WARNINGS #define _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS #define _LIBCPP_DISABLE_DEPRECATION_WARNINGS diff --git a/libcxx/test/std/containers/sequences/vector/types.pass.cpp b/libcxx/test/std/containers/sequences/vector/types.pass.cpp index 0357785bef809..6cd79933dddc6 100644 --- a/libcxx/test/std/containers/sequences/vector/types.pass.cpp 
+++ b/libcxx/test/std/containers/sequences/vector/types.pass.cpp @@ -28,6 +28,8 @@ // typedef std::reverse_iterator const_reverse_iterator; // }; +// MODULES_DEFINES: _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS +// MODULES_DEFINES: _LIBCPP_DISABLE_DEPRECATION_WARNINGS #define _LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS #define _LIBCPP_DISABLE_DEPRECATION_WARNINGS diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/seekoff.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/seekoff.pass.cpp index 6f50357a093d6..f9e8938da9ee6 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/seekoff.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/seekoff.pass.cpp @@ -13,6 +13,8 @@ // pos_type seekpos(pos_type sp, // ios_base::openmode which = ios_base::in | ios_base::out); +// FILE_DEPENDENCIES: underflow.dat + #include #include diff --git a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/underflow.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/underflow.pass.cpp index 1e3029a4edcf7..0d1f7a384a9e4 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/underflow.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/filebuf.virtuals/underflow.pass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// // REQUIRES: locale.en_US.UTF-8 +// FILE_DEPENDENCIES: underflow.dat, underflow_utf8.dat // diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.assign/member_swap.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.assign/member_swap.pass.cpp index 7832464b8e066..7a5d9c18d1749 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.assign/member_swap.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.assign/member_swap.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// FILE_DEPENDENCIES: test.dat, test2.dat + // // template > diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.assign/move_assign.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.assign/move_assign.pass.cpp index ec50ec686bb5f..115703c957019 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.assign/move_assign.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.assign/move_assign.pass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++98, c++03 +// FILE_DEPENDENCIES: test.dat // diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.assign/nonmember_swap.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.assign/nonmember_swap.pass.cpp index 587dcc23d2fe6..f04c5e4b52da5 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.assign/nonmember_swap.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.assign/nonmember_swap.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// FILE_DEPENDENCIES: test.dat, test2.dat + // // template > diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/move.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/move.pass.cpp index ac19eea943d8d..af99878aefd83 100644 --- 
a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/move.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/move.pass.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++98, c++03 +// FILE_DEPENDENCIES: test.dat // diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp index 52a367047e228..d433067982800 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/path.pass.cpp @@ -8,6 +8,7 @@ // UNSUPPORTED: c++98, c++03, c++11, c++14 // XFAIL: dylib-has-no-filesystem +// FILE_DEPENDENCIES: test.dat // diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/pointer.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/pointer.pass.cpp index 577670106cff9..e6674255e50b0 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/pointer.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/pointer.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// FILE_DEPENDENCIES: test.dat + // // template > diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/string.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/string.pass.cpp index c1ab706dd4f4a..f0e96256e2674 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/string.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.cons/string.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// FILE_DEPENDENCIES: test.dat + // // template > diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/close.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/close.pass.cpp index 9bd4f723c12dc..3641dfdc15acf 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/close.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/close.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// FILE_DEPENDENCIES: test.dat + // // template > diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/open_path.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/open_path.pass.cpp index 9d30e8ece52c1..c3486634e48f1 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/open_path.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/open_path.pass.cpp @@ -8,6 +8,7 @@ // UNSUPPORTED: c++98, c++03, c++11, c++14 // XFAIL: dylib-has-no-filesystem +// FILE_DEPENDENCIES: test.dat // diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/open_pointer.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/open_pointer.pass.cpp index fd2adba4398fc..4626f968e53d5 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/open_pointer.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/open_pointer.pass.cpp @@ -6,6 +6,8 @@ // 
//===----------------------------------------------------------------------===// +// FILE_DEPENDENCIES: test.dat + // // template > diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/open_string.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/open_string.pass.cpp index b5fd9fc5522a5..35722d52849fb 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/open_string.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/open_string.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// FILE_DEPENDENCIES: test.dat + // // template > diff --git a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/rdbuf.pass.cpp b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/rdbuf.pass.cpp index c09f76161ef8f..aa6cbb20f1aff 100644 --- a/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/rdbuf.pass.cpp +++ b/libcxx/test/std/input.output/file.streams/fstreams/ifstream.members/rdbuf.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// FILE_DEPENDENCIES: test.dat + // // template > diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t.pass.cpp index e303c820847fb..edd4e52fe3be5 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t.pass.cpp @@ -11,9 +11,6 @@ // asan and msan will not call the new handler. // UNSUPPORTED: sanitizer-new-delete -// FIXME change this to XFAIL. -// UNSUPPORTED: no-aligned-allocation && !gcc - // Aligned allocation was not provided before macosx10.14 and as a result we // get availability errors when the deployment target is older than macosx10.14. // However, AppleClang 10 (and older) don't trigger availability errors, and diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow.pass.cpp index ed7a53743f0ef..91e3a1bf06541 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow.pass.cpp @@ -11,9 +11,6 @@ // asan and msan will not call the new handler. // UNSUPPORTED: sanitizer-new-delete -// FIXME turn this into an XFAIL -// UNSUPPORTED: no-aligned-allocation && !gcc - // Aligned allocation was not provided before macosx10.14 and as a result we // get availability errors when the deployment target is older than macosx10.14. 
// However, AppleClang 10 (and older) don't trigger availability errors, and diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow_replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow_replace.pass.cpp index 49aa2bce3ea29..5aa47a5bc236f 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow_replace.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_nothrow_replace.pass.cpp @@ -31,8 +31,6 @@ // XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.8 // XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.7 -// XFAIL: no-aligned-allocation && !gcc - // On Windows libc++ doesn't provide its own definitions for new/delete // but instead depends on the ones in VCRuntime. However VCRuntime does not // yet provide aligned new/delete definitions so this test fails. diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_replace.pass.cpp index cb9a2ef7f6ca8..3cc153ecb2a2e 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_replace.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new_align_val_t_replace.pass.cpp @@ -9,9 +9,6 @@ // UNSUPPORTED: c++98, c++03, c++11, c++14 // UNSUPPORTED: sanitizer-new-delete -// NOTE: GCC doesn't provide the -faligned-allocation flag to test for -// XFAIL: no-aligned-allocation && !gcc - // test operator new replacement #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t.pass.cpp index 0d96db5de485d..f0da4d6746155 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t.pass.cpp @@ -33,9 +33,6 @@ // asan and msan will not call the new handler. // UNSUPPORTED: sanitizer-new-delete -// FIXME turn this into an XFAIL -// UNSUPPORTED: no-aligned-allocation && !gcc - // On Windows libc++ doesn't provide its own definitions for new/delete // but instead depends on the ones in VCRuntime. However VCRuntime does not // yet provide aligned new/delete definitions so this test fails to compile/link. diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow.pass.cpp index 4b621f78a726a..7edb003c25815 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow.pass.cpp @@ -33,9 +33,6 @@ // asan and msan will not call the new handler. // UNSUPPORTED: sanitizer-new-delete -// FIXME turn this into an XFAIL -// UNSUPPORTED: no-aligned-allocation && !gcc - // On Windows libc++ doesn't provide its own definitions for new/delete // but instead depends on the ones in VCRuntime. 
However VCRuntime does not // yet provide aligned new/delete definitions so this test fails to compile/link. diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow_replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow_replace.pass.cpp index 892eac2058265..95931b715d9d6 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow_replace.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_nothrow_replace.pass.cpp @@ -31,9 +31,6 @@ // XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.8 // XFAIL: (apple-clang-9 || apple-clang-10) && with_system_cxx_lib=macosx10.7 -// NOTE: gcc doesn't provide -faligned-allocation flag to test for -// XFAIL: no-aligned-allocation && !gcc - // On Windows libc++ doesn't provide its own definitions for new/delete // but instead depends on the ones in VCRuntime. However VCRuntime does not // yet provide aligned new/delete definitions so this test fails. diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_replace.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_replace.pass.cpp index 32c27d5899ec9..3a17266d17560 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_replace.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new_align_val_t_replace.pass.cpp @@ -9,9 +9,6 @@ // UNSUPPORTED: c++98, c++03, c++11, c++14 // UNSUPPORTED: sanitizer-new-delete -// NOTE: GCC doesn't provide a -faligned-allocation flag -// XFAIL: no-aligned-allocation && !gcc - // test operator new replacement #include diff --git a/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/pbackfail.pass.cpp b/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/pbackfail.pass.cpp index d8e60c0d85ad8..264a67ee530f7 100644 --- a/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/pbackfail.pass.cpp +++ b/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/pbackfail.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// FILE_DEPENDENCIES: underflow.dat + // // wbuffer_convert diff --git a/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/underflow.pass.cpp b/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/underflow.pass.cpp index 0b25e139efc37..6642488cf0c9c 100644 --- a/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/underflow.pass.cpp +++ b/libcxx/test/std/localization/locales/locale.convenience/conversions/conversions.buffer/underflow.pass.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// FILE_DEPENDENCIES: underflow.dat, underflow_utf8.dat + // // wbuffer_convert diff --git a/libcxx/test/std/utilities/allocator.adaptor/allocator.adaptor.members/construct.pass.cpp b/libcxx/test/std/utilities/allocator.adaptor/allocator.adaptor.members/construct.pass.cpp index 5201c5c6cab7e..201ba2e981137 100644 --- 
a/libcxx/test/std/utilities/allocator.adaptor/allocator.adaptor.members/construct.pass.cpp +++ b/libcxx/test/std/utilities/allocator.adaptor/allocator.adaptor.members/construct.pass.cpp @@ -111,6 +111,18 @@ struct F bool F::constructed = false; +struct G +{ + static bool constructed; + + typedef std::allocator allocator_type; + + G(std::allocator_arg_t, allocator_type&&) { assert(false); } + G(allocator_type&) { constructed = true; } +}; + +bool G::constructed = false; + int main(int, char**) { @@ -185,6 +197,17 @@ int main(int, char**) assert(A3::constructed); s->~S(); } + + // LWG 2586 + // Test that is_constructible uses an lvalue ref so the correct constructor + // is picked. + { + std::scoped_allocator_adaptor sa; + G* ptr = sa.allocate(1); + sa.construct(ptr); + assert(G::constructed); + sa.deallocate(ptr, 1); + } return 0; } diff --git a/libcxx/test/std/utilities/meta/meta.rel/is_same.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_same.pass.cpp index 739713bf4c2f2..dd83f22401317 100644 --- a/libcxx/test/std/utilities/meta/meta.rel/is_same.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.rel/is_same.pass.cpp @@ -50,6 +50,14 @@ void test_is_not_same() static_assert((!std::is_same::value), ""); } +template +struct OverloadTest +{ + void fn(std::is_same) { } + void fn(std::false_type) { } + void x() { fn(std::false_type()); } +}; + class Class { public: @@ -70,5 +78,8 @@ int main(int, char**) test_is_not_same(); test_is_not_same(); + OverloadTest t; + (void)t; + return 0; } diff --git a/libcxx/test/support/filesystem_test_helper.h b/libcxx/test/support/filesystem_test_helper.h index 8cefd985026bc..8161820d05da2 100644 --- a/libcxx/test/support/filesystem_test_helper.h +++ b/libcxx/test/support/filesystem_test_helper.h @@ -305,9 +305,7 @@ struct scoped_test_env // Misc test types -#define CONCAT2(LHS, RHS) LHS##RHS -#define CONCAT(LHS, RHS) CONCAT2(LHS, RHS) -#define MKSTR(Str) {Str, CONCAT(L, Str), CONCAT(u, Str), CONCAT(U, Str)} +#define MKSTR(Str) {Str, TEST_CONCAT(L, Str), TEST_CONCAT(u, Str), TEST_CONCAT(U, Str)} struct MultiStringType { const char* s; diff --git a/libcxx/utils/docker/debian9/buildbot/Dockerfile b/libcxx/utils/docker/debian9/buildbot/Dockerfile new file mode 100644 index 0000000000000..ea2ac9d55933e --- /dev/null +++ b/libcxx/utils/docker/debian9/buildbot/Dockerfile @@ -0,0 +1,40 @@ + +#===-------------------------------------------------------------------------------------------===// +# buildslave +#===-------------------------------------------------------------------------------------------===// +ARG gcc_tot +ARG llvm_tot + +FROM ${gcc_tot} AS gcc-tot +FROM ${llvm_tot} AS llvm-tot + +FROM debian:stretch AS base-image + +ADD install-packages.sh /tmp/ +RUN /tmp/install-packages.sh && rm /tmp/install-packages.sh + +COPY --from=ericwf/gcc:5.5.0 /compiler /opt/gcc-5 +COPY --from=ericwf/llvm:9.x /compiler /opt/llvm-9 + +FROM base-image as worker-image + +COPY --from=gcc-tot /compiler /opt/gcc-tot +COPY --from=llvm-tot /compiler /opt/llvm-tot + +ENV PATH /opt/llvm-tot/bin:$PATH + +RUN clang++ --version && echo hello +RUN g++ --version + + +RUN /opt/gcc-tot/bin/g++ --version +RUN /opt/llvm-tot/bin/clang++ --version +RUN /opt/llvm-tot/bin/clang --version + +# FIXME(EricWF): remove this once the buildbot's config doesn't clobber the path. 
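The construct.pass.cpp change above is easier to follow with the value categories spelled out: LWG 2586 requires the uses-allocator detection inside scoped_allocator_adaptor::construct() to be phrased in terms of an lvalue allocator, because an lvalue is what construct() actually passes. A minimal self-contained sketch of the distinction (the type H and its constructors are hypothetical, mirroring G in the test):

    #include <memory>
    #include <type_traits>

    struct H {
      using allocator_type = std::allocator<int>;
      H(std::allocator_arg_t, allocator_type&&); // viable only for an rvalue allocator
      H(allocator_type&);                        // viable only for an lvalue allocator
    };

    // A detection based on rvalues would find the allocator_arg_t constructor...
    static_assert(std::is_constructible<H, std::allocator_arg_t,
                                        H::allocator_type&&>::value, "");
    // ...but construct() passes inner_allocator() as an lvalue, so the corrected
    // detection asks this question instead and selects H(allocator_type&).
    static_assert(std::is_constructible<H, H::allocator_type&>::value, "");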
+RUN ln -s /opt/llvm-tot/bin/clang /usr/local/bin/clang +RUN ln -s /opt/llvm-tot/bin/clang++ /usr/local/bin/clang++ + + +ADD run_buildbot.sh / +CMD /run_buildbot.sh /run/secrets/buildbot-auth diff --git a/libcxx/utils/docker/debian9/buildbot/buildbot-auth.json b/libcxx/utils/docker/debian9/buildbot/buildbot-auth.json new file mode 100644 index 0000000000000..5e91e2d4158fa --- /dev/null +++ b/libcxx/utils/docker/debian9/buildbot/buildbot-auth.json @@ -0,0 +1,4 @@ +{ + "login": "", + "password": "" +} diff --git a/libcxx/utils/docker/debian9/buildbot/docker-compose.yml b/libcxx/utils/docker/debian9/buildbot/docker-compose.yml new file mode 100644 index 0000000000000..f9a2a2ad9c31c --- /dev/null +++ b/libcxx/utils/docker/debian9/buildbot/docker-compose.yml @@ -0,0 +1,19 @@ +version: '3.7' +services: + llvm-buildbot-worker: + build: + context: https://github.com/llvm/llvm-project.git#master:libcxx/utils/docker/debian9/buildbot + args: + gcc_tot: "ericwf/gcc:9.2.0" + llvm_tot: "ericwf/llvm:9.x" + image: llvm-buildbot-worker + volumes: + - /var/run/docker.sock:/var/run/docker.sock + secrets: + - buildbot-auth + logging: + driver: gcplogs + +secrets: + buildbot-auth: + file: buildbot-auth.json diff --git a/libcxx/utils/docker/debian9/buildbot/install-gcloud-agents.sh b/libcxx/utils/docker/debian9/buildbot/install-gcloud-agents.sh new file mode 100755 index 0000000000000..d2656ca5092a0 --- /dev/null +++ b/libcxx/utils/docker/debian9/buildbot/install-gcloud-agents.sh @@ -0,0 +1,11 @@ +#!/usr/bin/env bash + +cd /tmp/ + +curl -sSO https://dl.google.com/cloudagents/install-monitoring-agent.sh +sudo bash install-monitoring-agent.sh +rm install-monitoring-agent.sh + +curl -sSO https://dl.google.com/cloudagents/install-logging-agent.sh +sudo bash install-logging-agent.sh +rm install-logging-agent.sh diff --git a/libcxx/utils/docker/debian9/buildbot/install-packages.sh b/libcxx/utils/docker/debian9/buildbot/install-packages.sh new file mode 100755 index 0000000000000..1f18c428004c2 --- /dev/null +++ b/libcxx/utils/docker/debian9/buildbot/install-packages.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +set -x +set -e + +apt-get update && \ + apt-get install -y --no-install-recommends \ + buildbot-slave \ + ca-certificates \ + gnupg \ + build-essential \ + wget \ + unzip \ + python \ + ninja-build \ + curl \ + git \ + gcc-multilib \ + g++-multilib \ + libc6-dev \ + libtool \ + binutils-dev \ + binutils-gold \ + software-properties-common \ + gnupg \ + apt-transport-https \ + sudo \ + bash-completion \ + vim \ + jq \ + systemd \ + sysvinit-utils \ + systemd-sysv && \ + rm -rf /var/lib/apt/lists/* + +# Install a recent CMake +yes | apt-get purge cmake +wget https://github.com/Kitware/CMake/releases/download/v3.15.2/cmake-3.15.2-Linux-x86_64.sh -O /tmp/install-cmake.sh +bash /tmp/install-cmake.sh --prefix=/usr --exclude-subdir --skip-license diff --git a/libcxx/utils/docker/debian9/scripts/buildbot/run_buildbot.sh b/libcxx/utils/docker/debian9/buildbot/run_buildbot.sh similarity index 72% rename from libcxx/utils/docker/debian9/scripts/buildbot/run_buildbot.sh rename to libcxx/utils/docker/debian9/buildbot/run_buildbot.sh index 7448eb26b79b7..e008a30558c9f 100755 --- a/libcxx/utils/docker/debian9/scripts/buildbot/run_buildbot.sh +++ b/libcxx/utils/docker/debian9/buildbot/run_buildbot.sh @@ -2,27 +2,27 @@ set -x readonly BOT_ROOT=/b -readonly BOT_ROOT_NAME=$1 -readonly BOT_PASS=$2 +readonly AUTH_FILE=$1 +readonly BOT_ROOT_NAME=$(jq -r ".login" $AUTH_FILE) -#pushd /tmp -#curl -sSO 
https://dl.google.com/cloudagents/install-monitoring-agent.sh -#bash install-monitoring-agent.sh -#curl -sSO https://dl.google.com/cloudagents/install-logging-agent.sh -#bash install-logging-agent.sh --structured -#popd +systemctl daemon-reload +service buildslave stop +mkdir -p /b +rm -rf /b/* +service buildslave stop -apt-get update -y -apt-get upgrade -y +pushd /tmp/ -apt-get install sudo -y +curl -sSO https://dl.google.com/cloudagents/install-monitoring-agent.sh +sudo bash install-monitoring-agent.sh +rm install-monitoring-agent.sh + +curl -sSO https://dl.google.com/cloudagents/install-logging-agent.sh +sudo bash install-logging-agent.sh +rm install-logging-agent.sh + +popd -# FIXME(EricWF): Remove this hack. It's only in place to temporarily fix linking libclang_rt from the -# debian packages. -# WARNING: If you're not a buildbot, DO NOT RUN! -apt-get install lld-11 -y -rm /usr/bin/ld -ln -s /usr/bin/lld-11 /usr/bin/ld systemctl set-property buildslave.service TasksMax=100000 @@ -32,10 +32,10 @@ function setup_numbered_bot() { mkdir -p $BOT_DIR buildslave stop $BOT_DIR - chown buildbot:buildbot $BOT_DIR + chown buildbot $BOT_DIR rm -rf $BOT_DIR/* - buildslave create-slave --allow-shutdown=signal "$BOT_DIR" "lab.llvm.org:9990" "$BOT_NAME" "$BOT_PASS" + buildslave create-slave --allow-shutdown=signal "$BOT_DIR" "lab.llvm.org:9990" "$BOT_NAME" $(jq -r ".password" $AUTH_FILE) echo "Eric Fiselier " > $BOT_DIR/info/admin @@ -44,6 +44,7 @@ function setup_numbered_bot() { uname -a | head -n1 cmake --version | head -n1 g++ --version | head -n1 + clang++ --version | head -n1 ld --version | head -n1 date lscpu @@ -74,7 +75,7 @@ function try_start_builder { systemctl daemon-reload service buildslave restart - chown -R buildbot:buildbot $BOT_DIR/ + chown -R buildbot $BOT_DIR/ sudo -u buildbot /usr/bin/buildslave start $BOT_DIR/ sleep 30 diff --git a/libcxx/utils/docker/debian9/compilers.yml b/libcxx/utils/docker/debian9/compilers.yml deleted file mode 100644 index bc4917ade7dee..0000000000000 --- a/libcxx/utils/docker/debian9/compilers.yml +++ /dev/null @@ -1,174 +0,0 @@ -version: '3.7' - -x-build-clang: &build-clang - context: . - dockerfile: compilers/clang.Dockerfile - -x-build-gcc: &build-gcc - context: . 
- dockerfile: compilers/gcc.Dockerfile - -services: - gcc-4.8.5: - build: - <<: *build-gcc - args: - branch: releases/gcc-4.8.5 - install_prefix: /opt/gcc-4.8.5 - cherry_pick: 3a27b4db566c2cde8e043220f3d2c5401159b10e - image: ericwf/compiler:gcc-4.8.5 - gcc-4.9.4: - build: - <<: *build-gcc - args: - branch: releases/gcc-4.9.4 - install_prefix: /opt/gcc-4.9.4 - image: ericwf/compiler:gcc-4.9.4 - gcc-5: - build: - <<: *build-gcc - args: - branch: releases/gcc-5.5.0 - install_prefix: /opt/gcc-5 - image: ericwf/compiler:gcc-5 - gcc-6: - build: - <<: *build-gcc - args: - branch: releases/gcc-6.5.0 - install_prefix: /opt/gcc-6 - image: ericwf/compiler:gcc-6 - gcc-7: - build: - <<: *build-gcc - args: - branch: releases/gcc-7.4.0 - install_prefix: /opt/gcc-7 - image: ericwf/compiler:gcc-7 - gcc-8: - build: - <<: *build-gcc - args: - branch: releases/gcc-8.2.0 - install_prefix: /opt/gcc-8 - image: ericwf/compiler:gcc-8 - gcc-9: - build: - <<: *build-gcc - args: - branch: releases/gcc-9.2.0 - install_prefix: /opt/gcc-9 - image: ericwf/compiler:gcc-9 - # Add LLVM compilers - llvm-3.6: - build: - <<: *build-clang - args: - branch: release/3.6.x - install_prefix: /opt/llvm-3.6 - image: ericwf/compiler:llvm-3.6 - llvm-3.7: - build: - <<: *build-clang - args: - branch: release/3.7.x - install_prefix: /opt/llvm-3.7 - image: ericwf/compiler:llvm-3.7 - llvm-3.8: - build: - <<: *build-clang - args: - branch: release/3.8.x - install_prefix: /opt/llvm-3.8 - image: ericwf/compiler:llvm-3.8 - llvm-3.9: - build: - <<: *build-clang - args: - branch: release/3.9.x - install_prefix: /opt/llvm-3.9 - image: ericwf/compiler:llvm-3.9 - llvm-4: - build: - <<: *build-clang - args: - branch: release/4.x - install_prefix: /opt/llvm-4 - image: ericwf/compiler:llvm-4 - llvm-5: - build: - <<: *build-clang - args: - branch: release/5.x - install_prefix: /opt/llvm-5 - image: ericwf/compiler:llvm-5 - llvm-6: - build: - <<: *build-clang - args: - branch: release/6.x - install_prefix: /opt/llvm-6 - image: ericwf/compiler:llvm-6 - llvm-7: - build: - <<: *build-clang - args: - branch: release/7.x - install_prefix: /opt/llvm-7 - image: ericwf/compiler:llvm-7 - llvm-8: - build: - <<: *build-clang - args: - branch: release/8.x - install_prefix: /opt/llvm-8 - image: ericwf/compiler:llvm-8 - llvm-9: - build: - <<: *build-clang - args: - branch: release/9.x - install_prefix: /opt/llvm-9 - image: ericwf/compiler:llvm-9 - gcc-tot: - build: - <<: *build-gcc - args: - branch: master - cache_date: feb-27 - install_prefix: /opt/gcc-tot - image: ericwf/compiler:gcc-tot - llvm-tot: - build: - <<: *build-clang - args: - branch: master - cache_date: feb-27 - install_prefix: /opt/llvm-tot - image: ericwf/compiler:llvm-tot - compiler-zoo: - build: - context: . - dockerfile: compilers/compiler-zoo.Dockerfile - target: compiler-zoo - image: ericwf/compiler-zoo:latest - depends_on: - - gcc-4.8.5 - - gcc-4.9.4 - - gcc-5 - - gcc-6 - - gcc-7 - - gcc-8 - - gcc-9 - - gcc-tot - - llvm-3.6 - - llvm-3.7 - - llvm-3.8 - - llvm-3.9 - - llvm-4 - - llvm-5 - - llvm-6 - - llvm-7 - - llvm-8 - - llvm-9 - - llvm-tot diff --git a/libcxx/utils/docker/debian9/compilers/clang.Dockerfile b/libcxx/utils/docker/debian9/compilers/clang.Dockerfile deleted file mode 100644 index 6f12f6485a5f2..0000000000000 --- a/libcxx/utils/docker/debian9/compilers/clang.Dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -#===----------------------------------------------------------------------===// -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#===----------------------------------------------------------------------===// - -# Build GCC versions -FROM ericwf/llvm-builder-base:latest -LABEL maintainer "libc++ Developers" - -ARG install_prefix -ARG branch -ARG cache_date=stable - -ADD scripts/build_llvm_version.sh /tmp/ -RUN /tmp/build_llvm_version.sh --install "$install_prefix" --branch "$branch" \ - && rm /tmp/build_llvm_version.sh diff --git a/libcxx/utils/docker/debian9/compilers/compiler-zoo.Dockerfile b/libcxx/utils/docker/debian9/compilers/compiler-zoo.Dockerfile deleted file mode 100644 index cadc8b3a7bdb5..0000000000000 --- a/libcxx/utils/docker/debian9/compilers/compiler-zoo.Dockerfile +++ /dev/null @@ -1,37 +0,0 @@ -#===- libcxx/utils/docker/debian9/Dockerfile --------------------------------------------------===// -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#===-------------------------------------------------------------------------------------------===// - -#===-------------------------------------------------------------------------------------------===// -# compiler-zoo -#===-------------------------------------------------------------------------------------------===// -FROM ericwf/llvm-builder-base:latest AS compiler-zoo -LABEL maintainer "libc++ Developers" - -# Copy over the GCC and Clang installations -COPY --from=ericwf/compiler:gcc-4.8.5 /opt/gcc-4.8.5 /opt/gcc-4.8.5 -COPY --from=ericwf/compiler:gcc-4.9.4 /opt/gcc-4.9.4 /opt/gcc-4.9.4 -COPY --from=ericwf/compiler:gcc-5 /opt/gcc-5 /opt/gcc-5 -COPY --from=ericwf/compiler:gcc-6 /opt/gcc-6 /opt/gcc-6 -COPY --from=ericwf/compiler:gcc-7 /opt/gcc-7 /opt/gcc-7 -COPY --from=ericwf/compiler:gcc-8 /opt/gcc-8 /opt/gcc-8 -COPY --from=ericwf/compiler:gcc-9 /opt/gcc-9 /opt/gcc-9 -COPY --from=ericwf/compiler:gcc-tot /opt/gcc-tot /opt/gcc-tot - -COPY --from=ericwf/compiler:llvm-3.6 /opt/llvm-3.6 /opt/llvm-3.6 -COPY --from=ericwf/compiler:llvm-3.7 /opt/llvm-3.7 /opt/llvm-3.7 -COPY --from=ericwf/compiler:llvm-3.8 /opt/llvm-3.8 /opt/llvm-3.8 -COPY --from=ericwf/compiler:llvm-3.9 /opt/llvm-3.9 /opt/llvm-3.9 -COPY --from=ericwf/compiler:llvm-4 /opt/llvm-4 /opt/llvm-4 -COPY --from=ericwf/compiler:llvm-5 /opt/llvm-5 /opt/llvm-5 -COPY --from=ericwf/compiler:llvm-6 /opt/llvm-6 /opt/llvm-6 -COPY --from=ericwf/compiler:llvm-7 /opt/llvm-7 /opt/llvm-7 -COPY --from=ericwf/compiler:llvm-8 /opt/llvm-8 /opt/llvm-8 -COPY --from=ericwf/compiler:llvm-9 /opt/llvm-9 /opt/llvm-9 -COPY --from=ericwf/compiler:llvm-tot /opt/llvm-tot /opt/llvm-tot - - diff --git a/libcxx/utils/docker/debian9/compilers/gcc.Dockerfile b/libcxx/utils/docker/debian9/compilers/gcc.Dockerfile deleted file mode 100644 index adc019803d818..0000000000000 --- a/libcxx/utils/docker/debian9/compilers/gcc.Dockerfile +++ /dev/null @@ -1,24 +0,0 @@ -#===- libcxx/utils/docker/debian9/Dockerfile --------------------------------------------------===// -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#===-------------------------------------------------------------------------------------------===// - -# Build GCC versions -FROM ericwf/llvm-builder-base:latest -LABEL maintainer "libc++ Developers" - - -ARG branch -ARG cherry_pick="" -ARG install_prefix -ARG cache_date=stable - -ADD scripts/build_gcc_version.sh /tmp/ -RUN /tmp/build_gcc_version.sh \ - --install "$install_prefix" \ - --branch "$branch" \ - --cherry-pick "$cherry_pick" \ - && rm /tmp/build_gcc_version.sh diff --git a/libcxx/utils/docker/debian9/docker-compose.yml b/libcxx/utils/docker/debian9/docker-compose.yml deleted file mode 100644 index 03b1efcdad0a0..0000000000000 --- a/libcxx/utils/docker/debian9/docker-compose.yml +++ /dev/null @@ -1,14 +0,0 @@ -version: '3.7' -services: - llvm-builder-base: - build: - context: . - dockerfile: llvm-builder-base.Dockerfile - target: llvm-builder-base - image: ericwf/llvm-builder-base:latest - llvm-buildbot-worker: - build: - context: . - dockerfile: llvm-buildbot-worker.Dockerfile - target: llvm-buildbot-worker - image: ericwf/llvm-buildbot-worker:latest diff --git a/libcxx/utils/docker/debian9/llvm-buildbot-worker.Dockerfile b/libcxx/utils/docker/debian9/llvm-buildbot-worker.Dockerfile deleted file mode 100644 index 7699c5a109d83..0000000000000 --- a/libcxx/utils/docker/debian9/llvm-buildbot-worker.Dockerfile +++ /dev/null @@ -1,23 +0,0 @@ - -#===-------------------------------------------------------------------------------------------===// -# buildslave -#===-------------------------------------------------------------------------------------------===// -FROM ericwf/llvm-builder-base:latest AS llvm-buildbot-worker - -COPY --from=ericwf/compiler:gcc-5 /opt/gcc-5 /opt/gcc-5 -COPY --from=ericwf/compiler:gcc-tot /opt/gcc-tot /opt/gcc-tot -COPY --from=ericwf/compiler:llvm-4 /opt/llvm-4 /opt/llvm-4.0 - -# FIXME(EricWF): Remove this hack once zorg has been updated. -RUN ln -s /opt/gcc-5/bin/gcc /usr/local/bin/gcc-4.9 && \ - ln -s /opt/gcc-5/bin/g++ /usr/local/bin/g++-4.9 - -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - buildbot-slave \ - && rm -rf /var/lib/apt/lists/* - -ADD scripts/install_clang_packages.sh /tmp/ -RUN /tmp/install_clang_packages.sh && rm /tmp/install_clang_packages.sh - -RUN rm -rf /llvm-project/ && git clone --depth=1 https://github.com/llvm/llvm-project.git /llvm-project diff --git a/libcxx/utils/docker/debian9/llvm-builder-base.Dockerfile b/libcxx/utils/docker/debian9/llvm-builder-base.Dockerfile deleted file mode 100644 index 2464641f0bd3b..0000000000000 --- a/libcxx/utils/docker/debian9/llvm-builder-base.Dockerfile +++ /dev/null @@ -1,47 +0,0 @@ -#===----------------------------------------------------------------------===// -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#===----------------------------------------------------------------------===// - -FROM launcher.gcr.io/google/debian9:latest AS llvm-builder-base -LABEL maintainer "libc++ Developers" - -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - ca-certificates \ - gnupg \ - build-essential \ - wget \ - subversion \ - unzip \ - automake \ - python \ - cmake \ - ninja-build \ - curl \ - git \ - gcc-multilib \ - g++-multilib \ - libc6-dev \ - bison \ - flex \ - libtool \ - autoconf \ - binutils-dev \ - binutils-gold \ - software-properties-common \ - gnupg \ - apt-transport-https \ - sudo \ - bash-completion \ - vim \ - systemd \ - sysvinit-utils \ - systemd-sysv && \ - update-alternatives --install "/usr/bin/ld" "ld" "/usr/bin/ld.gold" 20 && \ - update-alternatives --install "/usr/bin/ld" "ld" "/usr/bin/ld.bfd" 10 && \ - rm -rf /var/lib/apt/lists/* - diff --git a/libcxx/utils/docker/debian9/scripts/build_gcc_version.sh b/libcxx/utils/docker/debian9/scripts/build_gcc_version.sh deleted file mode 100755 index b759373f01160..0000000000000 --- a/libcxx/utils/docker/debian9/scripts/build_gcc_version.sh +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env bash -#===- libcxx/utils/docker/scripts/build-gcc.sh ----------------------------===// -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#===-----------------------------------------------------------------------===// - -set -e - -function show_usage() { - cat << EOF -Usage: build_gcc_version.sh [options] - -Run autoconf with the specified arguments. Used inside docker container. - -Available options: - -h|--help show this help message - --branch the branch of gcc you want to build. - --cherry-pick a commit hash to apply to the GCC sources. - --install destination directory where to install the targets. -Required options: --install and --branch - -All options after '--' are passed to CMake invocation. -EOF -} - -GCC_INSTALL_DIR="" -GCC_BRANCH="" -CHERRY_PICK="" - -while [[ $# -gt 0 ]]; do - case "$1" in - --install) - shift - GCC_INSTALL_DIR="$1" - shift - ;; - --branch) - shift - GCC_BRANCH="$1" - shift - ;; - --cherry-pick) - shift - CHERRY_PICK="$1" - shift - ;; - -h|--help) - show_usage - exit 0 - ;; - *) - echo "Unknown option: $1" - exit 1 - esac -done - -if [ "$GCC_INSTALL_DIR" == "" ]; then - echo "No install directory. Please specify the --install argument." - exit 1 -fi - -if [ "$GCC_BRANCH" == "" ]; then - echo "No branch specified. Please specify the --branch argument." - exit 1 -fi - -set -x - -NPROC=`nproc` -TMP_ROOT="$(mktemp -d -p /tmp)" -GCC_SOURCE_DIR="$TMP_ROOT/gcc" -GCC_BUILD_DIR="$TMP_ROOT/build" - -echo "Cloning source directory for branch $GCC_BRANCH" -git clone --branch "$GCC_BRANCH" --single-branch --depth=1 git://gcc.gnu.org/git/gcc.git $GCC_SOURCE_DIR - -pushd "$GCC_SOURCE_DIR" -if [ "$CHERRY_PICK" != "" ]; then - git fetch origin master --unshallow # Urg, we have to get the entire history. This will take a while. - git cherry-pick --no-commit -X theirs "$CHERRY_PICK" -fi -./contrib/download_prerequisites -popd - - -mkdir "$GCC_BUILD_DIR" -pushd "$GCC_BUILD_DIR" - -# Run the build as specified in the build arguments. 
-echo "Running configuration" -$GCC_SOURCE_DIR/configure --prefix=$GCC_INSTALL_DIR \ - --disable-bootstrap --disable-libgomp --disable-libitm \ - --disable-libvtv --disable-libcilkrts --disable-libmpx \ - --disable-liboffloadmic --disable-libcc1 --enable-languages=c,c++ - -echo "Running build with $NPROC threads" -make -j$NPROC -echo "Installing to $GCC_INSTALL_DIR" -make install -j$NPROC -popd - -# Cleanup. -rm -rf "$TMP_ROOT" - -echo "Done" diff --git a/libcxx/utils/docker/debian9/scripts/build_llvm_version.sh b/libcxx/utils/docker/debian9/scripts/build_llvm_version.sh deleted file mode 100755 index 613a7babd335a..0000000000000 --- a/libcxx/utils/docker/debian9/scripts/build_llvm_version.sh +++ /dev/null @@ -1,106 +0,0 @@ -#!/usr/bin/env bash -#===- libcxx/utils/docker/scripts/build_install_llvm_version_default.sh -----------------------===// -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#===-------------------------------------------------------------------------------------------===// - -set -e - -function show_usage() { - cat << EOF -Usage: build_install_llvm.sh [options] -- [cmake-args] - -Run cmake with the specified arguments. Used inside docker container. -Passes additional -DCMAKE_INSTALL_PREFIX and puts the build results into -the directory specified by --to option. - -Available options: - -h|--help show this help message - --install destination directory where to install the targets. - --branch the branch or tag of LLVM to build -Required options: --install, and --version. - -All options after '--' are passed to CMake invocation. -EOF -} - -LLVM_BRANCH="" -CMAKE_ARGS="" -LLVM_INSTALL_DIR="" - -while [[ $# -gt 0 ]]; do - case "$1" in - --install) - shift - LLVM_INSTALL_DIR="$1" - shift - ;; - --branch) - shift - LLVM_BRANCH="$1" - shift - ;; - --) - shift - CMAKE_ARGS="$*" - shift $# - ;; - -h|--help) - show_usage - exit 0 - ;; - *) - echo "Unknown option: $1" - exit 1 - esac -done - - -if [ "$LLVM_INSTALL_DIR" == "" ]; then - echo "No install directory. Please specify the --install argument." - exit 1 -fi - -if [ "$LLVM_BRANCH" == "" ]; then - echo "No install directory. Please specify the --branch argument." 
- exit 1 -fi - -if [ "$CMAKE_ARGS" == "" ]; then - CMAKE_ARGS="-DCMAKE_BUILD_TYPE=RELEASE '-DCMAKE_C_FLAGS=-gline-tables-only' '-DCMAKE_CXX_FLAGS=-gline-tables-only' -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON" -fi - -set -x - -TMP_ROOT="$(mktemp -d -p /tmp)" -LLVM_SOURCE_DIR="$TMP_ROOT/llvm-project" -LLVM_BUILD_DIR="$TMP_ROOT/build" -LLVM="$LLVM_SOURCE_DIR/llvm" - -git clone --branch $LLVM_BRANCH --single-branch --depth=1 https://github.com/llvm/llvm-project.git $LLVM_SOURCE_DIR - -pushd "$LLVM_SOURCE_DIR" - -# Setup the source-tree using the old style layout -ln -s $LLVM_SOURCE_DIR/libcxx $LLVM/projects/libcxx -ln -s $LLVM_SOURCE_DIR/libcxxabi $LLVM/projects/libcxxabi -ln -s $LLVM_SOURCE_DIR/compiler-rt $LLVM/projects/compiler-rt -ln -s $LLVM_SOURCE_DIR/clang $LLVM/tools/clang -ln -s $LLVM_SOURCE_DIR/clang-tools-extra $LLVM/tools/clang/tools/extra - -popd - -# Configure and build -mkdir "$LLVM_BUILD_DIR" -pushd "$LLVM_BUILD_DIR" -cmake -GNinja "-DCMAKE_INSTALL_PREFIX=$LLVM_INSTALL_DIR" $CMAKE_ARGS $LLVM -ninja install -popd - -# Cleanup -rm -rf "$TMP_ROOT/" - -echo "Done" diff --git a/libcxx/utils/docker/debian9/scripts/buildbot/docker_start_buildbots.sh b/libcxx/utils/docker/debian9/scripts/buildbot/docker_start_buildbots.sh deleted file mode 100755 index b655170ec80e8..0000000000000 --- a/libcxx/utils/docker/debian9/scripts/buildbot/docker_start_buildbots.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env bash -set -x - -# Update the libc++ sources in the image in order to use the most recent version of -# run_buildbots.sh -cd /llvm-project/ -git pull -/llvm-project/libcxx/utils/docker/debian9/scripts/buildbot/run_buildbot.sh "$@" diff --git a/libcxx/utils/docker/debian9/scripts/install_clang_packages.sh b/libcxx/utils/docker/debian9/scripts/install_clang_packages.sh deleted file mode 100755 index 785f7a73e98f3..0000000000000 --- a/libcxx/utils/docker/debian9/scripts/install_clang_packages.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env bash -#===- libcxx/utils/docker/scripts/install_clang_package.sh -----------------===// -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#===-----------------------------------------------------------------------===// - -set -e - -function show_usage() { - cat << EOF -Usage: install_clang_package.sh [options] - -Install -Available options: - -h|--help show this help message - --version the numeric version of the package to use. -EOF -} - -VERSION="11" - -while [[ $# -gt 0 ]]; do - case "$1" in - --version) - shift - VERSION="$1" - shift - ;; - -h|--help) - show_usage - exit 0 - ;; - *) - echo "Unknown option: $1" - exit 1 - esac -done - -set -x - -curl -fsSL https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - -add-apt-repository -s "deb http://apt.llvm.org/$(lsb_release -cs)/ llvm-toolchain-$(lsb_release -cs) main" -apt-get update -apt-get upgrade -y -apt-get install -y --no-install-recommends "clang-$VERSION" - -# FIXME(EricWF): Remove this once the clang packages are no longer broken. -if [ -f "/usr/local/bin/clang" ]; then - echo "clang already exists" - exit 1 -else - CC_BINARY="$(which clang-$VERSION)" - ln -s "$CC_BINARY" "/usr/local/bin/clang" -fi -if [ -f "/usr/local/bin/clang++" ]; then - echo "clang++ already exists" - exit 1 -else - CXX_BINARY="$(which clang++-$VERSION)" - ln -s "$CXX_BINARY" "/usr/local/bin/clang++" -fi - -echo "Testing clang version..." 
-clang --version - -echo "Testing clang++ version..." -clang++ --version - -# Figure out the libc++ and libc++abi package versions that we want. -if [ "$VERSION" == "" ]; then - VERSION="$(apt-cache search 'libc\+\+-[0-9]+-dev' | awk '{print $1}' | awk -F- '{print $2}')" - echo "Installing version '$VERSION'" -fi - -apt-get purge -y "libc++-$VERSION-dev" "libc++abi-$VERSION-dev" -apt-get install -y --no-install-recommends "libc++-$VERSION-dev" "libc++abi-$VERSION-dev" - -echo "Done" diff --git a/libcxx/utils/libcxx/test/config.py b/libcxx/utils/libcxx/test/config.py index 459a5ccdf225f..a9a353916a9c6 100644 --- a/libcxx/utils/libcxx/test/config.py +++ b/libcxx/utils/libcxx/test/config.py @@ -452,10 +452,6 @@ def configure_features(self): if self.cxx.hasCompileFlag('-faligned-allocation'): self.config.available_features.add('-faligned-allocation') - else: - # FIXME remove this once more than just clang-4.0 support - # C++17 aligned allocation. - self.config.available_features.add('no-aligned-allocation') if self.cxx.hasCompileFlag('-fdelayed-template-parsing'): self.config.available_features.add('fdelayed-template-parsing') @@ -1083,8 +1079,11 @@ def configure_substitutions(self): # Configure run env substitution. codesign_ident = self.get_lit_conf('llvm_codesign_identity', '') run_py = os.path.join(self.libcxx_src_root, 'utils', 'run.py') - run_str = '%s %s "%s" %%t.exe' % (pipes.quote(sys.executable), \ - pipes.quote(run_py), codesign_ident) + env_vars = ' '.join('%s=%s' % (k, pipes.quote(v)) for (k, v) in self.exec_env.items()) + run_str = '%s %s --codesign_identity "%s" --working_directory "%%S" ' \ + '--dependencies %%file_dependencies --env %s -- %%t.exe' % \ + (pipes.quote(sys.executable), pipes.quote(run_py), + codesign_ident, env_vars) sub.append(('%run', run_str)) # Configure not program substitutions not_py = os.path.join(self.libcxx_src_root, 'utils', 'not.py') diff --git a/libcxx/utils/libcxx/test/format.py b/libcxx/utils/libcxx/test/format.py index 55f179a20e15a..bf526d3618d13 100644 --- a/libcxx/utils/libcxx/test/format.py +++ b/libcxx/utils/libcxx/test/format.py @@ -41,12 +41,14 @@ def __init__(self, cxx, use_verify_for_fail, execute_external, self.exec_env = dict(exec_env) @staticmethod - def _make_custom_parsers(): + def _make_custom_parsers(test): return [ IntegratedTestKeywordParser('FLAKY_TEST.', ParserKind.TAG, initial_value=False), IntegratedTestKeywordParser('MODULES_DEFINES:', ParserKind.LIST, - initial_value=[]) + initial_value=[]), + IntegratedTestKeywordParser('FILE_DEPENDENCIES:', ParserKind.LIST, + initial_value=test.file_dependencies) ] @staticmethod @@ -102,9 +104,13 @@ def _execute(self, test, lit_config): 'objective-c++' in test.config.available_features: return (lit.Test.UNSUPPORTED, "Objective-C++ is not supported") - parsers = self._make_custom_parsers() + setattr(test, 'file_dependencies', []) + parsers = self._make_custom_parsers(test) script = lit.TestRunner.parseIntegratedTestScript( test, additional_parsers=parsers, require_script=is_sh_test) + + local_cwd = os.path.dirname(test.getSourcePath()) + data_files = [os.path.join(local_cwd, f) for f in test.file_dependencies] # Check if a result for the test was returned. If so return that # result. 
if isinstance(script, lit.Test.Result): @@ -119,6 +125,7 @@ def _execute(self, test, lit_config): tmpDir, tmpBase = lit.TestRunner.getTempPaths(test) substitutions = lit.TestRunner.getDefaultSubstitutions(test, tmpDir, tmpBase) + substitutions.append(('%file_dependencies', ' '.join(data_files))) script = lit.TestRunner.applySubstitutions(script, substitutions) test_cxx = copy.deepcopy(self.cxx) @@ -162,7 +169,7 @@ def _execute(self, test, lit_config): return self._evaluate_fail_test(test, test_cxx, parsers) elif is_pass_test: return self._evaluate_pass_test(test, tmpBase, lit_config, - test_cxx, parsers) + test_cxx, parsers, data_files) else: # No other test type is supported assert False @@ -171,7 +178,7 @@ def _clean(self, exec_path): # pylint: disable=no-self-use libcxx.util.cleanFile(exec_path) def _evaluate_pass_test(self, test, tmpBase, lit_config, - test_cxx, parsers): + test_cxx, parsers, data_files): execDir = os.path.dirname(test.getExecPath()) source_path = test.getSourcePath() exec_path = tmpBase + '.exe' @@ -193,12 +200,6 @@ def _evaluate_pass_test(self, test, tmpBase, lit_config, env = None if self.exec_env: env = self.exec_env - # TODO: Only list actually needed files in file_deps. - # Right now we just mark all of the .dat files in the same - # directory as dependencies, but it's likely less than that. We - # should add a `// FILE-DEP: foo.dat` to each test to track this. - data_files = [os.path.join(local_cwd, f) - for f in os.listdir(local_cwd) if f.endswith('.dat')] is_flaky = self._get_parser('FLAKY_TEST.', parsers).getValue() max_retry = 3 if is_flaky else 1 for retry_count in range(max_retry): diff --git a/libcxx/utils/run.py b/libcxx/utils/run.py index fcfee96c69eb2..3e9a4703c8c7e 100644 --- a/libcxx/utils/run.py +++ b/libcxx/utils/run.py @@ -12,27 +12,44 @@ program's error code. """ +import argparse +import os import subprocess import sys def main(): - codesign_ident = sys.argv[1] - - # Ignore 'run.py' and the codesigning identity. - argv = sys.argv[2:] - - exec_path = argv[0] + parser = argparse.ArgumentParser() + parser.add_argument('--codesign_identity', type=str, required=False) + parser.add_argument('--working_directory', type=str, required=True) + parser.add_argument('--dependencies', type=str, nargs='*', required=True) + parser.add_argument('--env', type=str, nargs='*', required=True) + (args, remaining) = parser.parse_known_args(sys.argv[1:]) + + if len(remaining) < 2: + sys.stderr.write('Missing actual commands to run') + exit(1) + remaining = remaining[1:] # Skip the '--' # Do any necessary codesigning. 
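Putting the config.py, format.py and run.py pieces together: for a test that declares "// FILE_DEPENDENCIES: test.dat", the %run substitution now expands into an explicit run.py invocation that names the working directory, the dependent data files and the test environment, instead of listing every .dat file that happens to sit next to the test. Roughly (the directory, the empty codesign identity and the FOO variable are illustrative):

    python run.py --codesign_identity "" --working_directory "<dir of the test source>" \
        --dependencies "<dir of the test source>/test.dat" --env FOO=bar -- %t.exe

run.py exits with an error if any listed dependency is missing, and otherwise launches the test binary with exactly that environment and working directory.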
- if codesign_ident: - sign_cmd = ['xcrun', 'codesign', '-f', '-s', codesign_ident, exec_path] - cs_rc = subprocess.call(sign_cmd, env={}) - if cs_rc != 0: - sys.stderr.write('Failed to codesign: ' + exec_path) - return cs_rc - - return subprocess.call(argv) + if args.codesign_identity: + exe = remaining[0] + rc = subprocess.call(['xcrun', 'codesign', '-f', '-s', args.codesign_identity, exe], env={}) + if rc != 0: + sys.stderr.write('Failed to codesign: ' + exe) + return rc + + # Extract environment variables into a dictionary + env = {k : v for (k, v) in map(lambda s: s.split('=', 1), args.env)} + + # Ensure the file dependencies exist + for file in args.dependencies: + if not os.path.exists(file): + sys.stderr.write('Missing file {} marked as a dependency of a test'.format(file)) + exit(1) + + # Run the executable with the given environment in the given working directory + return subprocess.call(remaining, cwd=args.working_directory, env=env) if __name__ == '__main__': exit(main()) diff --git a/libcxx/www/cxx1z_status.html b/libcxx/www/cxx1z_status.html index 627f9a085cb2c..8f3f88310a74f 100644 --- a/libcxx/www/cxx1z_status.html +++ b/libcxx/www/cxx1z_status.html @@ -308,7 +308,7 @@

Library Working group Issues Status

2582§[res.on.functions]/2's prohibition against incomplete types shouldn't apply to type traitsJacksonvilleComplete 2583There is no way to supply an allocator for basic_string(str, pos)JacksonvilleComplete 2585forward_list::resize(size_type, const value_type&) effects incorrectJacksonvilleComplete - 2586Wrong value category used in scoped_allocator_adaptor::construct()Jacksonville + 2586Wrong value category used in scoped_allocator_adaptor::construct()JacksonvilleComplete 2590Aggregate initialization for std::arrayJacksonvilleComplete 2181Exceptions from seed sequence operationsOuluComplete diff --git a/libunwind/docs/BuildingLibunwind.rst b/libunwind/docs/BuildingLibunwind.rst index 7f42133a8a50e..ad9dee0519a82 100644 --- a/libunwind/docs/BuildingLibunwind.rst +++ b/libunwind/docs/BuildingLibunwind.rst @@ -57,8 +57,8 @@ build would look like this: $ cd where-you-want-libunwind-to-live $ # Check out llvm, and libunwind - $ ``svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm`` - $ ``svn co http://llvm.org/svn/llvm-project/libunwind/trunk libunwind`` + $ ``svn co https://llvm.org/svn/llvm-project/llvm/trunk llvm`` + $ ``svn co https://llvm.org/svn/llvm-project/libunwind/trunk libunwind`` $ cd where-you-want-to-build $ mkdir build && cd build $ export CC=clang CXX=clang++ diff --git a/libunwind/docs/index.rst b/libunwind/docs/index.rst index a4e21bb3c336c..d134bf29fb55b 100644 --- a/libunwind/docs/index.rst +++ b/libunwind/docs/index.rst @@ -71,8 +71,8 @@ Notes and Known Issues Getting Involved ================ -First please review our `Developer's Policy `__ -and `Getting started with LLVM `__. +First please review our `Developer's Policy `__ +and `Getting started with LLVM `__. **Bug Reports** @@ -84,7 +84,7 @@ Please include "libunwind" in your subject. **Patches** If you want to contribute a patch to libunwind, the best place for that is -`Phabricator `_. Please include [libunwind] in the subject and +`Phabricator `_. Please include [libunwind] in the subject and add `cfe-commits` as a subscriber. Also make sure you are subscribed to the `cfe-commits mailing list `_. @@ -97,7 +97,7 @@ Please include [libunwind] in the subject. Quick Links =========== -* `LLVM Homepage `_ +* `LLVM Homepage `_ * `LLVM Bugzilla `_ * `cfe-commits Mailing List`_ * `cfe-dev Mailing List`_ diff --git a/lld/COFF/CMakeLists.txt b/lld/COFF/CMakeLists.txt index 1d310f8419f55..4592ace373efa 100644 --- a/lld/COFF/CMakeLists.txt +++ b/lld/COFF/CMakeLists.txt @@ -14,6 +14,7 @@ add_lld_library(lldCOFF DriverUtils.cpp ICF.cpp InputFiles.cpp + LLDMapFile.cpp LTO.cpp MapFile.cpp MarkLive.cpp diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h index af86b69922729..e376ea51f133d 100644 --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -182,6 +182,9 @@ struct Configuration { llvm::StringMap order; // Used for /lldmap. + std::string lldmapFile; + + // Used for /map. 
std::string mapFile; // Used for /thinlto-index-only: diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index 5320b8b83ce0a..8b3ba1cdf24b5 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -707,14 +707,15 @@ static unsigned parseDebugTypes(const opt::InputArgList &args) { return debugTypes; } -static std::string getMapFile(const opt::InputArgList &args) { - auto *arg = args.getLastArg(OPT_lldmap, OPT_lldmap_file); +static std::string getMapFile(const opt::InputArgList &args, + opt::OptSpecifier os, opt::OptSpecifier osFile) { + auto *arg = args.getLastArg(os, osFile); if (!arg) return ""; - if (arg->getOption().getID() == OPT_lldmap_file) + if (arg->getOption().getID() == osFile.getID()) return arg->getValue(); - assert(arg->getOption().getID() == OPT_lldmap); + assert(arg->getOption().getID() == os.getID()); StringRef outFile = config->outputFile; return (outFile.substr(0, outFile.rfind('.')) + ".map").str(); } @@ -1564,7 +1565,14 @@ void LinkerDriver::link(ArrayRef argsArr) { if (config->mingw || config->debugDwarf) config->warnLongSectionNames = false; - config->mapFile = getMapFile(args); + config->lldmapFile = getMapFile(args, OPT_lldmap, OPT_lldmap_file); + config->mapFile = getMapFile(args, OPT_map, OPT_map_file); + + if (config->lldmapFile != "" && config->lldmapFile == config->mapFile) { + warn("/lldmap and /map have the same output file '" + config->mapFile + + "'.\n>>> ignoring /lldmap"); + config->lldmapFile.clear(); + } if (config->incremental && args.hasArg(OPT_profile)) { warn("ignoring '/incremental' due to '/profile' specification"); diff --git a/lld/COFF/LLDMapFile.cpp b/lld/COFF/LLDMapFile.cpp new file mode 100644 index 0000000000000..0e069724f8163 --- /dev/null +++ b/lld/COFF/LLDMapFile.cpp @@ -0,0 +1,123 @@ +//===- LLDMapFile.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the /lldmap option. It shows lists in order and +// hierarchically the output sections, input sections, input files and +// symbol: +// +// Address Size Align Out File Symbol +// 00201000 00000015 4 .text +// 00201000 0000000e 4 test.o:(.text) +// 0020100e 00000000 0 local +// 00201005 00000000 0 f(int) +// +//===----------------------------------------------------------------------===// + +#include "LLDMapFile.h" +#include "SymbolTable.h" +#include "Symbols.h" +#include "Writer.h" +#include "lld/Common/ErrorHandler.h" +#include "lld/Common/Threads.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace llvm::object; +using namespace lld; +using namespace lld::coff; + +using SymbolMapTy = + DenseMap>; + +static constexpr char indent8[] = " "; // 8 spaces +static constexpr char indent16[] = " "; // 16 spaces + +// Print out the first three columns of a line. +static void writeHeader(raw_ostream &os, uint64_t addr, uint64_t size, + uint64_t align) { + os << format("%08llx %08llx %5lld ", addr, size, align); +} + +// Returns a list of all symbols that we want to print out. 
+static std::vector getSymbols() { + std::vector v; + for (ObjFile *file : ObjFile::instances) + for (Symbol *b : file->getSymbols()) + if (auto *sym = dyn_cast_or_null(b)) + if (sym && !sym->getCOFFSymbol().isSectionDefinition()) + v.push_back(sym); + return v; +} + +// Returns a map from sections to their symbols. +static SymbolMapTy getSectionSyms(ArrayRef syms) { + SymbolMapTy ret; + for (DefinedRegular *s : syms) + ret[s->getChunk()].push_back(s); + + // Sort symbols by address. + for (auto &it : ret) { + SmallVectorImpl &v = it.second; + std::stable_sort(v.begin(), v.end(), [](DefinedRegular *a, DefinedRegular *b) { + return a->getRVA() < b->getRVA(); + }); + } + return ret; +} + +// Construct a map from symbols to their stringified representations. +static DenseMap +getSymbolStrings(ArrayRef syms) { + std::vector str(syms.size()); + parallelForEachN((size_t)0, syms.size(), [&](size_t i) { + raw_string_ostream os(str[i]); + writeHeader(os, syms[i]->getRVA(), 0, 0); + os << indent16 << toString(*syms[i]); + }); + + DenseMap ret; + for (size_t i = 0, e = syms.size(); i < e; ++i) + ret[syms[i]] = std::move(str[i]); + return ret; +} + +void lld::coff::writeLLDMapFile(ArrayRef outputSections) { + if (config->lldmapFile.empty()) + return; + + std::error_code ec; + raw_fd_ostream os(config->lldmapFile, ec, sys::fs::OF_None); + if (ec) + fatal("cannot open " + config->lldmapFile + ": " + ec.message()); + + // Collect symbol info that we want to print out. + std::vector syms = getSymbols(); + SymbolMapTy sectionSyms = getSectionSyms(syms); + DenseMap symStr = getSymbolStrings(syms); + + // Print out the header line. + os << "Address Size Align Out In Symbol\n"; + + // Print out file contents. + for (OutputSection *sec : outputSections) { + writeHeader(os, sec->getRVA(), sec->getVirtualSize(), /*align=*/pageSize); + os << sec->name << '\n'; + + for (Chunk *c : sec->chunks) { + auto *sc = dyn_cast(c); + if (!sc) + continue; + + writeHeader(os, sc->getRVA(), sc->getSize(), sc->getAlignment()); + os << indent8 << sc->file->getName() << ":(" << sc->getSectionName() + << ")\n"; + for (DefinedRegular *sym : sectionSyms[sc]) + os << symStr[sym] << '\n'; + } + } +} diff --git a/lld/COFF/LLDMapFile.h b/lld/COFF/LLDMapFile.h new file mode 100644 index 0000000000000..b731293a8625d --- /dev/null +++ b/lld/COFF/LLDMapFile.h @@ -0,0 +1,21 @@ +//===- LLDMapFile.h ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_COFF_LLDMAPFILE_H +#define LLD_COFF_LLDMAPFILE_H + +#include "llvm/ADT/ArrayRef.h" + +namespace lld { +namespace coff { +class OutputSection; +void writeLLDMapFile(llvm::ArrayRef outputSections); +} +} + +#endif diff --git a/lld/COFF/MapFile.cpp b/lld/COFF/MapFile.cpp index a80c553637aa1..0958a79b14029 100644 --- a/lld/COFF/MapFile.cpp +++ b/lld/COFF/MapFile.cpp @@ -6,16 +6,25 @@ // //===----------------------------------------------------------------------===// // -// This file implements the /lldmap option. 
It shows lists in order and -// hierarchically the output sections, input sections, input files and -// symbol: +// This file implements the /map option in the same format as link.exe +// (based on observations) // -// Address Size Align Out File Symbol -// 00201000 00000015 4 .text -// 00201000 0000000e 4 test.o:(.text) -// 0020100e 00000000 0 local -// 00201005 00000000 0 f(int) +// Header (program name, timestamp info, preferred load address) // +// Section list (Start = Section index:Base address): +// Start Length Name Class +// 0001:00001000 00000015H .text CODE +// +// Symbols list: +// Address Publics by Value Rva + Base Lib:Object +// 0001:00001000 main 0000000140001000 main.obj +// 0001:00001300 ?__scrt_common_main@@YAHXZ 0000000140001300 libcmt:exe_main.obj +// +// entry point at 0001:00000360 +// +// Static symbols +// +// 0000:00000000 __guard_fids__ 0000000140000000 libcmt : exe_main.obj //===----------------------------------------------------------------------===// #include "MapFile.h" @@ -24,6 +33,8 @@ #include "Writer.h" #include "lld/Common/ErrorHandler.h" #include "lld/Common/Threads.h" +#include "lld/Common/Timer.h" +#include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -31,56 +42,160 @@ using namespace llvm::object; using namespace lld; using namespace lld::coff; -using SymbolMapTy = - DenseMap>; +static Timer totalMapTimer("MAP emission (Cumulative)", Timer::root()); +static Timer symbolGatherTimer("Gather symbols", totalMapTimer); +static Timer symbolStringsTimer("Build symbol strings", totalMapTimer); +static Timer writeTimer("Write to file", totalMapTimer); -static constexpr char indent8[] = " "; // 8 spaces -static constexpr char indent16[] = " "; // 16 spaces +// Print out the first two columns of a line. +static void writeHeader(raw_ostream &os, uint32_t sec, uint64_t addr) { + os << format(" %04x:%08llx", sec, addr); +} -// Print out the first three columns of a line. -static void writeHeader(raw_ostream &os, uint64_t addr, uint64_t size, - uint64_t align) { - os << format("%08llx %08llx %5lld ", addr, size, align); +// Write the time stamp with the format used by link.exe +// It seems identical to strftime with "%c" on msvc build, but we need a +// locale-agnostic version. +static void writeFormattedTimestamp(raw_ostream &os, time_t tds) { + constexpr const char *const days[7] = {"Sun", "Mon", "Tue", "Wed", + "Thu", "Fri", "Sat"}; + constexpr const char *const months[12] = {"Jan", "Feb", "Mar", "Apr", + "May", "Jun", "Jul", "Aug", + "Sep", "Oct", "Nov", "Dec"}; + tm *time = localtime(&tds); + os << format("%s %s %2d %02d:%02d:%02d %d", days[time->tm_wday], + months[time->tm_mon], time->tm_mday, time->tm_hour, time->tm_min, + time->tm_sec, time->tm_year + 1900); } -// Returns a list of all symbols that we want to print out. 
-static std::vector getSymbols() { - std::vector v; - for (ObjFile *file : ObjFile::instances) - for (Symbol *b : file->getSymbols()) - if (auto *sym = dyn_cast_or_null(b)) - if (sym && !sym->getCOFFSymbol().isSectionDefinition()) - v.push_back(sym); - return v; +static void sortUniqueSymbols(std::vector &syms) { + // Build helper vector + using SortEntry = std::pair; + std::vector v; + v.resize(syms.size()); + for (size_t i = 0, e = syms.size(); i < e; ++i) + v[i] = SortEntry(syms[i], i); + + // Remove duplicate symbol pointers + parallelSort(v, std::less()); + auto end = std::unique(v.begin(), v.end(), + [](const SortEntry &a, const SortEntry &b) { + return a.first == b.first; + }); + v.erase(end, v.end()); + + // Sort by RVA then original order + parallelSort(v, [](const SortEntry &a, const SortEntry &b) { + // Add config->imageBase to avoid comparing "negative" RVAs. + // This can happen with symbols of Absolute kind + uint64_t rvaa = config->imageBase + a.first->getRVA(); + uint64_t rvab = config->imageBase + b.first->getRVA(); + return rvaa < rvab || (rvaa == rvab && a.second < b.second); + }); + + syms.resize(v.size()); + for (size_t i = 0, e = v.size(); i < e; ++i) + syms[i] = v[i].first; } -// Returns a map from sections to their symbols. -static SymbolMapTy getSectionSyms(ArrayRef syms) { - SymbolMapTy ret; - for (DefinedRegular *s : syms) - ret[s->getChunk()].push_back(s); - - // Sort symbols by address. - for (auto &it : ret) { - SmallVectorImpl &v = it.second; - std::sort(v.begin(), v.end(), [](DefinedRegular *a, DefinedRegular *b) { - return a->getRVA() < b->getRVA(); - }); +// Returns the lists of all symbols that we want to print out. +static void getSymbols(std::vector &syms, + std::vector &staticSyms) { + + for (ObjFile *file : ObjFile::instances) + for (Symbol *b : file->getSymbols()) { + if (!b || !b->isLive()) + continue; + if (auto *sym = dyn_cast(b)) { + COFFSymbolRef symRef = sym->getCOFFSymbol(); + if (!symRef.isSectionDefinition() && + symRef.getStorageClass() != COFF::IMAGE_SYM_CLASS_LABEL) { + if (symRef.getStorageClass() == COFF::IMAGE_SYM_CLASS_STATIC) + staticSyms.push_back(sym); + else + syms.push_back(sym); + } + } else if (auto *sym = dyn_cast(b)) { + syms.push_back(sym); + } + } + + for (ImportFile *file : ImportFile::instances) { + if (!file->live) + continue; + + if (!file->thunkSym) + continue; + + if (!file->thunkLive) + continue; + + if (auto *thunkSym = dyn_cast(file->thunkSym)) + syms.push_back(thunkSym); + + if (auto *impSym = dyn_cast_or_null(file->impSym)) + syms.push_back(impSym); } - return ret; + + sortUniqueSymbols(syms); + sortUniqueSymbols(staticSyms); } // Construct a map from symbols to their stringified representations. 
-static DenseMap -getSymbolStrings(ArrayRef syms) { +static DenseMap +getSymbolStrings(ArrayRef syms) { std::vector str(syms.size()); parallelForEachN((size_t)0, syms.size(), [&](size_t i) { raw_string_ostream os(str[i]); - writeHeader(os, syms[i]->getRVA(), 0, 0); - os << indent16 << toString(*syms[i]); + Defined *sym = syms[i]; + + uint16_t sectionIdx = 0; + uint64_t address = 0; + SmallString<128> fileDescr; + + if (auto *absSym = dyn_cast(sym)) { + address = absSym->getVA(); + fileDescr = ""; + } else if (isa(sym)) { + fileDescr = ""; + } else if (isa(sym)) { + fileDescr = ""; + } else if (Chunk *chunk = sym->getChunk()) { + address = sym->getRVA(); + if (OutputSection *sec = chunk->getOutputSection()) + address -= sec->header.VirtualAddress; + + sectionIdx = chunk->getOutputSectionIdx(); + + InputFile *file; + if (auto *impSym = dyn_cast(sym)) + file = impSym->file; + else if (auto *thunkSym = dyn_cast(sym)) + file = thunkSym->wrappedSym->file; + else + file = sym->getFile(); + + if (file) { + if (!file->parentName.empty()) { + fileDescr = sys::path::filename(file->parentName); + sys::path::replace_extension(fileDescr, ""); + fileDescr += ":"; + } + fileDescr += sys::path::filename(file->getName()); + } + } + writeHeader(os, sectionIdx, address); + os << " "; + os << left_justify(sym->getName(), 26); + os << " "; + os << format_hex_no_prefix((config->imageBase + sym->getRVA()), 16); + if (!fileDescr.empty()) { + os << " "; // FIXME : Handle "f" and "i" flags sometimes generated + // by link.exe in those spaces + os << fileDescr; + } }); - DenseMap ret; + DenseMap ret; for (size_t i = 0, e = syms.size(); i < e; ++i) ret[syms[i]] = std::move(str[i]); return ret; @@ -95,29 +210,113 @@ void lld::coff::writeMapFile(ArrayRef outputSections) { if (ec) fatal("cannot open " + config->mapFile + ": " + ec.message()); + ScopedTimer t1(totalMapTimer); + // Collect symbol info that we want to print out. - std::vector syms = getSymbols(); - SymbolMapTy sectionSyms = getSectionSyms(syms); - DenseMap symStr = getSymbolStrings(syms); + ScopedTimer t2(symbolGatherTimer); + std::vector syms; + std::vector staticSyms; + getSymbols(syms, staticSyms); + t2.stop(); - // Print out the header line. - os << "Address Size Align Out In Symbol\n"; + ScopedTimer t3(symbolStringsTimer); + DenseMap symStr = getSymbolStrings(syms); + DenseMap staticSymStr = getSymbolStrings(staticSyms); + t3.stop(); - // Print out file contents. - for (OutputSection *sec : outputSections) { - writeHeader(os, sec->getRVA(), sec->getVirtualSize(), /*align=*/pageSize); - os << sec->name << '\n'; + ScopedTimer t4(writeTimer); + SmallString<128> AppName = sys::path::filename(config->outputFile); + sys::path::replace_extension(AppName, ""); + // Print out the file header + os << " " << AppName << "\n"; + os << "\n"; + + os << " Timestamp is " << format_hex_no_prefix(config->timestamp, 8) << " ("; + if (config->repro) { + os << "Repro mode"; + } else { + writeFormattedTimestamp(os, config->timestamp); + } + os << ")\n"; + + os << "\n"; + os << " Preferred load address is " + << format_hex_no_prefix(config->imageBase, 16) << "\n"; + os << "\n"; + + // Print out section table. 
+ os << " Start Length Name Class\n"; + + for (OutputSection *sec : outputSections) { + // Merge display of chunks with same sectionName + std::vector> ChunkRanges; for (Chunk *c : sec->chunks) { auto *sc = dyn_cast(c); if (!sc) continue; - writeHeader(os, sc->getRVA(), sc->getSize(), sc->getAlignment()); - os << indent8 << sc->file->getName() << ":(" << sc->getSectionName() - << ")\n"; - for (DefinedRegular *sym : sectionSyms[sc]) - os << symStr[sym] << '\n'; + if (ChunkRanges.empty() || + c->getSectionName() != ChunkRanges.back().first->getSectionName()) { + ChunkRanges.emplace_back(sc, sc); + } else { + ChunkRanges.back().second = sc; + } + } + + const bool isCodeSection = + (sec->header.Characteristics & COFF::IMAGE_SCN_CNT_CODE) && + (sec->header.Characteristics & COFF::IMAGE_SCN_MEM_READ) && + (sec->header.Characteristics & COFF::IMAGE_SCN_MEM_EXECUTE); + StringRef SectionClass = (isCodeSection ? "CODE" : "DATA"); + + for (auto &cr : ChunkRanges) { + size_t size = + cr.second->getRVA() + cr.second->getSize() - cr.first->getRVA(); + + auto address = cr.first->getRVA() - sec->header.VirtualAddress; + writeHeader(os, sec->sectionIndex, address); + os << " " << format_hex_no_prefix(size, 8) << "H"; + os << " " << left_justify(cr.first->getSectionName(), 23); + os << " " << SectionClass; + os << '\n'; + } + } + + // Print out the symbols table (without static symbols) + os << "\n"; + os << " Address Publics by Value Rva+Base" + " Lib:Object\n"; + os << "\n"; + for (Defined *sym : syms) + os << symStr[sym] << '\n'; + + // Print out the entry point. + os << "\n"; + + uint16_t entrySecIndex = 0; + uint64_t entryAddress = 0; + + if (!config->noEntry) { + Defined *entry = dyn_cast_or_null(config->entry); + if (entry) { + Chunk *chunk = entry->getChunk(); + entrySecIndex = chunk->getOutputSectionIdx(); + entryAddress = + entry->getRVA() - chunk->getOutputSection()->header.VirtualAddress; } } + os << " entry point at "; + os << format("%04x:%08llx", entrySecIndex, entryAddress); + os << "\n"; + + // Print out the static symbols + os << "\n"; + os << " Static symbols\n"; + os << "\n"; + for (Defined *sym : staticSyms) + os << staticSymStr[sym] << '\n'; + + t4.stop(); + t1.stop(); } diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td index cea02e7a0042c..72fe9ce8c118c 100644 --- a/lld/COFF/Options.td +++ b/lld/COFF/Options.td @@ -226,6 +226,8 @@ defm threads: B<"threads", // Flags for debugging def lldmap : F<"lldmap">; def lldmap_file : Joined<["/", "-", "/?", "-?"], "lldmap:">; +def map : F<"map">; +def map_file : Joined<["/", "-", "/?", "-?"], "map:">; def show_timing : F<"time">; def summary : F<"summary">; diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 7e7aaafe18ed5..d5e2b59027b4f 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -10,6 +10,7 @@ #include "Config.h" #include "DLL.h" #include "InputFiles.h" +#include "LLDMapFile.h" #include "MapFile.h" #include "PDB.h" #include "SymbolTable.h" @@ -633,6 +634,7 @@ void Writer::run() { } writeBuildId(); + writeLLDMapFile(outputSections); writeMapFile(outputSections); if (errorCount()) diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index 0a041202f2788..89b099d492000 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -545,12 +545,6 @@ void ScriptParser::readSections() { v.end()); if (atEOF() || !consume("INSERT")) { - // --no-rosegment is used to avoid placing read only non-executable sections - // in their own segment. 
We do the same if SECTIONS command is present in - // linker script. See comment for computeFlags(). - // TODO This rule will be dropped in the future. - config->singleRoRx = true; - script->hasSectionsCommand = true; return; } @@ -848,9 +842,9 @@ OutputSection *ScriptParser::readOutputSectionDescription(StringRef outSec) { // We handle the FILL command as an alias for =fillexp section attribute, // which is different from what GNU linkers do. // https://sourceware.org/binutils/docs/ld/Output-Section-Data.html - expect("("); + if (peek() != "(") + setError("( expected, but got " + peek()); cmd->filler = readFill(); - expect(")"); } else if (tok == "SORT") { readSort(); } else if (tok == "INCLUDE") { @@ -905,8 +899,11 @@ OutputSection *ScriptParser::readOutputSectionDescription(StringRef outSec) { // When reading a hexstring, ld.bfd handles it as a blob of arbitrary // size, while ld.gold always handles it as a 32-bit big-endian number. // We are compatible with ld.gold because it's easier to implement. +// Also, we require that expressions with operators must be wrapped into +// round brackets. We did it to resolve the ambiguity when parsing scripts like: +// SECTIONS { .foo : { ... } =120+3 /DISCARD/ : { ... } } std::array ScriptParser::readFill() { - uint64_t value = readExpr()().val; + uint64_t value = readPrimary()().val; if (value > UINT32_MAX) setError("filler expression result does not fit 32-bit: 0x" + Twine::utohexstr(value)); diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index e172b708afcf0..15b04d6fe332b 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -108,12 +108,21 @@ StringRef getOutputSectionName(const InputSectionBase *s) { } } - // This check is for -z keep-text-section-prefix. This option separates text - // sections with prefix ".text.hot", ".text.unlikely", ".text.startup" or - // ".text.exit". - // When enabled, this allows identifying the hot code region (.text.hot) in - // the final binary which can be selectively mapped to huge pages or mlocked, - // for instance. + // A BssSection created for a common symbol is identified as "COMMON" in + // linker scripts. It should go to .bss section. + if (s->name == "COMMON") + return ".bss"; + + if (script->hasSectionsCommand) + return s->name; + + // When no SECTIONS is specified, emulate GNU ld's internal linker scripts + // by grouping sections with certain prefixes. + + // GNU ld places text sections with prefix ".text.hot.", ".text.unlikely.", + // ".text.startup." or ".text.exit." before others. We provide an option -z + // keep-text-section-prefix to group such sections into separate output + // sections. This is more flexible. See also sortISDBySectionOrder(). if (config->zKeepTextSectionPrefix) for (StringRef v : {".text.hot.", ".text.unlikely.", ".text.startup.", ".text.exit."}) @@ -127,11 +136,6 @@ StringRef getOutputSectionName(const InputSectionBase *s) { if (isSectionPrefix(v, s->name)) return v.drop_back(); - // CommonSection is identified as "COMMON" in linker scripts. - // By default, it should go to .bss section. 
- if (s->name == "COMMON") - return ".bss"; - return s->name; } diff --git a/lld/docs/AtomLLD.rst b/lld/docs/AtomLLD.rst index 614e568d19970..2766094696e0a 100644 --- a/lld/docs/AtomLLD.rst +++ b/lld/docs/AtomLLD.rst @@ -59,4 +59,4 @@ Indices and tables * :ref:`genindex` * :ref:`search` -__ http://llvm.org/docs/DeveloperPolicy.html#license +__ https://llvm.org/docs/DeveloperPolicy.html#license diff --git a/lld/docs/NewLLD.rst b/lld/docs/NewLLD.rst index 59a8f87ea86af..1b1c87067f512 100644 --- a/lld/docs/NewLLD.rst +++ b/lld/docs/NewLLD.rst @@ -248,7 +248,7 @@ Finally, the linker replaces bitcode symbols with ELF/COFF symbols, so that they are linked as if they were in the native format from the beginning. The details are described in this document. -http://llvm.org/docs/LinkTimeOptimization.html +https://llvm.org/docs/LinkTimeOptimization.html Glossary -------- diff --git a/lld/docs/design.rst b/lld/docs/design.rst index 1e111f979bb5a..20d8fe78a6412 100644 --- a/lld/docs/design.rst +++ b/lld/docs/design.rst @@ -326,7 +326,7 @@ Testing The lld project contains a test suite which is being built up as new code is added to lld. All new lld functionality should have a tests added to the test -suite. The test suite is `lit `_ driven. Each +suite. The test suite is `lit `_ driven. Each test is a text file with comments telling lit how to run the test and check the result To facilitate testing, the lld project builds a tool called lld-core. This tool reads a YAML file (default from stdin), parses it into one or more diff --git a/lld/docs/development.rst b/lld/docs/development.rst index ce91341d665f8..81b826f648351 100644 --- a/lld/docs/development.rst +++ b/lld/docs/development.rst @@ -6,7 +6,7 @@ Development Note: this document discuss Mach-O port of LLD. For ELF and COFF, see :doc:`index`. -lld is developed as part of the `LLVM `_ project. +lld is developed as part of the `LLVM `_ project. Creating a Reader ----------------- diff --git a/lld/docs/getting_started.rst b/lld/docs/getting_started.rst index a174f652e7363..506cb24dde845 100644 --- a/lld/docs/getting_started.rst +++ b/lld/docs/getting_started.rst @@ -6,7 +6,7 @@ Getting Started: Building and Running lld This page gives you the shortest path to checking out and building lld. If you run into problems, please file bugs in the `LLVM Bugzilla`__ -__ http://llvm.org/bugs/ +__ https://bugs.llvm.org/ Building lld ------------ @@ -84,4 +84,4 @@ More Information For more information on using CMake see the `LLVM CMake guide`_. -.. _LLVM CMake guide: http://llvm.org/docs/CMake.html +.. _LLVM CMake guide: https://llvm.org/docs/CMake.html diff --git a/lld/docs/index.rst b/lld/docs/index.rst index b001f884f0bde..5a3f1a211b7bc 100644 --- a/lld/docs/index.rst +++ b/lld/docs/index.rst @@ -98,7 +98,7 @@ Build If you have already checked out LLVM using SVN, you can check out LLD under ``tools`` directory just like you probably did for clang. For the details, see `Getting Started with the LLVM System -`_. +`_. 
If you haven't checked out LLVM, the easiest way to build LLD is to check out the entire LLVM projects/sub-projects from a git mirror and diff --git a/lld/test/COFF/Inputs/map.yaml b/lld/test/COFF/Inputs/map.yaml new file mode 100644 index 0000000000000..7a834344cc5b0 --- /dev/null +++ b/lld/test/COFF/Inputs/map.yaml @@ -0,0 +1,60 @@ +--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [] +sections: + - Name: .text + Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ] + Alignment: 4 + SectionData: 0000000000000000 + Relocations: + - VirtualAddress: 0 + SymbolName: exportfn1 + Type: IMAGE_REL_AMD64_ADDR32NB + - VirtualAddress: 4 + SymbolName: exportfn2 + Type: IMAGE_REL_AMD64_ADDR32NB +symbols: + - Name: .text + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + SectionDefinition: + Length: 8 + NumberOfRelocations: 2 + NumberOfLinenumbers: 0 + CheckSum: 0 + Number: 0 + - Name: main + Value: 0 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_FUNCTION + StorageClass: IMAGE_SYM_CLASS_EXTERNAL + - Name: exportfn1 + Value: 0 + SectionNumber: 0 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_EXTERNAL + - Name: exportfn2 + Value: 0 + SectionNumber: 0 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_EXTERNAL + - Name: absolute + Value: 0x00000042 + SectionNumber: -1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC + - Name: staticdef + Value: 0x00000043 + SectionNumber: 1 + SimpleType: IMAGE_SYM_TYPE_NULL + ComplexType: IMAGE_SYM_DTYPE_NULL + StorageClass: IMAGE_SYM_CLASS_STATIC +... 
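Editorial aside: for readers unfamiliar with the link.exe map layout that the new lld/COFF/MapFile.cpp code above reproduces, the following is a minimal standalone sketch of how one "Publics by Value" row is assembled with the same llvm::format / left_justify / format_hex_no_prefix helpers used in the patch. The function name, exact column padding, and the sample values are illustrative only and do not appear in the patch.

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Emit one symbol row in the link.exe-style layout, e.g.:
//  0001:00001000       main                       0000000140001000     main.obj
static void writeMapRow(raw_ostream &os, uint32_t sec, uint64_t off,
                        StringRef name, uint64_t imageBase, uint64_t rva,
                        StringRef libObj) {
  os << format(" %04x:%08llx", sec, off);                   // Section:Offset
  os << "       " << left_justify(name, 26);                // padded symbol name
  os << " " << format_hex_no_prefix(imageBase + rva, 16);   // Rva + Base column
  if (!libObj.empty())
    os << "     " << libObj;                                // Lib:Object column
  os << '\n';
}

int main() {
  // Hypothetical values; the linker derives them from the output sections
  // and the symbol table.
  writeMapRow(outs(), 1, 0x1000, "main", 0x140000000ULL, 0x1000, "main.obj");
}

Note that the "Rva + Base" column is simply the symbol's RVA added to the preferred load address (config->imageBase in the patch), which is why it matches the "Preferred load address is ..." header line emitted earlier in the map file.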
diff --git a/lld/test/COFF/lldmap.test b/lld/test/COFF/lldmap.test deleted file mode 100644 index d705a16c6c2a0..0000000000000 --- a/lld/test/COFF/lldmap.test +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: yaml2obj < %p/Inputs/ret42.yaml > %t.obj -# RUN: lld-link /out:%t.exe /entry:main /lldmap:%T/foo.map %t.obj -# RUN: FileCheck -strict-whitespace %s < %T/foo.map -# RUN: lld-link /out:%T/bar.exe /entry:main /lldmap %t.obj -# RUN: FileCheck -strict-whitespace %s < %T/bar.map - -# CHECK: Address Size Align Out In Symbol -# CHECK-NEXT: 00001000 00000006 4096 .text -# CHECK-NEXT: 00001000 00000006 16 {{.*}}lldmap.test.tmp.obj:(.text$mn) -# CHECK-NEXT: 00001000 00000000 0 main diff --git a/lld/test/COFF/map.test b/lld/test/COFF/map.test new file mode 100644 index 0000000000000..0ec27fb33bdce --- /dev/null +++ b/lld/test/COFF/map.test @@ -0,0 +1,40 @@ +# RUN: yaml2obj < %p/Inputs/export.yaml > %t-dll.obj +# RUN: lld-link /out:%t.dll /dll %t-dll.obj /implib:%t-dll.lib \ +# RUN: /export:exportfn1 /export:exportfn2 +# RUN: yaml2obj < %p/Inputs/map.yaml > %t.obj +# RUN: lld-link /out:%t.exe /entry:main %t.obj %t-dll.lib /map:%T/foo.map /lldmap +# RUN: FileCheck -check-prefix=MAP -strict-whitespace %s < %T/foo.map +# RUN: FileCheck -check-prefix=LLDMAP -strict-whitespace %s < %t.map +# RUN: lld-link /out:%t.exe /entry:main %t.obj %t-dll.lib /map /lldmap:%T/foo-lld.map +# RUN: FileCheck -check-prefix=MAP -strict-whitespace %s < %t.map +# RUN: FileCheck -check-prefix=LLDMAP -strict-whitespace %s < %T/foo-lld.map + +# MAP: {{.*}} +# MAP-EMPTY: +# MAP-NEXT: Timestamp is {{.*}} +# MAP-EMPTY: +# MAP-NEXT: Preferred load address is 0000000140000000 +# MAP-EMPTY: +# MAP-NEXT: Start Length Name Class +# MAP-NEXT: 0001:00000000 00000008H .text CODE +# MAP-EMPTY: +# MAP-NEXT: Address Publics by Value Rva+Base Lib:Object +# MAP-EMPTY: +# MAP-NEXT: 0000:00000042 absolute 0000000000000042 +# MAP-NEXT: 0001:00000000 main 0000000140001000 map.test.tmp.obj +# MAP-NEXT: 0001:00000010 exportfn1 0000000140001010 map.test.tmp-dll:map.test.tmp.dll +# MAP-NEXT: 0001:00000020 exportfn2 0000000140001020 map.test.tmp-dll:map.test.tmp.dll +# MAP-NEXT: 0002:00000040 __imp_exportfn1 0000000140002040 map.test.tmp-dll:map.test.tmp.dll +# MAP-NEXT: 0002:00000048 __imp_exportfn2 0000000140002048 map.test.tmp-dll:map.test.tmp.dll +# MAP-EMPTY: +# MAP-NEXT: entry point at 0001:00000000 +# MAP-EMPTY: +# MAP-NEXT: Static symbols +# MAP-EMPTY: +# MAP-NEXT: 0001:00000043 staticdef 0000000140001043 map.test.tmp.obj + + +# LLDMAP: Address Size Align Out In Symbol +# LLDMAP-NEXT: 00001000 00000026 4096 .text +# LLDMAP-NEXT: 00001000 00000008 4 {{.*}}map.test.tmp.obj:(.text) +# LLDMAP-NEXT: 00001000 00000000 0 main diff --git a/lld/test/ELF/arm-force-pi-thunk.s b/lld/test/ELF/arm-force-pi-thunk.s index 5504b4e19bd9d..582d1e4babdae 100644 --- a/lld/test/ELF/arm-force-pi-thunk.s +++ b/lld/test/ELF/arm-force-pi-thunk.s @@ -5,7 +5,7 @@ // RUN: .text_low : { *(.text_low) *(.text_low2) } \ // RUN: .text_high 0x2000000 : { *(.text_high) *(.text_high2) } \ // RUN: } " > %t.script -// RUN: ld.lld --pic-veneer --script %t.script %t -o %t2 +// RUN: ld.lld --pic-veneer --no-rosegment --script %t.script %t -o %t2 // RUN: llvm-objdump -d --triple=thumbv7a-none-linux-gnueabi %t2 | FileCheck %s // Test that we can force generation of position independent thunks even when diff --git a/lld/test/ELF/arm-thumb-thunk-v6m.s b/lld/test/ELF/arm-thumb-thunk-v6m.s index b97f9f767adf6..6036febf8e63d 100644 --- a/lld/test/ELF/arm-thumb-thunk-v6m.s +++ 
b/lld/test/ELF/arm-thumb-thunk-v6m.s @@ -5,9 +5,9 @@ // RUN: .text_low : { *(.text_low) *(.text_low2) } \ // RUN: .text_high 0x2000000 : { *(.text_high) *(.text_high2) } \ // RUN: } " > %t.script -// RUN: ld.lld --script %t.script %t -o %t2 +// RUN: ld.lld --no-rosegment --script %t.script %t -o %t2 // RUN: llvm-objdump -d %t2 --triple=armv6m-none-eabi | FileCheck %s -// RUN: ld.lld --script %t.script %t -o %t3 --pie +// RUN: ld.lld --no-rosegment --script %t.script %t -o %t3 --pie // RUN: llvm-objdump -d %t3 --triple=armv6m-none-eabi | FileCheck --check-prefix=CHECK-PI %s // Range extension thunks for Arm Architecture v6m. Only Thumb instructions diff --git a/lld/test/ELF/arm-thunk-linkerscript-dotexpr.s b/lld/test/ELF/arm-thunk-linkerscript-dotexpr.s index 1da6da5f33bab..62b5ea0ebc956 100644 --- a/lld/test/ELF/arm-thunk-linkerscript-dotexpr.s +++ b/lld/test/ELF/arm-thunk-linkerscript-dotexpr.s @@ -4,7 +4,7 @@ // RUN: . = SIZEOF_HEADERS; \ // RUN: .text_low : { *(.text_low) *(.text_low2) . = . + 0x2000000 ; *(.text_high) *(.text_high2) } \ // RUN: } " > %t.script -// RUN: ld.lld --script %t.script %t -o %t2 +// RUN: ld.lld --no-rosegment --script %t.script %t -o %t2 // RUN: llvm-objdump -d %t2 --start-address=148 --stop-address=188 --triple=thumbv7a-linux-gnueabihf | FileCheck --check-prefix=CHECK1 %s // RUN: llvm-objdump -d %t2 --start-address=33554620 --stop-address=33554654 --triple=thumbv7a-linux-gnueabihf | FileCheck --check-prefix=CHECK2 %s // Test that range extension thunks can handle location expressions within diff --git a/lld/test/ELF/arm-thunk-linkerscript.s b/lld/test/ELF/arm-thunk-linkerscript.s index 5de80fc0d5c40..a08aadc795e75 100644 --- a/lld/test/ELF/arm-thunk-linkerscript.s +++ b/lld/test/ELF/arm-thunk-linkerscript.s @@ -5,7 +5,7 @@ // RUN: .text_low : { *(.text_low) *(.text_low2) } \ // RUN: .text_high 0x2000000 : { *(.text_high) *(.text_high2) } \ // RUN: } " > %t.script -// RUN: ld.lld --script %t.script %t -o %t2 +// RUN: ld.lld --no-rosegment --script %t.script %t -o %t2 // RUN: llvm-objdump -d --triple=thumbv7a-none-linux-gnueabi %t2 | FileCheck %s // Simple test that we can support range extension thunks with linker scripts .syntax unified diff --git a/lld/test/ELF/linkerscript/absolute-expr.test b/lld/test/ELF/linkerscript/absolute-expr.test index 47016d3f5d880..6786c1d5807fb 100644 --- a/lld/test/ELF/linkerscript/absolute-expr.test +++ b/lld/test/ELF/linkerscript/absolute-expr.test @@ -53,7 +53,7 @@ SECTIONS { # CHECK-NEXT: } # CHECK-NEXT: Symbol { # CHECK-NEXT: Name: bar4 -# CHECK-NEXT: Value: 0x190 +# CHECK-NEXT: Value: # CHECK-NEXT: Size: 0 # CHECK-NEXT: Binding: Global # CHECK-NEXT: Type: None diff --git a/lld/test/ELF/linkerscript/align-empty.test b/lld/test/ELF/linkerscript/align-empty.test index 13edafe92eff7..0f212e877e672 100644 --- a/lld/test/ELF/linkerscript/align-empty.test +++ b/lld/test/ELF/linkerscript/align-empty.test @@ -15,8 +15,8 @@ SECTIONS { # CHECK: Sections: # CHECK-NEXT: Idx Name Size VMA # CHECK-NEXT: 0 00000000 0000000000000000 -# CHECK-NEXT: 1 .dynsym 00000018 0000000000000190 -# CHECK-NEXT: 2 .gnu.hash 0000001c 00000000000001a8 -# CHECK-NEXT: 3 .hash 00000010 00000000000001c4 -# CHECK-NEXT: 4 .dynstr 00000001 00000000000001d4 +# CHECK-NEXT: 1 .dynsym 00000018 00000000000001c8 +# CHECK-NEXT: 2 .gnu.hash 0000001c 00000000000001e0 +# CHECK-NEXT: 3 .hash 00000010 00000000000001fc +# CHECK-NEXT: 4 .dynstr 00000001 000000000000020c # CHECK-NEXT: 5 foo 00000001 0000000000001000 diff --git a/lld/test/ELF/linkerscript/at-self-reference.s 
b/lld/test/ELF/linkerscript/at-self-reference.s deleted file mode 100644 index 87b8cfb42b413..0000000000000 --- a/lld/test/ELF/linkerscript/at-self-reference.s +++ /dev/null @@ -1,63 +0,0 @@ -# REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t -# RUN: echo "SECTIONS { \ -# RUN: . = 0x1000; \ -# RUN: .aaa : AT(ADDR(.aaa)) { *(.aaa) } \ -# RUN: .bbb : AT(ADDR(.bbb)) { *(.bbb) } \ -# RUN: }" > %t.script -# RUN: ld.lld %t --script %t.script -o %t2 -# RUN: llvm-readobj -l %t2 | FileCheck %s - -# CHECK: ProgramHeaders [ -# CHECK-NEXT: ProgramHeader { -# CHECK-NEXT: Type: PT_LOAD (0x1) -# CHECK-NEXT: Offset: 0x1000 -# CHECK-NEXT: VirtualAddress: 0x1000 -# CHECK-NEXT: PhysicalAddress: 0x1000 -# CHECK-NEXT: FileSize: 3 -# CHECK-NEXT: MemSize: 3 -# CHECK-NEXT: Flags [ (0x5) -# CHECK-NEXT: PF_R (0x4) -# CHECK-NEXT: PF_X (0x1) -# CHECK-NEXT: ] -# CHECK-NEXT: Alignment: 4096 -# CHECK-NEXT: } -# CHECK-NEXT: ProgramHeader { -# CHECK-NEXT: Type: PT_LOAD (0x1) -# CHECK-NEXT: Offset: 0x1008 -# CHECK-NEXT: VirtualAddress: 0x1008 -# CHECK-NEXT: PhysicalAddress: 0x1008 -# CHECK-NEXT: FileSize: 9 -# CHECK-NEXT: MemSize: 9 -# CHECK-NEXT: Flags [ (0x5) -# CHECK-NEXT: PF_R (0x4) -# CHECK-NEXT: PF_X (0x1) -# CHECK-NEXT: ] -# CHECK-NEXT: Alignment: 4096 -# CHECK-NEXT: } -# CHECK-NEXT: ProgramHeader { -# CHECK-NEXT: Type: PT_GNU_STACK (0x6474E551) -# CHECK-NEXT: Offset: 0x0 -# CHECK-NEXT: VirtualAddress: 0x0 -# CHECK-NEXT: PhysicalAddress: 0x0 -# CHECK-NEXT: FileSize: 0 -# CHECK-NEXT: MemSize: 0 -# CHECK-NEXT: Flags [ (0x6) -# CHECK-NEXT: PF_R (0x4) -# CHECK-NEXT: PF_W (0x2) -# CHECK-NEXT: ] -# CHECK-NEXT: Alignment: 0 -# CHECK-NEXT: } -# CHECK-NEXT:] - -.global _start -_start: - nop - - -.section .aaa, "a" -.asciz "aa" - -.section .bbb, "a" -.align 8 -.quad 0 diff --git a/lld/test/ELF/linkerscript/at.s b/lld/test/ELF/linkerscript/at.s index 8240d004c9a85..f1681656c38b7 100644 --- a/lld/test/ELF/linkerscript/at.s +++ b/lld/test/ELF/linkerscript/at.s @@ -12,10 +12,11 @@ # RUN: llvm-readelf -l %t | FileCheck %s # CHECK: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align -# CHECK-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000002000 0x000010 0x000010 R E 0x1000 -# CHECK-NEXT: LOAD 0x001010 0x0000000000001010 0x0000000000003000 0x000008 0x000008 R E 0x1000 -# CHECK-NEXT: LOAD 0x001018 0x0000000000001018 0x0000000000004000 0x000008 0x000008 R E 0x1000 -# CHECK-NEXT: LOAD 0x002000 0x0000000000005000 0x0000000000005000 0x000009 0x000009 R E 0x1000 +# CHECK-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000002000 0x000010 0x000010 R 0x1000 +# CHECK-NEXT: LOAD 0x001010 0x0000000000001010 0x0000000000003000 0x000008 0x000008 R 0x1000 +# CHECK-NEXT: LOAD 0x001018 0x0000000000001018 0x0000000000004000 0x000008 0x000008 R 0x1000 +# CHECK-NEXT: LOAD 0x002000 0x0000000000005000 0x0000000000005000 0x000008 0x000008 R 0x1000 +# CHECK-NEXT: LOAD 0x002008 0x0000000000005008 0x0000000000005008 0x000001 0x000001 R E 0x1000 .global _start _start: diff --git a/lld/test/ELF/linkerscript/at4.s b/lld/test/ELF/linkerscript/at4.s index c4500e31ada85..a52a33e5cee4e 100644 --- a/lld/test/ELF/linkerscript/at4.s +++ b/lld/test/ELF/linkerscript/at4.s @@ -10,8 +10,9 @@ # RUN: llvm-readelf -l %t | FileCheck %s # CHECK: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align -# CHECK-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000001000 0x000008 0x000008 R E 0x1000 -# CHECK-NEXT: LOAD 0x001008 0x0000000000001008 0x0000000000002008 0x000011 0x000011 R E 0x1000 +# CHECK-NEXT: LOAD 0x001000 0x0000000000001000 
0x0000000000001000 0x000008 0x000008 R 0x1000 +# CHECK-NEXT: LOAD 0x001008 0x0000000000001008 0x0000000000002008 0x000010 0x000010 R 0x1000 +# CHECK-NEXT: LOAD 0x001018 0x0000000000001018 0x0000000000001018 0x000001 0x000001 R E 0x1000 .global _start _start: diff --git a/lld/test/ELF/linkerscript/common-assign.s b/lld/test/ELF/linkerscript/common-assign.s index 4b8ad2cf3b240..ef0ad14ce92dd 100644 --- a/lld/test/ELF/linkerscript/common-assign.s +++ b/lld/test/ELF/linkerscript/common-assign.s @@ -6,7 +6,7 @@ # CHECK: Symbol { # CHECK: Name: bar -# CHECK-NEXT: Value: 0x134 +# CHECK-NEXT: Value: [[BAR:.*]] # CHECK-NEXT: Size: 4 # CHECK-NEXT: Binding: Global # CHECK-NEXT: Type: Object @@ -15,7 +15,7 @@ # CHECK-NEXT: } # CHECK-NEXT: Symbol { # CHECK-NEXT: Name: foo -# CHECK-NEXT: Value: 0x138 +# CHECK-NEXT: Value: [[FOO:.*]] # CHECK-NEXT: Size: 4 # CHECK-NEXT: Binding: Global # CHECK-NEXT: Type: Object @@ -24,7 +24,7 @@ # CHECK-NEXT: } # CHECK-NEXT: Symbol { # CHECK-NEXT: Name: pfoo -# CHECK-NEXT: Value: 0x138 +# CHECK-NEXT: Value: [[FOO]] # CHECK-NEXT: Size: 0 # CHECK-NEXT: Binding: Global # CHECK-NEXT: Type: None @@ -33,7 +33,7 @@ # CHECK-NEXT: } # CHECK-NEXT: Symbol { # CHECK-NEXT: Name: pbar -# CHECK-NEXT: Value: 0x134 +# CHECK-NEXT: Value: [[BAR]] # CHECK-NEXT: Size: 0 # CHECK-NEXT: Binding: Global # CHECK-NEXT: Type: None diff --git a/lld/test/ELF/linkerscript/data-commands-gc.s b/lld/test/ELF/linkerscript/data-commands-gc.s index 0add6ba27e94d..0262d7334e3e1 100644 --- a/lld/test/ELF/linkerscript/data-commands-gc.s +++ b/lld/test/ELF/linkerscript/data-commands-gc.s @@ -4,7 +4,7 @@ # RUN: ld.lld --gc-sections -o %t %t.o --script %t.script # RUN: llvm-objdump -t %t | FileCheck %s -# CHECK: 0000000000000008 g .rodata 0000000000000000 bar +# CHECK: 0000000000000008 g .rodata.bar 0000000000000000 bar .section .rodata.bar .quad 0x1122334455667788 diff --git a/lld/test/ELF/linkerscript/double-bss.test b/lld/test/ELF/linkerscript/double-bss.test index 59eb4cb1d6152..7a7a6519db491 100644 --- a/lld/test/ELF/linkerscript/double-bss.test +++ b/lld/test/ELF/linkerscript/double-bss.test @@ -2,9 +2,9 @@ # RUN: echo '.short 0; .bss; .zero 4; .comm q,128,8' \ # RUN: | llvm-mc -filetype=obj -triple=x86_64-unknown-linux - -o %t # RUN: ld.lld -o %t1 --script %s %t -# RUN: llvm-objdump --section-headers %t1 | FileCheck %s -# CHECK: .bss1 00000004 0000000000000122 BSS -# CHECK-NEXT: .bss2 00000080 0000000000000128 BSS +# RUN: llvm-readelf -S %t1 | FileCheck %s +# CHECK: .bss1 NOBITS +# CHECK-NEXT: .bss2 NOBITS SECTIONS { . 
= SIZEOF_HEADERS; diff --git a/lld/test/ELF/linkerscript/extend-pt-load1.test b/lld/test/ELF/linkerscript/extend-pt-load1.test index c39e0244ec2c5..63c6f0d9ca8e0 100644 --- a/lld/test/ELF/linkerscript/extend-pt-load1.test +++ b/lld/test/ELF/linkerscript/extend-pt-load1.test @@ -16,8 +16,8 @@ SECTIONS { .data.rel.ro : { *(.data.rel.ro) } } -# CHECK: .text PROGBITS 00000000000001bc 0001bc 000001 00 AX +# CHECK: .text PROGBITS 00000000000001f4 0001f4 000001 00 AX # CHECK-NEXT: .data.rel.ro PROGBITS 0000000000001000 001000 000001 00 WA -# CHECK: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x0001bd 0x0001bd R E -# CHECK-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000001000 0x000068 0x000068 RW +# CHECK: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x0001f1 0x0001f1 R 0x1000 +# CHECK-NEXT: LOAD 0x0001f4 0x00000000000001f4 0x00000000000001f4 0x000001 0x000001 R E 0x1000 diff --git a/lld/test/ELF/linkerscript/extend-pt-load2.test b/lld/test/ELF/linkerscript/extend-pt-load2.test index 4fbeb505b1786..3e91f3b4f162d 100644 --- a/lld/test/ELF/linkerscript/extend-pt-load2.test +++ b/lld/test/ELF/linkerscript/extend-pt-load2.test @@ -16,9 +16,10 @@ SECTIONS { .data.rel.ro : { *(.data.rel.ro) } } -# CHECK: .text PROGBITS 00000000000001bc 0001bc 000001 00 AX -# CHECK-NEXT: bar PROGBITS 00000000000001bd 0001bd 000e43 00 AX +# CHECK: .text PROGBITS 00000000000001f4 0001f4 000001 00 AX +# CHECK-NEXT: bar PROGBITS 00000000000001f5 0001f5 000e0b 00 AX # CHECK-NEXT: .data.rel.ro PROGBITS 0000000000001000 001000 000001 00 WA -# CHECK: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x001000 0x001000 R E -# CHECK-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000001000 0x000068 0x000068 RW +# CHECK: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x0001f1 0x0001f1 R 0x1000 +# CHECK: LOAD 0x0001f4 0x00000000000001f4 0x00000000000001f4 0x000e0c 0x000e0c R E 0x1000 +# CHECK-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000001000 0x000068 0x000068 RW 0x1000 diff --git a/lld/test/ELF/linkerscript/extend-pt-load3.test b/lld/test/ELF/linkerscript/extend-pt-load3.test index 9dd4ab8e0c43f..072fe90a6461c 100644 --- a/lld/test/ELF/linkerscript/extend-pt-load3.test +++ b/lld/test/ELF/linkerscript/extend-pt-load3.test @@ -17,8 +17,9 @@ SECTIONS { .data.rel.ro : { *(.data.rel.ro) } } -# CHECK: .text PROGBITS 00000000000001bc 0001bc 000001 00 AX +# CHECK: .text PROGBITS 00000000000001f4 0001f4 000001 00 AX # CHECK-NEXT: .data.rel.ro PROGBITS 0000000000001000 001000 000001 00 WA -# CHECK: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x0001bd 0x0001bd R E -# CHECK-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000001000 0x000068 0x000068 RW \ No newline at end of file +# CHECK: LOAD 0x000000 0x0000000000000000 0x0000000000000000 0x0001f1 0x0001f1 R 0x1000 +# CHECK-NEXT: LOAD 0x0001f4 0x00000000000001f4 0x00000000000001f4 0x000001 0x000001 R E 0x1000 +# CHECK-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000001000 0x000068 0x000068 RW 0x1000 diff --git a/lld/test/ELF/linkerscript/icf-output-sections.s b/lld/test/ELF/linkerscript/icf-output-sections.s index f23d7fff06b08..ae9203abaea49 100644 --- a/lld/test/ELF/linkerscript/icf-output-sections.s +++ b/lld/test/ELF/linkerscript/icf-output-sections.s @@ -28,13 +28,19 @@ # SEC2: .text.foo PROGBITS 0000000000000000 001000 000001 # SEC2-NEXT: .text.bar PROGBITS 0000000000000001 001001 000001 -## .text.bar* are orphans that get assigned to .text. +## .text.bar* are orphan sections. 
# RUN: echo 'SECTIONS { .text.foo : {*(.text.foo*)} }' > %t3.script -# RUN: ld.lld %t.o --script %t3.script --icf=all --print-icf-sections -o %t | FileCheck --check-prefix=ICF2 %s -# RUN: llvm-readelf -S %t | FileCheck --check-prefix=SEC3 %s +# RUN: ld.lld %t.o -T %t3.script --icf=all --print-icf-sections -o %t3 | FileCheck --check-prefix=ICF3 %s +# RUN: llvm-readelf -S %t3 | FileCheck --check-prefix=SEC3 %s +# ICF3: selected section {{.*}}.o:(.text.foo0) +# ICF3-NEXT: removing identical section {{.*}}.o:(.text.foo1) + +# SEC3: Name Type Address Off Size # SEC3: .text.foo PROGBITS 0000000000000000 001000 000001 -# SEC3-NEXT: .text PROGBITS 0000000000000004 001004 000001 +# SEC3-NEXT: .text PROGBITS 0000000000000004 001004 000000 +# SEC3-NEXT: .text.bar0 PROGBITS 0000000000000004 001004 000001 +# SEC3-NEXT: .text.bar1 PROGBITS 0000000000000005 001005 000001 .section .text.foo0,"ax" ret diff --git a/lld/test/ELF/linkerscript/linkorder.s b/lld/test/ELF/linkerscript/linkorder.s index 44547b8ab0029..042e8b3293feb 100644 --- a/lld/test/ELF/linkerscript/linkorder.s +++ b/lld/test/ELF/linkerscript/linkorder.s @@ -1,11 +1,11 @@ # REQUIRES: x86 # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o -# RUN: echo "SECTIONS { .text : { *(.text.bar) *(.text.foo) } }" > %t.script +# RUN: echo "SECTIONS { .rodata : {*(.rodata*)} .text : {*(.text.bar) *(.text.foo)} }" > %t.script # RUN: ld.lld -o %t --script %t.script %t.o # RUN: llvm-objdump -s %t | FileCheck %s -# RUN: echo "SECTIONS { .text : { *(.text.foo) *(.text.bar) } }" > %t.script +# RUN: echo "SECTIONS { .rodata : {*(.rodata*)} .text : {*(.text.foo) *(.text.bar)} }" > %t.script # RUN: ld.lld -o %t --script %t.script %t.o # RUN: llvm-objdump -s %t | FileCheck --check-prefix=INV %s diff --git a/lld/test/ELF/linkerscript/linkorder2.s b/lld/test/ELF/linkerscript/linkorder2.s index 4a538b6190e70..5b2eeea08abb3 100644 --- a/lld/test/ELF/linkerscript/linkorder2.s +++ b/lld/test/ELF/linkerscript/linkorder2.s @@ -1,6 +1,6 @@ # REQUIRES: x86 # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o -# RUN: echo "SECTIONS { .text : { *(.text.*) } }" > %t.script +# RUN: echo 'SECTIONS { .rodata : {*(.rodata.*)} .text : {*(.text.*)} }' > %t.script # RUN: echo "_bar" > %t.ord # RUN: echo "_foo" >> %t.ord diff --git a/lld/test/ELF/linkerscript/loadaddr.s b/lld/test/ELF/linkerscript/loadaddr.s index 055b7422baeb5..e2c82fc6c8cb5 100644 --- a/lld/test/ELF/linkerscript/loadaddr.s +++ b/lld/test/ELF/linkerscript/loadaddr.s @@ -22,7 +22,7 @@ # CHECK-NEXT: 0000000000002008 g *ABS* 0000000000000000 bbb_lma # CHECK-NEXT: 0000000000003000 g *ABS* 0000000000000000 ccc_lma # CHECK-NEXT: 0000000000004000 g *ABS* 0000000000000000 ddd_lma -# CHECK-NEXT: 0000000000004008 g *ABS* 0000000000000000 txt_lma +# CHECK-NEXT: 0000000000001020 g *ABS* 0000000000000000 txt_lma # ERROR: {{.*}}.script:1: undefined section .zzz .global _start diff --git a/lld/test/ELF/linkerscript/map-file2.test b/lld/test/ELF/linkerscript/map-file2.test index 535043282249f..f527e8ecdf80f 100644 --- a/lld/test/ELF/linkerscript/map-file2.test +++ b/lld/test/ELF/linkerscript/map-file2.test @@ -32,10 +32,10 @@ SECTIONS { # CHECK-NEXT: 1219 3209 8 1 {{.*}}{{/|\\}}map-file2.test.tmp.o:(.ddd) # CHECK-NEXT: 1228 3218 34 8 .eh_frame # CHECK-NEXT: 1228 3218 30 1 {{.*}}{{/|\\}}map-file2.test.tmp.o:(.eh_frame+0x0) -# CHECK-NEXT: 125c 324c 1 4 .text -# CHECK-NEXT: 125c 324c 1 4 {{.*}}{{/|\\}}map-file2.test.tmp.o:(.text) -# CHECK-NEXT: 125c 324c 0 1 f(int) -# CHECK-NEXT: 125c 324c 0 1 _start +# 
CHECK-NEXT: 125c 125c 1 4 .text +# CHECK-NEXT: 125c 125c 1 4 {{.*}}{{/|\\}}map-file2.test.tmp.o:(.text) +# CHECK-NEXT: 125c 125c 0 1 f(int) +# CHECK-NEXT: 125c 125c 0 1 _start # CHECK-NEXT: 0 0 8 1 .comment # CHECK-NEXT: 0 0 8 1 :(.comment) # CHECK-NEXT: 0 0 48 8 .symtab diff --git a/lld/test/ELF/linkerscript/memory3.s b/lld/test/ELF/linkerscript/memory3.s index b9d609e59e31c..56fc36d67a313 100644 --- a/lld/test/ELF/linkerscript/memory3.s +++ b/lld/test/ELF/linkerscript/memory3.s @@ -18,6 +18,6 @@ # CHECK: 0 00000000 0000000000000000 # CHECK: 1 .text 00000001 0000000000001000 -.section .text.foo,"ax",%progbits +.section .text,"ax",%progbits foo: nop diff --git a/lld/test/ELF/linkerscript/merge-header-load.s b/lld/test/ELF/linkerscript/merge-header-load.s index 5fb866abef855..d067c40d67ab3 100644 --- a/lld/test/ELF/linkerscript/merge-header-load.s +++ b/lld/test/ELF/linkerscript/merge-header-load.s @@ -12,8 +12,9 @@ # CHECK: Program Headers: # CHECK: Type Offset VirtAddr PhysAddr -# CHECK-NEXT: PHDR 0x000040 0xffffffff80000040 0x0000000000004040 -# CHECK-NEXT: LOAD 0x000000 0xffffffff80000000 0x0000000000004000 +# CHECK-NEXT: PHDR 0x000040 0xffffffff80000040 0xffffffff80000040 +# CHECK-NEXT: LOAD 0x000000 0xffffffff80000000 0xffffffff80000000 +# CHECK-NEXT: LOAD 0x000200 0xffffffff80000200 0x0000000000004200 # CHECK-NOT: LOAD .global _start diff --git a/lld/test/ELF/linkerscript/merge-sections-syms.s b/lld/test/ELF/linkerscript/merge-sections-syms.s index 421749b6f1b98..ed9e7d3d523fe 100644 --- a/lld/test/ELF/linkerscript/merge-sections-syms.s +++ b/lld/test/ELF/linkerscript/merge-sections-syms.s @@ -6,38 +6,10 @@ # RUN: .rodata : { *(.aaa) *(.bbb) A = .; *(.ccc) B = .; } \ # RUN: }" > %t.script # RUN: ld.lld -o %t.so --script %t.script %t.o -shared -# RUN: llvm-readobj --dyn-symbols %t.so | FileCheck %s - -# CHECK: DynamicSymbols [ -# CHECK-NEXT: Symbol { -# CHECK-NEXT: Name: -# CHECK-NEXT: Value: -# CHECK-NEXT: Size: -# CHECK-NEXT: Binding: -# CHECK-NEXT: Type: -# CHECK-NEXT: Other: -# CHECK-NEXT: Section: -# CHECK-NEXT: } -# CHECK-NEXT: Symbol { -# CHECK-NEXT: Name: A -# CHECK-NEXT: Value: 0x226 -# CHECK-NEXT: Size: -# CHECK-NEXT: Binding: -# CHECK-NEXT: Type: -# CHECK-NEXT: Other: -# CHECK-NEXT: Section: -# CHECK-NEXT: } -# CHECK-NEXT: Symbol { -# CHECK-NEXT: Name: B -# CHECK-NEXT: Value: 0x227 -# CHECK-NEXT: Size: -# CHECK-NEXT: Binding: -# CHECK-NEXT: Type: -# CHECK-NEXT: Other: -# CHECK-NEXT: Section: -# CHECK-NEXT: } -# CHECK-NEXT: ] +# RUN: llvm-nm -D %t.so | FileCheck %s +# CHECK: 000000000000025e R A +# CHECK: 000000000000025f R B .section .aaa,"a" .byte 11 diff --git a/lld/test/ELF/linkerscript/merge-sections.s b/lld/test/ELF/linkerscript/merge-sections.s index ea53ba3e4201a..366961051e6fd 100644 --- a/lld/test/ELF/linkerscript/merge-sections.s +++ b/lld/test/ELF/linkerscript/merge-sections.s @@ -28,7 +28,7 @@ # CHECK-NEXT: Value: 0x[[ADDR1]] # CHECK: Name: end -# CHECK-NEXT: Value: 0x230 +# CHECK-NEXT: Value: 0x268 # Check that we don't crash with --gc-sections # RUN: ld.lld --gc-sections -o %t2 --script %t.script %t -shared diff --git a/lld/test/ELF/linkerscript/noload.s b/lld/test/ELF/linkerscript/noload.s index 18b509134afc0..eb6ace45adcd7 100644 --- a/lld/test/ELF/linkerscript/noload.s +++ b/lld/test/ELF/linkerscript/noload.s @@ -5,53 +5,14 @@ # RUN: .data_noload_b (0x10000) (NOLOAD) : { *(.data_noload_b) } \ # RUN: .text (0x20000) : { *(.text) } };" > %t.script # RUN: ld.lld -o %t --script %t.script %t.o -# RUN: llvm-readobj --sections -l %t | FileCheck %s +# RUN: 
llvm-readelf -S -l %t | FileCheck %s -# CHECK: Section { -# CHECK: Index: 1 -# CHECK-NEXT: Name: .data_noload_a -# CHECK-NEXT: Type: SHT_NOBITS -# CHECK-NEXT: Flags [ -# CHECK-NEXT: SHF_ALLOC -# CHECK-NEXT: SHF_WRITE -# CHECK-NEXT: ] -# CHECK-NEXT: Address: 0x0 -# CHECK-NEXT: Offset: 0xE8 -# CHECK-NEXT: Size: 4096 -# CHECK-NEXT: Link: 0 -# CHECK-NEXT: Info: 0 -# CHECK-NEXT: AddressAlignment: 1 -# CHECK-NEXT: EntrySize: 0 -# CHECK-NEXT: } -# CHECK-NEXT: Section { -# CHECK-NEXT: Index: 2 -# CHECK-NEXT: Name: .data_noload_b -# CHECK-NEXT: Type: SHT_NOBITS -# CHECK-NEXT: Flags [ -# CHECK-NEXT: SHF_ALLOC -# CHECK-NEXT: SHF_WRITE -# CHECK-NEXT: ] -# CHECK-NEXT: Address: 0x10000 -# CHECK-NEXT: Offset: 0xE8 -# CHECK-NEXT: Size: 4096 -# CHECK-NEXT: Link: 0 -# CHECK-NEXT: Info: 0 -# CHECK-NEXT: AddressAlignment: 1 -# CHECK-NEXT: EntrySize: 0 -# CHECK-NEXT: } -# CHECK: ProgramHeader { -# CHECK-NEXT: Type: PT_LOAD (0x1) -# CHECK-NEXT: Offset: 0x1000 -# CHECK-NEXT: VirtualAddress: 0x20000 -# CHECK-NEXT: PhysicalAddress: 0x20000 -# CHECK-NEXT: FileSize: 1 -# CHECK-NEXT: MemSize: 1 -# CHECK-NEXT: Flags [ (0x5) -# CHECK-NEXT: PF_R (0x4) -# CHECK-NEXT: PF_X (0x1) -# CHECK-NEXT: ] -# CHECK-NEXT: Alignment: 4096 -# CHECK-NEXT: } +# CHECK: Name Type Address Off Size +# CHECK: .data_noload_a NOBITS 0000000000000000 [[OFF:[0-9a-f]+]] 001000 +# CHECK-NEXT: .data_noload_b NOBITS 0000000000010000 [[OFF]] 001000 + +# CHECK: Type Offset VirtAddr PhysAddr +# CHECK-NEXT: LOAD 0x001000 0x0000000000020000 0x0000000000020000 .section .text,"ax",@progbits nop diff --git a/lld/test/ELF/linkerscript/non-alloc.s b/lld/test/ELF/linkerscript/non-alloc.s index e6fb84d17d41a..aa1a3ac7ca688 100644 --- a/lld/test/ELF/linkerscript/non-alloc.s +++ b/lld/test/ELF/linkerscript/non-alloc.s @@ -10,13 +10,15 @@ # CHECK: Program Headers: # CHECK-NEXT: Type +# CHECK-NEXT: LOAD {{.*}} R # CHECK-NEXT: LOAD {{.*}} R E # CHECK-NEXT: LOAD {{.*}} RW # CHECK: Section to Segment mapping: # CHECK-NEXT: Segment Sections... -# CHECK-NEXT: 00 .dynsym .hash .dynstr .text -# CHECK-NEXT: 01 .dynamic +# CHECK-NEXT: 00 .dynsym .hash .dynstr +# CHECK-NEXT: 01 .text +# CHECK-NEXT: 02 .dynamic nop .section foo diff --git a/lld/test/ELF/linkerscript/orphan-align.s b/lld/test/ELF/linkerscript/orphan-align.s index edd637b928201..b866beaa898bb 100644 --- a/lld/test/ELF/linkerscript/orphan-align.s +++ b/lld/test/ELF/linkerscript/orphan-align.s @@ -6,7 +6,7 @@ # RUN: . 
= ALIGN(0x1000); \ # RUN: .data.rel.ro : { *(.data.rel.ro) } \ # RUN: }" > %t.script -# RUN: ld.lld -o %t -T %t.script %t.o -shared +# RUN: ld.lld -o %t -T %t.script %t.o -shared --no-rosegment # RUN: llvm-readobj -l %t | FileCheck %s diff --git a/lld/test/ELF/linkerscript/orphan-report.s b/lld/test/ELF/linkerscript/orphan-report.s index 5203a6d20de2e..3dca23267ec64 100644 --- a/lld/test/ELF/linkerscript/orphan-report.s +++ b/lld/test/ELF/linkerscript/orphan-report.s @@ -24,7 +24,7 @@ # RUN: %t.o 2>&1 | FileCheck %s --check-prefixes=COMMON,DYNSYM,SYMTAB # COMMON: {{.*}}.o:(.text) is being placed in '.text' -# COMMON-NEXT: {{.*}}.o:(.text.2) is being placed in '.text' +# COMMON-NEXT: {{.*}}.o:(.text.2) is being placed in '.text.2' # COMMON-NEXT: :(.comment) is being placed in '.comment' # DYNSYM-NEXT: :(.dynsym) is being placed in '.dynsym' # DYNSYM-NEXT: :(.gnu.hash) is being placed in '.gnu.hash' diff --git a/lld/test/ELF/linkerscript/overlapping-sections.s b/lld/test/ELF/linkerscript/overlapping-sections.s index a6c63d5d8d0e7..ad59aa2d472fe 100644 --- a/lld/test/ELF/linkerscript/overlapping-sections.s +++ b/lld/test/ELF/linkerscript/overlapping-sections.s @@ -5,15 +5,15 @@ # RUN: .sec1 0x8000 : AT(0x8000) { sec1_start = .; *(.first_sec) sec1_end = .;} \ # RUN: .sec2 0x8800 : AT(0x8080) { sec2_start = .; *(.second_sec) sec2_end = .;} \ # RUN: }" > %t-lma.script -# RUN: not ld.lld -o /dev/null --script %t-lma.script %t.o -shared 2>&1 | FileCheck %s -check-prefix LMA-OVERLAP-ERR +# RUN: not ld.lld -o /dev/null -T %t-lma.script %t.o -shared --no-rosegment 2>&1 | FileCheck %s -check-prefix LMA-OVERLAP-ERR # LMA-OVERLAP-ERR: error: section .sec1 load address range overlaps with .sec2 # LMA-OVERLAP-ERR-NEXT: >>> .sec1 range is [0x8000, 0x80FF] # LMA-OVERLAP-ERR-NEXT: >>> .sec2 range is [0x8080, 0x817F] # Check that we create the expected binary with --noinhibit-exec or --no-check-sections: -# RUN: ld.lld -o %t.so --script %t-lma.script %t.o -shared --noinhibit-exec -# RUN: ld.lld -o %t.so --script %t-lma.script %t.o -shared --no-check-sections -fatal-warnings -# RUN: ld.lld -o %t.so --script %t-lma.script %t.o -shared --check-sections --no-check-sections -fatal-warnings +# RUN: ld.lld -o %t.so --script %t-lma.script %t.o -shared --no-rosegment --noinhibit-exec +# RUN: ld.lld -o %t.so --script %t-lma.script %t.o -shared --no-rosegment --no-check-sections -fatal-warnings +# RUN: ld.lld -o %t.so --script %t-lma.script %t.o -shared --no-rosegment --check-sections --no-check-sections -fatal-warnings # Verify that the .sec2 was indeed placed in a PT_LOAD where the PhysAddr # overlaps with where .sec1 is loaded: @@ -42,7 +42,7 @@ # VADDR-OVERLAP-ERR-NEXT: >>> .sec2 range is [0x8020, 0x811F] # Check that the expected binary was created with --noinhibit-exec: -# RUN: ld.lld -o %t.so --script %t-vaddr.script %t.o -shared --noinhibit-exec +# RUN: ld.lld -o %t.so --script %t-vaddr.script %t.o -shared --no-rosegment --noinhibit-exec # RUN: llvm-readelf --sections -l %t.so | FileCheck %s -check-prefix BAD-VADDR # BAD-VADDR-LABEL: Section Headers: # BAD-VADDR: .sec1 PROGBITS 0000000000008000 002000 000100 00 WA 0 0 1 @@ -75,7 +75,7 @@ # BOTH-OVERLAP-ERR-NEXT: >>> .sec1 range is [0x8000, 0x80FF] # BOTH-OVERLAP-ERR-NEXT: >>> .sec2 range is [0x8040, 0x813F] -# RUN: ld.lld -o %t.so --script %t-both-overlap.script %t.o -shared --noinhibit-exec +# RUN: ld.lld -o %t.so --script %t-both-overlap.script %t.o -shared --no-rosegment --noinhibit-exec # Note: In case everything overlaps we create a binary with overlapping 
file # offsets. ld.bfd seems to place .sec1 to file offset 18000 and .sec2 # at 18100 so that only virtual addr and LMA overlap diff --git a/lld/test/ELF/linkerscript/overlay.test b/lld/test/ELF/linkerscript/overlay.test index aaa8619a94fcf..85e140d60ab0c 100644 --- a/lld/test/ELF/linkerscript/overlay.test +++ b/lld/test/ELF/linkerscript/overlay.test @@ -26,5 +26,6 @@ SECTIONS { # CHECK: Program Headers: # CHECK: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flg Align -# CHECK-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000004000 0x000008 0x000008 R E 0x1000 -# CHECK-NEXT: LOAD 0x002000 0x0000000000001000 0x0000000000004008 0x000009 0x000009 R E 0x1000 +# CHECK-NEXT: LOAD 0x001000 0x0000000000001000 0x0000000000004000 0x000008 0x000008 R 0x1000 +# CHECK-NEXT: LOAD 0x002000 0x0000000000001000 0x0000000000004008 0x000004 0x000004 R 0x1000 +# CHECK-NEXT: LOAD 0x002008 0x0000000000001008 0x0000000000001008 0x000001 0x000001 R E 0x1000 diff --git a/lld/test/ELF/linkerscript/repsection-symbol.s b/lld/test/ELF/linkerscript/repsection-symbol.s index 96963eef3abef..d5f61ee82cece 100644 --- a/lld/test/ELF/linkerscript/repsection-symbol.s +++ b/lld/test/ELF/linkerscript/repsection-symbol.s @@ -6,17 +6,17 @@ # RUN: .text : { *(.text) } \ # RUN: .foo : {foo1 = .; *(.foo.*) foo2 = .; *(.bar) foo3 = .;} \ # RUN: }" > %t.script -# RUN: ld.lld --hash-style=sysv -o %t1 --script %t.script %t -shared +# RUN: ld.lld -o %t1 --script %t.script %t -shared # RUN: llvm-readobj --symbols %t1 | FileCheck %s # CHECK: Name: foo1 -# CHECK-NEXT: Value: 0x2C0 +# CHECK-NEXT: Value: 0x330 # CHECK: Name: foo2 -# CHECK-NEXT: Value: 0x2C8 +# CHECK-NEXT: Value: 0x338 # CHECK: Name: foo3 -# CHECK-NEXT: Value: 0x2CC +# CHECK-NEXT: Value: 0x33C .section .foo.1,"a" .long 1 diff --git a/lld/test/ELF/linkerscript/rosegment.test b/lld/test/ELF/linkerscript/rosegment.test index 41479e609d24b..1fece2898d8f3 100644 --- a/lld/test/ELF/linkerscript/rosegment.test +++ b/lld/test/ELF/linkerscript/rosegment.test @@ -1,26 +1,13 @@ # REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t - -# Test that with linker scripts we don't create a RO PT_LOAD. +## Create a readonly PT_LOAD in the absence of PHDRS. 
+# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux /dev/null -o %t # RUN: ld.lld -o %t1 --script %s %t -shared -# RUN: llvm-readobj -l %t1 | FileCheck %s +# RUN: llvm-readelf -l %t1 | FileCheck %s SECTIONS { } -# CHECK-NOT: Type: PT_LOAD - -# CHECK: Type: PT_LOAD -# CHECK: Flags [ -# CHECK-NEXT: PF_R -# CHECK-NEXT: PF_X -# CHECK-NEXT: ] - -# CHECK: Type: PT_LOAD -# CHECK: Flags [ -# CHECK-NEXT: PF_R -# CHECK-NEXT: PF_W -# CHECK-NEXT: ] - -# CHECK-NOT: Type: PT_LOAD +# CHECK: Type {{.*}} Flg Align +# CHECK-NEXT: LOAD {{.*}} R 0x1000 +# CHECK-NEXT: LOAD {{.*}} RW 0x1000 diff --git a/lld/test/ELF/linkerscript/sections-keep.s b/lld/test/ELF/linkerscript/sections-keep.s index 6532f7b93930e..9abc9893bc8cd 100644 --- a/lld/test/ELF/linkerscript/sections-keep.s +++ b/lld/test/ELF/linkerscript/sections-keep.s @@ -44,8 +44,8 @@ # MIXED1-NEXT: Idx Name Size # MIXED1-NEXT: 0 00000000 # MIXED1-NEXT: 1 .keep 00000004 -# MIXED1-NEXT: 2 .temp 00000004 00000000000000ec -# MIXED1-NEXT: 3 .text 00000007 00000000000000f0 +# MIXED1-NEXT: 2 .temp 00000004 0000000000000124 +# MIXED1-NEXT: 3 .text 00000007 0000000000000128 # MIXED1-NEXT: 4 .comment 00000008 0000000000000000 # MIXED1-NEXT: 5 .symtab 00000060 0000000000000000 # MIXED1-NEXT: 6 .shstrtab 00000036 0000000000000000 @@ -64,9 +64,9 @@ # MIXED2: Sections: # MIXED2-NEXT: Idx Name Size # MIXED2-NEXT: 0 00000000 -# MIXED2-NEXT: 1 .nokeep 00000004 00000000000000e8 -# MIXED2-NEXT: 2 .temp 00000004 00000000000000ec -# MIXED2-NEXT: 3 .text 00000007 00000000000000f0 +# MIXED2-NEXT: 1 .nokeep 00000004 0000000000000120 +# MIXED2-NEXT: 2 .temp 00000004 0000000000000124 +# MIXED2-NEXT: 3 .text 00000007 0000000000000128 # MIXED2-NEXT: 4 .comment 00000008 0000000000000000 # MIXED2-NEXT: 5 .symtab 00000060 0000000000000000 # MIXED2-NEXT: 6 .shstrtab 00000038 0000000000000000 @@ -80,7 +80,7 @@ # RUN: ld.lld --gc-sections -o %t --script %t.script %t1.o %t.o # RUN: llvm-objdump -s %t | FileCheck --check-prefix=FILEMATCH %s # FILEMATCH: Contents of section .keep: -# FILEMATCH-NEXT: 00e8 41414141 AAAA +# FILEMATCH-NEXT: 0120 41414141 AAAA .global _start _start: diff --git a/lld/test/ELF/linkerscript/sections-padding.s b/lld/test/ELF/linkerscript/sections-padding.s index 5ec0ddbe767a7..4d147d79c63e6 100644 --- a/lld/test/ELF/linkerscript/sections-padding.s +++ b/lld/test/ELF/linkerscript/sections-padding.s @@ -7,7 +7,7 @@ # RUN: llvm-objdump -s %t.out | FileCheck --check-prefix=YES %s # YES: 66000011 22000011 22000011 22000011 -# RUN: echo "SECTIONS { .mysec : { *(.mysec*) } =0x1100+0x22 }" > %t.script +# RUN: echo "SECTIONS { .mysec : { *(.mysec*) } =(0x1100+0x22) }" > %t.script # RUN: ld.lld -o %t.out --script %t.script %t # RUN: llvm-objdump -s %t.out | FileCheck --check-prefix=YES2 %s # YES2: 66000011 22000011 22000011 22000011 @@ -66,6 +66,11 @@ # RUN: not ld.lld -o /dev/null %t --script %t.script 2>&1 | FileCheck --check-prefix=ERR4 %s # ERR4: symbol not found: foo +## Check we are able to parse scripts where "/DISCARD/" follows a section fill expression. 
+# RUN: echo "SECTIONS { .mysec : { *(.mysec*) } =0x1122 /DISCARD/ : { *(.text) } }" > %t.script +# RUN: ld.lld -o %t.out --script %t.script %t +# RUN: llvm-objdump -s %t.out | FileCheck --check-prefix=YES %s + .section .mysec.1,"a" .align 16 .byte 0x66 diff --git a/lld/test/ELF/linkerscript/sizeofheaders.s b/lld/test/ELF/linkerscript/sizeofheaders.s index 3293c4ea34e4e..c01ab23dca818 100644 --- a/lld/test/ELF/linkerscript/sizeofheaders.s +++ b/lld/test/ELF/linkerscript/sizeofheaders.s @@ -9,8 +9,8 @@ # RUN: llvm-readelf -s %t | FileCheck %s # CHECK: Value Size Type Bind Vis Ndx Name -# CHECK: 00000000000000e8 0 NOTYPE GLOBAL DEFAULT 1 _start -# CHECK-NEXT: 00000000000000e8 0 NOTYPE GLOBAL DEFAULT ABS _size +# CHECK: 0000000000000120 0 NOTYPE GLOBAL DEFAULT 1 _start +# CHECK-NEXT: 0000000000000120 0 NOTYPE GLOBAL DEFAULT ABS _size .global _start _start: diff --git a/lld/test/ELF/linkerscript/symbol-assign-many-passes2.test b/lld/test/ELF/linkerscript/symbol-assign-many-passes2.test index 973a4881850e4..18dc5019ee1eb 100644 --- a/lld/test/ELF/linkerscript/symbol-assign-many-passes2.test +++ b/lld/test/ELF/linkerscript/symbol-assign-many-passes2.test @@ -22,7 +22,7 @@ SECTIONS { b = c + 1; c = d + 1; d = e + 1; - *(.text); + *(.text*); } e = .; } diff --git a/lld/test/ELF/linkerscript/symbol-conflict.s b/lld/test/ELF/linkerscript/symbol-conflict.s deleted file mode 100644 index dc922b65dac83..0000000000000 --- a/lld/test/ELF/linkerscript/symbol-conflict.s +++ /dev/null @@ -1,11 +0,0 @@ -# REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t - -# RUN: echo "SECTIONS { . = SIZEOF_HEADERS; .text : {*(.text.*)} end = .;}" > %t.script -# RUN: ld.lld -o %t1 --script %t.script %t -# RUN: llvm-nm %t1 | FileCheck %s -# CHECK: 00000000000000e9 T end - -.global _start -_start: - nop diff --git a/lld/test/ELF/linkerscript/synthetic-symbols1.test b/lld/test/ELF/linkerscript/synthetic-symbols1.test index 36b526534da17..f32248b405924 100644 --- a/lld/test/ELF/linkerscript/synthetic-symbols1.test +++ b/lld/test/ELF/linkerscript/synthetic-symbols1.test @@ -38,19 +38,19 @@ SECTIONS { .eh_frame : {} } -# CHECK: 0000000000000128 l .foo 0000000000000000 .hidden _end_sec -# CHECK-NEXT: 0000000000000120 g .foo 0000000000000000 _begin_sec -# CHECK-NEXT: 0000000000000128 g *ABS* 0000000000000000 _end_sec_abs +# CHECK: 0000000000000160 l .foo 0000000000000000 .hidden _end_sec +# CHECK-NEXT: 0000000000000158 g .foo 0000000000000000 _begin_sec +# CHECK-NEXT: 0000000000000160 g *ABS* 0000000000000000 _end_sec_abs # CHECK-NEXT: 000000000000104c g .text 0000000000000000 _start -# CHECK-NEXT: 0000000000000120 g .foo 0000000000000000 begin_foo -# CHECK-NEXT: 0000000000000128 g .foo 0000000000000000 end_foo +# CHECK-NEXT: 0000000000000158 g .foo 0000000000000000 begin_foo +# CHECK-NEXT: 0000000000000160 g .foo 0000000000000000 end_foo # CHECK-NEXT: 0000000000000008 g *ABS* 0000000000000000 size_foo_1 # CHECK-NEXT: 0000000000000008 g *ABS* 0000000000000000 size_foo_1_abs # CHECK-NEXT: 0000000000001000 g .foo 0000000000000000 begin_bar # CHECK-NEXT: 0000000000001004 g .foo 0000000000000000 end_bar -# CHECK-NEXT: 0000000000000ee4 g *ABS* 0000000000000000 size_foo_2 -# CHECK-NEXT: 0000000000000ee4 g *ABS* 0000000000000000 size_foo_3 +# CHECK-NEXT: 0000000000000eac g *ABS* 0000000000000000 size_foo_2 +# CHECK-NEXT: 0000000000000eac g *ABS* 0000000000000000 size_foo_3 # CHECK-NEXT: 0000000000001004 g .eh_frame_hdr 0000000000000000 __eh_frame_hdr_start -# CHECK-NEXT: 0000000000001010 g *ABS* 
0000000000000000 __eh_frame_hdr_start2 +# CHECK-NEXT: 0000000000001010 g *ABS* 0000000000000000 __eh_frame_hdr_start2 # CHECK-NEXT: 0000000000001018 g .eh_frame_hdr 0000000000000000 __eh_frame_hdr_end -# CHECK-NEXT: 0000000000001020 g *ABS* 0000000000000000 __eh_frame_hdr_end2 +# CHECK-NEXT: 0000000000001020 g *ABS* 0000000000000000 __eh_frame_hdr_end2 diff --git a/lld/test/ELF/many-alloc-sections.s b/lld/test/ELF/many-alloc-sections.s index 12c7a5ce3bc11..3b42dad87a90c 100644 --- a/lld/test/ELF/many-alloc-sections.s +++ b/lld/test/ELF/many-alloc-sections.s @@ -6,7 +6,7 @@ // Test that _start is in the correct section. // CHECK: Name: _start -// CHECK-NEXT: Value: 0x120 +// CHECK-NEXT: Value: // CHECK-NEXT: Size: 0 // CHECK-NEXT: Binding: Global // CHECK-NEXT: Type: None diff --git a/lld/test/ELF/mips-npic-call-pic-script.s b/lld/test/ELF/mips-npic-call-pic-script.s index 0ce1bfe947795..041c62101f7ff 100644 --- a/lld/test/ELF/mips-npic-call-pic-script.s +++ b/lld/test/ELF/mips-npic-call-pic-script.s @@ -87,13 +87,13 @@ __start: # ORPH1: Disassembly of section .text: # ORPH1-EMPTY: # ORPH1-NEXT: <__start>: -# ORPH1-NEXT: 20000: jal 131156 <__LA25Thunk_foo1a> +# ORPH1-NEXT: 20000: jal 131168 <__LA25Thunk_foo1a> # ORPH1-NEXT: 20004: nop -# ORPH1-NEXT: 20008: jal 131208 <__LA25Thunk_foo2> +# ORPH1-NEXT: 20008: jal 131216 <__LA25Thunk_foo2> # ORPH1-NEXT: 2000c: nop -# ORPH1-NEXT: 20010: jal 131172 <__LA25Thunk_foo1b> +# ORPH1-NEXT: 20010: jal 131184 <__LA25Thunk_foo1b> # ORPH1-NEXT: 20014: nop -# ORPH1-NEXT: 20018: jal 131208 <__LA25Thunk_foo2> +# ORPH1-NEXT: 20018: jal 131216 <__LA25Thunk_foo2> # ORPH1-NEXT: 2001c: nop # ORPH1-NEXT: 20020: jal 131120 <__LA25Thunk_fpic> # ORPH1-NEXT: 20024: nop @@ -113,16 +113,16 @@ __start: # ORPH1-NEXT: 20050: nop # ORPH1: <__LA25Thunk_foo1a>: -# ORPH1-NEXT: 20054: lui $25, 2 -# ORPH1-NEXT: 20058: j 131200 -# ORPH1-NEXT: 2005c: addiu $25, $25, 128 -# ORPH1-NEXT: 20060: nop +# ORPH1-NEXT: 20060: lui $25, 2 +# ORPH1-NEXT: j 131200 +# ORPH1-NEXT: addiu $25, $25, 128 +# ORPH1-NEXT: nop # ORPH1: <__LA25Thunk_foo1b>: -# ORPH1-NEXT: 20064: lui $25, 2 -# ORPH1-NEXT: 20068: j 131204 -# ORPH1-NEXT: 2006c: addiu $25, $25, 132 -# ORPH1-NEXT: 20070: nop +# ORPH1-NEXT: 20070: lui $25, 2 +# ORPH1-NEXT: j 131204 +# ORPH1-NEXT: addiu $25, $25, 132 +# ORPH1-NEXT: nop # ORPH1: : # ORPH1-NEXT: 20080: nop @@ -131,17 +131,17 @@ __start: # ORPH1-NEXT: 20084: nop # ORPH1: <__LA25Thunk_foo2>: -# ORPH1-NEXT: 20088: lui $25, 2 -# ORPH1-NEXT: 2008c: j 131232 -# ORPH1-NEXT: 20090: addiu $25, $25, 160 -# ORPH1-NEXT: 20094: nop +# ORPH1-NEXT: 20090: lui $25, 2 +# ORPH1-NEXT: j 131232 +# ORPH1-NEXT: addiu $25, $25, 160 +# ORPH1-NEXT: nop # ORPH1: : # ORPH1-NEXT: 200a0: nop # Test script with orphans added to new OutputSection, the .text.1 and # .text.2 sections will form a new OutputSection .text -# RUN: echo "SECTIONS { .out 0x20000 : { *(.text) } }" > %t3.script +# RUN: echo "SECTIONS { .out 0x20000 : { *(.text) } .text : {*(.text*)} }" > %t3.script # RUN: ld.lld --script %t3.script %t-npic.o %t-pic.o %t-sto-pic.o -o %t3.exe # RUN: llvm-objdump -d --no-show-raw-insn %t3.exe | FileCheck --check-prefix=ORPH2 %s diff --git a/lld/test/ELF/shuffle-sections-init-fini.s b/lld/test/ELF/shuffle-sections-init-fini.s index 31d87bb32444b..d98ca8d359de8 100644 --- a/lld/test/ELF/shuffle-sections-init-fini.s +++ b/lld/test/ELF/shuffle-sections-init-fini.s @@ -30,7 +30,9 @@ ## With a SECTIONS command, SHT_INIT_ARRAY prirotities are ignored. ## All .init_array* are shuffled together. 
-# RUN: echo 'SECTIONS {}' > %t.script +# RUN: echo 'SECTIONS { \ +# RUN: .init_array : { *(.init_array*) } \ +# RUN: .fini_array : { *(.fini_array*) }}' > %t.script # RUN: ld.lld -T %t.script %t.o -o %t2 # RUN: llvm-readelf -x .init -x .fini -x .init_array -x .fini_array %t2 | \ # RUN: FileCheck --check-prefixes=CHECK2,ORDERED2 %s diff --git a/lld/test/ELF/text-section-prefix.s b/lld/test/ELF/text-section-prefix.s index e39536da387d6..022b4167037d6 100644 --- a/lld/test/ELF/text-section-prefix.s +++ b/lld/test/ELF/text-section-prefix.s @@ -1,39 +1,53 @@ # REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t -# RUN: ld.lld -z keep-text-section-prefix %t -o %t2 -# RUN: llvm-readelf -l %t2 | FileCheck %s -# RUN: ld.lld %t -o %t3 -# RUN: llvm-readelf -l %t3 | FileCheck --check-prefix=CHECKNO %s -# RUN: ld.lld -z nokeep-text-section-prefix %t -o %t4 -# RUN: llvm-readelf -l %t4 | FileCheck --check-prefix=CHECKNO %s - -# CHECK: .text -# CHECK: .text.hot -# CHECK: .text.startup -# CHECK: .text.exit -# CHECK: .text.unlikely -# CHECKNO: .text -# CHECKNO-NOT: .text.hot +## -z keep-text-section-prefix separates text sections with prefix .text.hot, +## .text.unlikely, .text.startup, or .text.exit, in the absence of a SECTIONS command. +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o +# RUN: ld.lld %t.o -o %t1 +# RUN: llvm-readelf -S %t1 | FileCheck --check-prefix=NOKEEP %s +# RUN: ld.lld -z nokeep-text-section-prefix %t.o -o %t2 +# RUN: cmp %t1 %t2 + +# RUN: ld.lld -z keep-text-section-prefix %t.o -o %t.keep +# RUN: llvm-readelf -S %t.keep | FileCheck --check-prefix=KEEP %s + +# KEEP: [ 1] .text +# KEEP-NEXT: [ 2] .text.hot +# KEEP-NEXT: [ 3] .text.startup +# KEEP-NEXT: [ 4] .text.exit +# KEEP-NEXT: [ 5] .text.unlikely + +# NOKEEP: [ 1] .text +# NOKEEP-NOT: .text + +## With a SECTIONS command, orphan sections are created verbatim. +## No grouping is performed for them. 
+# RUN: echo 'SECTIONS {}' > %t.lds +# RUN: ld.lld -T %t.lds -z keep-text-section-prefix %t.o -o %t.script +# RUN: llvm-readelf -S %t.script | FileCheck --check-prefix=SCRIPT %s + +# SCRIPT: .text +# SCRIPT-NEXT: .text.f +# SCRIPT-NEXT: .text.hot.f_hot +# SCRIPT-NEXT: .text.startup.f_startup +# SCRIPT-NEXT: .text.exit.f_exit +# SCRIPT-NEXT: .text.unlikely.f_unlikely + +.globl _start _start: ret .section .text.f,"ax" -f: nop .section .text.hot.f_hot,"ax" -f_hot: nop .section .text.startup.f_startup,"ax" -f_startup: nop .section .text.exit.f_exit,"ax" -f_exit: nop .section .text.unlikely.f_unlikely,"ax" -f_unlikely: nop diff --git a/lld/tools/lld/CMakeLists.txt b/lld/tools/lld/CMakeLists.txt index a15e296e31dfa..a37c2c702bd5c 100644 --- a/lld/tools/lld/CMakeLists.txt +++ b/lld/tools/lld/CMakeLists.txt @@ -4,7 +4,11 @@ set(LLVM_LINK_COMPONENTS add_lld_tool(lld lld.cpp + + ENABLE_PLUGINS + SUPPORT_PLUGINS ) +export_executable_symbols_for_plugins(lld) target_link_libraries(lld PRIVATE diff --git a/lld/tools/lld/lld.cpp b/lld/tools/lld/lld.cpp index 72ff758164d89..646fc3d6468eb 100644 --- a/lld/tools/lld/lld.cpp +++ b/lld/tools/lld/lld.cpp @@ -36,6 +36,7 @@ #include "llvm/Support/Host.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/Path.h" +#include "llvm/Support/PluginLoader.h" #include using namespace lld; diff --git a/lldb/bindings/headers.swig b/lldb/bindings/headers.swig index ddd3964beb48a..7371e1a3873b3 100644 --- a/lldb/bindings/headers.swig +++ b/lldb/bindings/headers.swig @@ -21,6 +21,7 @@ #include "lldb/API/SBData.h" #include "lldb/API/SBDebugger.h" #include "lldb/API/SBDeclaration.h" +#include "lldb/API/SBEnvironment.h" #include "lldb/API/SBError.h" #include "lldb/API/SBEvent.h" #include "lldb/API/SBExecutionContext.h" diff --git a/lldb/bindings/interface/SBEnvironment.i b/lldb/bindings/interface/SBEnvironment.i new file mode 100644 index 0000000000000..4ca22fc314d22 --- /dev/null +++ b/lldb/bindings/interface/SBEnvironment.i @@ -0,0 +1,48 @@ +//===-- SWIG Interface for SBEnvironment-------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +namespace lldb { + +%feature("docstring", +"Represents the environment of a certain process. 
+ +Example: + for entry in lldb.debugger.GetSelectedTarget().GetEnvironment().GetEntries(): + print(entry) + +") SBEnvironment; +class SBEnvironment { +public: + SBEnvironment (); + + SBEnvironment (const lldb::SBEnvironment &rhs); + + ~SBEnvironment(); + + size_t GetNumValues(); + + const char *Get(const char *name); + + const char *GetNameAtIndex(size_t index); + + const char *GetValueAtIndex(size_t index); + + SBStringList GetEntries(); + + void PutEntry(const char *name_and_value); + + void SetEntries(const SBStringList &entries, bool append); + + bool Set(const char *name, const char *value, bool overwrite); + + bool Unset(const char *name); + + void Clear(); +}; + +} // namespace lldb diff --git a/lldb/bindings/interface/SBLaunchInfo.i b/lldb/bindings/interface/SBLaunchInfo.i index e76950c6fb482..1de89b58b272b 100644 --- a/lldb/bindings/interface/SBLaunchInfo.i +++ b/lldb/bindings/interface/SBLaunchInfo.i @@ -64,6 +64,12 @@ public: void SetEnvironmentEntries (const char **envp, bool append); + void + SetEnvironment(const SBEnvironment &env, bool append); + + SBEnvironment + GetEnvironment(); + void Clear (); diff --git a/lldb/bindings/interface/SBPlatform.i b/lldb/bindings/interface/SBPlatform.i index 1f52edb0232c3..81945222c059a 100644 --- a/lldb/bindings/interface/SBPlatform.i +++ b/lldb/bindings/interface/SBPlatform.i @@ -194,6 +194,9 @@ public: lldb::SBUnixSignals GetUnixSignals(); + lldb::SBEnvironment + GetEnvironment(); + }; } // namespace lldb diff --git a/lldb/bindings/interface/SBTarget.i b/lldb/bindings/interface/SBTarget.i index 371bf5c35ebd0..57b5ccea6399e 100644 --- a/lldb/bindings/interface/SBTarget.i +++ b/lldb/bindings/interface/SBTarget.i @@ -677,6 +677,9 @@ public: lldb::SBBreakpoint BreakpointCreateByAddress (addr_t address); + lldb::SBEnvironment + GetEnvironment(); + lldb::SBBreakpoint BreakpointCreateBySBAddress (SBAddress &sb_address); diff --git a/lldb/bindings/interfaces.swig b/lldb/bindings/interfaces.swig index 780fe34392ff5..e906bb9e56569 100644 --- a/lldb/bindings/interfaces.swig +++ b/lldb/bindings/interfaces.swig @@ -29,6 +29,7 @@ %include "./interface/SBDebugger.i" %include "./interface/SBDeclaration.i" %include "./interface/SBError.i" +%include "./interface/SBEnvironment.i" %include "./interface/SBEvent.i" %include "./interface/SBExecutionContext.i" %include "./interface/SBExpressionOptions.i" diff --git a/lldb/include/lldb/API/LLDB.h b/lldb/include/lldb/API/LLDB.h index 12f3b8f32f57e..83c38d3b61664 100644 --- a/lldb/include/lldb/API/LLDB.h +++ b/lldb/include/lldb/API/LLDB.h @@ -24,6 +24,7 @@ #include "lldb/API/SBDebugger.h" #include "lldb/API/SBDeclaration.h" #include "lldb/API/SBDefines.h" +#include "lldb/API/SBEnvironment.h" #include "lldb/API/SBError.h" #include "lldb/API/SBEvent.h" #include "lldb/API/SBExecutionContext.h" diff --git a/lldb/include/lldb/API/SBDefines.h b/lldb/include/lldb/API/SBDefines.h index 474692c8c78d3..0ddf594e5cb50 100644 --- a/lldb/include/lldb/API/SBDefines.h +++ b/lldb/include/lldb/API/SBDefines.h @@ -35,6 +35,7 @@ class LLDB_API SBCompileUnit; class LLDB_API SBData; class LLDB_API SBDebugger; class LLDB_API SBDeclaration; +class LLDB_API SBEnvironment; class LLDB_API SBError; class LLDB_API SBEvent; class LLDB_API SBEventList; diff --git a/lldb/include/lldb/API/SBEnvironment.h b/lldb/include/lldb/API/SBEnvironment.h new file mode 100644 index 0000000000000..f40ee01a42ab9 --- /dev/null +++ b/lldb/include/lldb/API/SBEnvironment.h @@ -0,0 +1,137 @@ +//===-- SBEnvironment.h -----------------------------------------*- C++ 
-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_API_SBENVIRONMENT_H +#define LLDB_API_SBENVIRONMENT_H + +#include "lldb/API/SBDefines.h" + +namespace lldb { + +class LLDB_API SBEnvironment { +public: + SBEnvironment(); + + SBEnvironment(const lldb::SBEnvironment &rhs); + + ~SBEnvironment(); + + const lldb::SBEnvironment &operator=(const lldb::SBEnvironment &rhs); + + /// Return the value of a given environment variable. + /// + /// \param [in] name + /// The name of the environment variable. + /// + /// \return + /// The value of the environment variable or null if not present. + /// If the environment variable has no value but is present, a valid + /// pointer to an empty string will be returned. + const char *Get(const char *name); + + /// \return + /// The number of environment variables. + size_t GetNumValues(); + + /// Return the name of the environment variable at a given index from the + /// internal list of environment variables. + /// + /// \param [in] index + /// The index of the environment variable in the internal list. + /// + /// \return + /// The name at the given index or null if the index is invalid. + const char *GetNameAtIndex(size_t index); + + /// Return the value of the environment variable at a given index from the + /// internal list of environment variables. + /// + /// \param [in] index + /// The index of the environment variable in the internal list. + /// + /// \return + /// The value at the given index or null if the index is invalid. + /// If the environment variable has no value but is present, a valid + /// pointer to an empty string will be returned. + const char *GetValueAtIndex(size_t index); + + /// Return all environment variables contained in this object. Each variable + /// is returned as a string with the following format + /// name=value + /// + /// \return + /// Return an lldb::SBStringList object with the environment variables. + SBStringList GetEntries(); + + /// Add or replace an existing environment variable. The input must be a + /// string with the format + /// name=value + /// + /// \param [in] name_and_value + /// The entry to set which conforms to the format mentioned above. + void PutEntry(const char *name_and_value); + + /// Update this object with the given environment variables. The input is a + /// list of entries with the same format required by SBEnvironment::PutEntry. + /// + /// If append is false, the provided environment will replace the existing + /// environment. Otherwise, existing values will be updated of left untouched + /// accordingly. + /// + /// \param [in] entries + /// The environment variable entries. + /// + /// \param [in] append + /// Flag that controls whether to replace the existing environment. + void SetEntries(const SBStringList &entries, bool append); + + /// Set the value of a given environment variable. + /// If the variable exists, its value is updated only if overwrite is true. + /// + /// \param [in] name + /// The name of the environment variable to set. + /// + /// \param [in] value + /// The value of the environment variable to set. + /// + /// \param [in] overwrite + /// Flag that indicates whether to overwrite an existing environment + /// variable. + /// + /// \return + /// Return whether the variable was added or modified. 
+ bool Set(const char *name, const char *value, bool overwrite); + + /// Unset an environment variable if exists. + /// + /// \param [in] name + /// The name of the environment variable to unset. + /// + /// \return + /// Return whether a variable was actually unset. + bool Unset(const char *name); + + /// Delete all the environment variables. + void Clear(); + +protected: + friend class SBPlatform; + friend class SBTarget; + friend class SBLaunchInfo; + + SBEnvironment(lldb_private::Environment rhs); + + lldb_private::Environment &ref() const; + +private: + std::unique_ptr m_opaque_up; +}; + +} // namespace lldb + +#endif // LLDB_API_SBENVIRONMENT_H diff --git a/lldb/include/lldb/API/SBLaunchInfo.h b/lldb/include/lldb/API/SBLaunchInfo.h index 883f17c0a57c4..04ebb5707688d 100644 --- a/lldb/include/lldb/API/SBLaunchInfo.h +++ b/lldb/include/lldb/API/SBLaunchInfo.h @@ -94,8 +94,41 @@ class LLDB_API SBLaunchInfo { const char *GetEnvironmentEntryAtIndex(uint32_t idx); + /// Update this object with the given environment variables. + /// + /// If append is false, the provided environment will replace the existing + /// environment. Otherwise, existing values will be updated of left untouched + /// accordingly. + /// + /// \param [in] envp + /// The new environment variables as a list of strings with the following + /// format + /// name=value + /// + /// \param [in] append + /// Flag that controls whether to replace the existing environment. void SetEnvironmentEntries(const char **envp, bool append); + /// Update this object with the given environment variables. + /// + /// If append is false, the provided environment will replace the existing + /// environment. Otherwise, existing values will be updated of left untouched + /// accordingly. + /// + /// \param [in] env + /// The new environment variables. + /// + /// \param [in] append + /// Flag that controls whether to replace the existing environment. + void SetEnvironment(const SBEnvironment &env, bool append); + + /// Return the environment variables of this object. + /// + /// \return + /// An lldb::SBEnvironment object which is a copy of the SBLaunchInfo's + /// environment. + SBEnvironment GetEnvironment(); + void Clear(); const char *GetWorkingDirectory() const; diff --git a/lldb/include/lldb/API/SBPlatform.h b/lldb/include/lldb/API/SBPlatform.h index 7fac182a0dd1a..4d251b1299546 100644 --- a/lldb/include/lldb/API/SBPlatform.h +++ b/lldb/include/lldb/API/SBPlatform.h @@ -154,6 +154,14 @@ class LLDB_API SBPlatform { SBUnixSignals GetUnixSignals() const; + /// Return the environment variables of the remote platform connection + /// process. + /// + /// \return + /// An lldb::SBEnvironment object which is a copy of the platform's + /// environment. + SBEnvironment GetEnvironment(); + protected: friend class SBDebugger; friend class SBTarget; diff --git a/lldb/include/lldb/API/SBTarget.h b/lldb/include/lldb/API/SBTarget.h index a50e791d4fe3c..fad842c9cb1cb 100644 --- a/lldb/include/lldb/API/SBTarget.h +++ b/lldb/include/lldb/API/SBTarget.h @@ -94,6 +94,15 @@ class LLDB_API SBTarget { /// A platform object. lldb::SBPlatform GetPlatform(); + /// Return the environment variables that would be used to launch a new + /// process. + /// + /// \return + /// An lldb::SBEnvironment object which is a copy of the target's + /// environment. + + SBEnvironment GetEnvironment(); + /// Install any binaries that need to be installed. /// /// This function does nothing when debugging on the host system. 
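The SBEnvironment declarations above, together with SBLaunchInfo::SetEnvironment/GetEnvironment, SBPlatform::GetEnvironment and SBTarget::GetEnvironment, are enough to sketch how the new API is meant to be driven from the Python bindings added in this patch. The following is an illustrative sketch only: it assumes it runs inside lldb's embedded script interpreter (where lldb.debugger and a selected target exist, as in the SBEnvironment.i docstring), the variable names and values are invented for the example, and SBTarget.GetLaunchInfo/SetLaunchInfo are pre-existing accessors rather than part of this patch.

import lldb

target = lldb.debugger.GetSelectedTarget()

# A copy of the environment the target would use when launching a process
# (target.env-vars, target.inherit-env and target.unset-env-vars applied).
env = target.GetEnvironment()
for i in range(env.GetNumValues()):
    print("%s=%s" % (env.GetNameAtIndex(i), env.GetValueAtIndex(i)))

# Adjust the copy and hand it back through the launch info.
env.Set("DEBUG_MODE", "1", True)        # overwrite an existing value if present
env.PutEntry("EXTRA_FLAGS=--verbose")   # "name=value" form, added or replaced
env.Unset("LD_PRELOAD")                 # returns False if the name was not set

launch_info = target.GetLaunchInfo()
launch_info.SetEnvironment(env, False)  # append=False replaces the existing environment
target.SetLaunchInfo(launch_info)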
@@ -127,7 +136,9 @@ class LLDB_API SBTarget { /// The argument array. /// /// \param[in] envp - /// The environment array. + /// The environment array. If this is null, the default + /// environment values (provided through `settings set + /// target.env-vars`) will be used. /// /// \param[in] stdin_path /// The path to use when re-directing the STDIN of the new @@ -175,7 +186,9 @@ class LLDB_API SBTarget { /// The argument array. /// /// \param[in] envp - /// The environment array. + /// The environment array. If this isn't provided, the default + /// environment values (provided through `settings set + /// target.env-vars`) will be used. /// /// \param[in] working_directory /// The working directory to have the child process run in diff --git a/lldb/include/lldb/Core/PropertiesBase.td b/lldb/include/lldb/Core/PropertiesBase.td index 6e95ceb779bae..1be3b908ed410 100644 --- a/lldb/include/lldb/Core/PropertiesBase.td +++ b/lldb/include/lldb/Core/PropertiesBase.td @@ -49,3 +49,9 @@ class DefaultUnsignedValue { class EnumValues { string EnumValues = enum; } + +// Determines the element type for arrays and dictionaries. +class ElementType { + string ElementType = value; + bit HasElementType = 1; +} diff --git a/lldb/include/lldb/DataFormatters/FormatCache.h b/lldb/include/lldb/DataFormatters/FormatCache.h index 581744c04f792..e75aaee1a7bb8 100644 --- a/lldb/include/lldb/DataFormatters/FormatCache.h +++ b/lldb/include/lldb/DataFormatters/FormatCache.h @@ -49,13 +49,13 @@ class FormatCache { CacheMap m_map; std::recursive_mutex m_mutex; - uint64_t m_cache_hits; - uint64_t m_cache_misses; + uint64_t m_cache_hits = 0; + uint64_t m_cache_misses = 0; Entry &GetEntry(ConstString type); public: - FormatCache(); + FormatCache() = default; template bool Get(ConstString type, ImplSP &format_impl_sp); void Set(ConstString type, lldb::TypeFormatImplSP &format_sp); diff --git a/lldb/include/lldb/DataFormatters/FormattersHelpers.h b/lldb/include/lldb/DataFormatters/FormattersHelpers.h index 93642e57fde08..a5b0da57e5d8b 100644 --- a/lldb/include/lldb/DataFormatters/FormattersHelpers.h +++ b/lldb/include/lldb/DataFormatters/FormattersHelpers.h @@ -56,6 +56,8 @@ size_t ExtractIndexFromString(const char *item_name); lldb::addr_t GetArrayAddressOrPointerValue(ValueObject &valobj); +lldb::ValueObjectSP GetValueOfLibCXXCompressedPair(ValueObject &pair); + time_t GetOSXEpoch(); struct InferiorSizedWord { diff --git a/lldb/include/lldb/DataFormatters/StringPrinter.h b/lldb/include/lldb/DataFormatters/StringPrinter.h index 6f8869cc2a1e3..5842cde893d89 100644 --- a/lldb/include/lldb/DataFormatters/StringPrinter.h +++ b/lldb/include/lldb/DataFormatters/StringPrinter.h @@ -115,9 +115,15 @@ class StringPrinter { lldb::ProcessSP GetProcessSP() const { return m_process_sp; } + void SetHasSourceSize(bool e) { m_has_source_size = e; } + + bool HasSourceSize() const { return m_has_source_size; } + private: uint64_t m_location = 0; lldb::ProcessSP m_process_sp; + /// True iff we know the source size of the string. 
+ bool m_has_source_size = false; }; class ReadBufferAndDumpToStreamOptions : public DumpToStreamOptions { diff --git a/lldb/include/lldb/Expression/ExpressionVariable.h b/lldb/include/lldb/Expression/ExpressionVariable.h index c523176e003fd..60062d212badf 100644 --- a/lldb/include/lldb/Expression/ExpressionVariable.h +++ b/lldb/include/lldb/Expression/ExpressionVariable.h @@ -221,11 +221,7 @@ class PersistentExpressionState : public ExpressionVariableList { uint32_t addr_byte_size) = 0; /// Return a new persistent variable name with the specified prefix. - ConstString GetNextPersistentVariableName(Target &target, - llvm::StringRef prefix); - - virtual llvm::StringRef - GetPersistentVariablePrefix(bool is_error = false) const = 0; + virtual ConstString GetNextPersistentVariableName(bool is_error = false) = 0; virtual void RemovePersistentVariable(lldb::ExpressionVariableSP variable) = 0; @@ -237,6 +233,10 @@ class PersistentExpressionState : public ExpressionVariableList { void RegisterExecutionUnit(lldb::IRExecutionUnitSP &execution_unit_sp); +protected: + virtual llvm::StringRef + GetPersistentVariablePrefix(bool is_error = false) const = 0; + private: LLVMCastKind m_kind; diff --git a/lldb/include/lldb/Host/FileSystem.h b/lldb/include/lldb/Host/FileSystem.h index 565d1f24e456c..8dcff34025929 100644 --- a/lldb/include/lldb/Host/FileSystem.h +++ b/lldb/include/lldb/Host/FileSystem.h @@ -186,8 +186,10 @@ class FileSystem { return m_fs; } + void Collect(const FileSpec &file_spec); + void Collect(const llvm::Twine &file); + private: - void AddFile(const llvm::Twine &file); static llvm::Optional &InstanceImpl(); llvm::IntrusiveRefCntPtr m_fs; std::shared_ptr m_collector; diff --git a/lldb/include/lldb/Symbol/Function.h b/lldb/include/lldb/Symbol/Function.h index 0db9a5116d25f..40d316fa78eb1 100644 --- a/lldb/include/lldb/Symbol/Function.h +++ b/lldb/include/lldb/Symbol/Function.h @@ -284,19 +284,33 @@ class CallEdge { /// Like \ref GetReturnPCAddress, but returns an unresolved file address. lldb::addr_t GetUnresolvedReturnPCAddress() const { return return_pc; } + /// Get the load PC address of the call instruction (or LLDB_INVALID_ADDRESS). + lldb::addr_t GetCallInstPC(Function &caller, Target &target) const; + /// Get the call site parameters available at this call edge. llvm::ArrayRef GetCallSiteParameters() const { return parameters; } protected: - CallEdge(lldb::addr_t return_pc, CallSiteParameterArray &¶meters) - : return_pc(return_pc), parameters(std::move(parameters)) {} + CallEdge(lldb::addr_t return_pc, lldb::addr_t call_inst_pc, + CallSiteParameterArray &¶meters) + : return_pc(return_pc), call_inst_pc(call_inst_pc), + parameters(std::move(parameters)) {} + + /// Helper that finds the load address of \p unresolved_pc, a file address + /// which refers to an instruction within \p caller. + static lldb::addr_t GetLoadAddress(lldb::addr_t unresolved_pc, + Function &caller, Target &target); /// An invalid address if this is a tail call. Otherwise, the return PC for /// the call. Note that this is a file address which must be resolved. lldb::addr_t return_pc; + /// The address of the call instruction. Usually an invalid address, unless + /// this is a tail call. + lldb::addr_t call_inst_pc; + CallSiteParameterArray parameters; }; @@ -308,8 +322,8 @@ class DirectCallEdge : public CallEdge { /// Construct a call edge using a symbol name to identify the callee, and a /// return PC within the calling function to identify a specific call site. 
DirectCallEdge(const char *symbol_name, lldb::addr_t return_pc, - CallSiteParameterArray &¶meters) - : CallEdge(return_pc, std::move(parameters)) { + lldb::addr_t call_inst_pc, CallSiteParameterArray &¶meters) + : CallEdge(return_pc, call_inst_pc, std::move(parameters)) { lazy_callee.symbol_name = symbol_name; } @@ -339,8 +353,9 @@ class IndirectCallEdge : public CallEdge { /// Construct a call edge using a DWARFExpression to identify the callee, and /// a return PC within the calling function to identify a specific call site. IndirectCallEdge(DWARFExpression call_target, lldb::addr_t return_pc, + lldb::addr_t call_inst_pc, CallSiteParameterArray &¶meters) - : CallEdge(return_pc, std::move(parameters)), + : CallEdge(return_pc, call_inst_pc, std::move(parameters)), call_target(std::move(call_target)) {} Function *GetCallee(ModuleList &images, ExecutionContext &exe_ctx) override; diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h index 3a8570c0d6305..cc74fe0f3d748 100644 --- a/lldb/include/lldb/Target/Target.h +++ b/lldb/include/lldb/Target/Target.h @@ -211,6 +211,8 @@ class TargetProperties : public Properties { bool GetAutoInstallMainExecutable() const; + void UpdateLaunchInfoFromProperties(); + private: // Callbacks for m_launch_info. void Arg0ValueChangedCallback(); @@ -223,9 +225,12 @@ class TargetProperties : public Properties { void DisableASLRValueChangedCallback(); void DisableSTDIOValueChangedCallback(); + Environment ComputeEnvironment() const; + // Member variables. ProcessLaunchInfo m_launch_info; std::unique_ptr m_experimental_properties_up; + Target *m_target; }; class EvaluateExpressionOptions { @@ -1095,11 +1100,6 @@ class Target : public std::enable_shared_from_this, lldb::ExpressionVariableSP GetPersistentVariable(ConstString name); - /// Return the next available number for numbered persistent variables. 
- unsigned GetNextPersistentVariableIndex() { - return m_next_persistent_variable_index++; - } - lldb::addr_t GetPersistentSymbol(ConstString name); /// This method will return the address of the starting function for diff --git a/lldb/include/lldb/Utility/Environment.h b/lldb/include/lldb/Utility/Environment.h index 331eab9f7f0b4..e2af2eb2463d7 100644 --- a/lldb/include/lldb/Utility/Environment.h +++ b/lldb/include/lldb/Utility/Environment.h @@ -50,6 +50,7 @@ class Environment : private llvm::StringMap { using Base::erase; using Base::find; using Base::insert; + using Base::insert_or_assign; using Base::lookup; using Base::size; using Base::try_emplace; diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h index 6b22e50a553d4..4fd2a07dd6165 100644 --- a/lldb/include/lldb/lldb-forward.h +++ b/lldb/include/lldb/lldb-forward.h @@ -76,6 +76,7 @@ class DynamicCheckerFunctions; class DynamicLoader; class Editline; class EmulateInstruction; +class Environment; class EvaluateExpressionOptions; class Event; class EventData; diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py index 32ae8ee9b000a..cc28ae9016346 100644 --- a/lldb/packages/Python/lldbsuite/test/decorators.py +++ b/lldb/packages/Python/lldbsuite/test/decorators.py @@ -702,7 +702,7 @@ def is_compiler_clang_with_call_site_info(self): f = tempfile.NamedTemporaryFile() cmd = "echo 'int main() {}' | " \ - "%s -g -glldb -O1 -Xclang -femit-debug-entry-values -S -emit-llvm -x c -o %s -" % (compiler_path, f.name) + "%s -g -glldb -O1 -S -emit-llvm -x c -o %s -" % (compiler_path, f.name) if os.popen(cmd).close() is not None: return "Compiler can't compile with call site info enabled" diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index f8f916036f9aa..966d460ea13d9 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -1,7 +1,7 @@ """ LLDB module which provides the abstract base class of lldb test case. -The concrete subclass can override lldbtest.TesBase in order to inherit the +The concrete subclass can override lldbtest.TestBase in order to inherit the common behavior for unitest.TestCase.setUp/tearDown implemented in this file. The subclass should override the attribute mydir in order for the python runtime diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt index e0ecf29b502b7..f8ed1b37f4fa0 100644 --- a/lldb/source/API/CMakeLists.txt +++ b/lldb/source/API/CMakeLists.txt @@ -35,6 +35,7 @@ add_lldb_library(liblldb SHARED ${option_framework} SBData.cpp SBDebugger.cpp SBDeclaration.cpp + SBEnvironment.cpp SBError.cpp SBEvent.cpp SBExecutionContext.cpp diff --git a/lldb/source/API/SBEnvironment.cpp b/lldb/source/API/SBEnvironment.cpp new file mode 100644 index 0000000000000..f3676b03a9b92 --- /dev/null +++ b/lldb/source/API/SBEnvironment.cpp @@ -0,0 +1,99 @@ +//===-- SBEnvironment.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/API/SBEnvironment.h" +#include "Utils.h" +#include "lldb/API/SBStringList.h" +#include "lldb/Utility/ConstString.h" +#include "lldb/Utility/Environment.h" + +using namespace lldb; +using namespace lldb_private; + +/// This class is highly mutable, therefore we don't reproducers. + +SBEnvironment::SBEnvironment() : m_opaque_up(new Environment()) {} + +SBEnvironment::SBEnvironment(const SBEnvironment &rhs) + : m_opaque_up(clone(rhs.m_opaque_up)) {} + +SBEnvironment::SBEnvironment(Environment rhs) + : m_opaque_up(new Environment(std::move(rhs))) {} + +SBEnvironment::~SBEnvironment() = default; + +const SBEnvironment &SBEnvironment::operator=(const SBEnvironment &rhs) { + if (this != &rhs) + m_opaque_up = clone(rhs.m_opaque_up); + return *this; +} + +size_t SBEnvironment::GetNumValues() { + return m_opaque_up->size(); +} + +const char *SBEnvironment::Get(const char *name) { + auto entry = m_opaque_up->find(name); + if (entry == m_opaque_up->end()) { + return nullptr; + } + return ConstString(entry->second).AsCString(""); +} + +const char *SBEnvironment::GetNameAtIndex(size_t index) { + if (index >= GetNumValues()) + return nullptr; + return ConstString(std::next(m_opaque_up->begin(), index)->first()) + .AsCString(""); +} + +const char *SBEnvironment::GetValueAtIndex(size_t index) { + if (index >= GetNumValues()) + return nullptr; + return ConstString(std::next(m_opaque_up->begin(), index)->second) + .AsCString(""); +} + +bool SBEnvironment::Set(const char *name, const char *value, bool overwrite) { + if (overwrite) { + m_opaque_up->insert_or_assign(name, std::string(value)); + return true; + } + return m_opaque_up->try_emplace(name, std::string(value)).second; +} + +bool SBEnvironment::Unset(const char *name) { + return m_opaque_up->erase(name); +} + +SBStringList SBEnvironment::GetEntries() { + SBStringList entries; + for (const auto &KV : *m_opaque_up) { + entries.AppendString(Environment::compose(KV).c_str()); + } + return entries; +} + +void SBEnvironment::PutEntry(const char *name_and_value) { + auto split = llvm::StringRef(name_and_value).split('='); + m_opaque_up->insert_or_assign(split.first.str(), split.second.str()); +} + +void SBEnvironment::SetEntries(const SBStringList &entries, bool append) { + if (!append) + m_opaque_up->clear(); + for (size_t i = 0; i < entries.GetSize(); i++) { + PutEntry(entries.GetStringAtIndex(i)); + } +} + +void SBEnvironment::Clear() { + m_opaque_up->clear(); +} + +Environment &SBEnvironment::ref() const { return *m_opaque_up; } diff --git a/lldb/source/API/SBLaunchInfo.cpp b/lldb/source/API/SBLaunchInfo.cpp index 58307077153d6..ba13072e8f9bc 100644 --- a/lldb/source/API/SBLaunchInfo.cpp +++ b/lldb/source/API/SBLaunchInfo.cpp @@ -9,6 +9,7 @@ #include "lldb/API/SBLaunchInfo.h" #include "SBReproducerPrivate.h" +#include "lldb/API/SBEnvironment.h" #include "lldb/API/SBFileSpec.h" #include "lldb/API/SBListener.h" #include "lldb/Host/ProcessLaunchInfo.h" @@ -182,15 +183,26 @@ const char *SBLaunchInfo::GetEnvironmentEntryAtIndex(uint32_t idx) { void SBLaunchInfo::SetEnvironmentEntries(const char **envp, bool append) { LLDB_RECORD_METHOD(void, SBLaunchInfo, SetEnvironmentEntries, (const char **, bool), envp, append); + SetEnvironment(SBEnvironment(Environment(envp)), append); +} - Environment env(envp); +void SBLaunchInfo::SetEnvironment(const SBEnvironment &env, bool append) { + 
LLDB_RECORD_METHOD(void, SBLaunchInfo, SetEnvironment, + (const lldb::SBEnvironment &, bool), env, append); + Environment &refEnv = env.ref(); if (append) - m_opaque_sp->GetEnvironment().insert(env.begin(), env.end()); + m_opaque_sp->GetEnvironment().insert(refEnv.begin(), refEnv.end()); else - m_opaque_sp->GetEnvironment() = env; + m_opaque_sp->GetEnvironment() = refEnv; m_opaque_sp->RegenerateEnvp(); } +SBEnvironment SBLaunchInfo::GetEnvironment() { + LLDB_RECORD_METHOD_NO_ARGS(lldb::SBEnvironment, SBLaunchInfo, GetEnvironment); + return LLDB_RECORD_RESULT( + SBEnvironment(Environment(m_opaque_sp->GetEnvironment()))); +} + void SBLaunchInfo::Clear() { LLDB_RECORD_METHOD_NO_ARGS(void, SBLaunchInfo, Clear); @@ -390,6 +402,9 @@ void RegisterMethods(Registry &R) { ()); LLDB_REGISTER_METHOD(void, SBLaunchInfo, SetDetachOnError, (bool)); LLDB_REGISTER_METHOD_CONST(bool, SBLaunchInfo, GetDetachOnError, ()); + LLDB_REGISTER_METHOD(void, SBLaunchInfo, SetEnvironment, + (const lldb::SBEnvironment &, bool)); + LLDB_REGISTER_METHOD(lldb::SBEnvironment, SBLaunchInfo, GetEnvironment, ()); } } diff --git a/lldb/source/API/SBPlatform.cpp b/lldb/source/API/SBPlatform.cpp index 7aa0b54d00053..ddb77f4f008a7 100644 --- a/lldb/source/API/SBPlatform.cpp +++ b/lldb/source/API/SBPlatform.cpp @@ -8,9 +8,11 @@ #include "lldb/API/SBPlatform.h" #include "SBReproducerPrivate.h" +#include "lldb/API/SBEnvironment.h" #include "lldb/API/SBError.h" #include "lldb/API/SBFileSpec.h" #include "lldb/API/SBLaunchInfo.h" +#include "lldb/API/SBPlatform.h" #include "lldb/API/SBUnixSignals.h" #include "lldb/Host/File.h" #include "lldb/Target/Platform.h" @@ -649,6 +651,17 @@ SBUnixSignals SBPlatform::GetUnixSignals() const { return LLDB_RECORD_RESULT(SBUnixSignals()); } +SBEnvironment SBPlatform::GetEnvironment() { + LLDB_RECORD_METHOD_NO_ARGS(lldb::SBEnvironment, SBPlatform, GetEnvironment); + PlatformSP platform_sp(GetSP()); + + if (platform_sp) { + return LLDB_RECORD_RESULT(SBEnvironment(platform_sp->GetEnvironment())); + } + + return LLDB_RECORD_RESULT(SBEnvironment()); +} + namespace lldb_private { namespace repro { @@ -740,6 +753,7 @@ void RegisterMethods(Registry &R) { (const char *)); LLDB_REGISTER_METHOD(lldb::SBError, SBPlatform, SetFilePermissions, (const char *, uint32_t)); + LLDB_REGISTER_METHOD(lldb::SBEnvironment, SBPlatform, GetEnvironment, ()); LLDB_REGISTER_METHOD_CONST(lldb::SBUnixSignals, SBPlatform, GetUnixSignals, ()); } diff --git a/lldb/source/API/SBTarget.cpp b/lldb/source/API/SBTarget.cpp index b90e77280d24c..ca75e91bd9069 100644 --- a/lldb/source/API/SBTarget.cpp +++ b/lldb/source/API/SBTarget.cpp @@ -13,6 +13,7 @@ #include "lldb/API/SBBreakpoint.h" #include "lldb/API/SBDebugger.h" +#include "lldb/API/SBEnvironment.h" #include "lldb/API/SBEvent.h" #include "lldb/API/SBExpressionOptions.h" #include "lldb/API/SBFileSpec.h" @@ -371,10 +372,19 @@ SBProcess SBTarget::Launch(SBListener &listener, char const **argv, Module *exe_module = target_sp->GetExecutableModulePointer(); if (exe_module) launch_info.SetExecutableFile(exe_module->GetPlatformFileSpec(), true); - if (argv) + if (argv) { launch_info.GetArguments().AppendArguments(argv); - if (envp) + } else { + auto default_launch_info = target_sp->GetProcessLaunchInfo(); + launch_info.GetArguments().AppendArguments( + default_launch_info.GetArguments()); + } + if (envp) { launch_info.GetEnvironment() = Environment(envp); + } else { + auto default_launch_info = target_sp->GetProcessLaunchInfo(); + launch_info.GetEnvironment() = 
default_launch_info.GetEnvironment(); + } if (listener.IsValid()) launch_info.SetListener(listener.GetSP()); @@ -2330,16 +2340,6 @@ lldb::SBValue SBTarget::EvaluateExpression(const char *expr, Target *target = exe_ctx.GetTargetPtr(); if (target) { -#ifdef LLDB_CONFIGURATION_DEBUG - StreamString frame_description; - if (frame) - frame->DumpUsingSettingsFormat(&frame_description); - llvm::PrettyStackTraceFormat stack_trace( - "SBTarget::EvaluateExpression (expr = \"%s\", fetch_dynamic_value = " - "%u) %s", - expr, options.GetFetchDynamicValue(), - frame_description.GetString().str().c_str()); -#endif target->EvaluateExpression(expr, frame, expr_value_sp, options.ref()); expr_result.SetSP(expr_value_sp, options.GetFetchDynamicValue()); @@ -2388,6 +2388,17 @@ void SBTarget::SetLaunchInfo(const lldb::SBLaunchInfo &launch_info) { m_opaque_sp->SetProcessLaunchInfo(launch_info.ref()); } +SBEnvironment SBTarget::GetEnvironment() { + LLDB_RECORD_METHOD_NO_ARGS(lldb::SBEnvironment, SBTarget, GetEnvironment); + TargetSP target_sp(GetSP()); + + if (target_sp) { + return LLDB_RECORD_RESULT(SBEnvironment(target_sp->GetEnvironment())); + } + + return LLDB_RECORD_RESULT(SBEnvironment()); +} + namespace lldb_private { namespace repro { @@ -2643,6 +2654,7 @@ void RegisterMethods(Registry &R) { LLDB_REGISTER_METHOD(lldb::SBInstructionList, SBTarget, GetInstructionsWithFlavor, (lldb::addr_t, const char *, const void *, size_t)); + LLDB_REGISTER_METHOD(lldb::SBEnvironment, SBTarget, GetEnvironment, ()); } } diff --git a/lldb/source/Commands/CommandObjectExpression.cpp b/lldb/source/Commands/CommandObjectExpression.cpp index 7d8de573df0e8..9a314c251475f 100644 --- a/lldb/source/Commands/CommandObjectExpression.cpp +++ b/lldb/source/Commands/CommandObjectExpression.cpp @@ -486,7 +486,8 @@ bool CommandObjectExpression::EvaluateExpression(llvm::StringRef expr, } } - return true; + return (success != eExpressionSetupError && + success != eExpressionParseError); } void CommandObjectExpression::IOHandlerInputComplete(IOHandler &io_handler, diff --git a/lldb/source/Commands/CommandObjectExpression.h b/lldb/source/Commands/CommandObjectExpression.h index ddee9c36924d7..1e59cbc145288 100644 --- a/lldb/source/Commands/CommandObjectExpression.h +++ b/lldb/source/Commands/CommandObjectExpression.h @@ -71,6 +71,16 @@ class CommandObjectExpression : public CommandObjectRaw, /// expression in the given target. EvaluateExpressionOptions GetEvalOptions(const Target &target); + /// Evaluates the given expression. + /// \param output_stream The stream to which the evaluation result will be + /// printed. + /// \param error_stream Contains error messages that should be displayed to + /// the user in case the evaluation fails. + /// \param result A CommandReturnObject which status will be set to the + /// appropriate value depending on evaluation success and + /// whether the expression produced any result. + /// \return Returns true iff the expression was successfully evaluated, + /// executed and the result could be printed to the output stream. 
bool EvaluateExpression(llvm::StringRef expr, Stream &output_stream, Stream &error_stream, CommandReturnObject &result); diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index c70117c7a80a3..95f81fc6cd545 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -682,6 +682,41 @@ class CommandObjectTargetDelete : public CommandObjectParsed { OptionGroupBoolean m_cleanup_option; }; +class CommandObjectTargetShowLaunchEnvironment : public CommandObjectParsed { +public: + CommandObjectTargetShowLaunchEnvironment(CommandInterpreter &interpreter) + : CommandObjectParsed( + interpreter, "target show-launch-environment", + "Shows the environment being passed to the process when launched, " + "taking info account 3 settings: target.env-vars, " + "target.inherit-env and target.unset-env-vars.", + nullptr, eCommandRequiresTarget) {} + + ~CommandObjectTargetShowLaunchEnvironment() override = default; + +protected: + bool DoExecute(Args &args, CommandReturnObject &result) override { + Target *target = m_exe_ctx.GetTargetPtr(); + Environment env = target->GetEnvironment(); + + std::vector env_vector; + env_vector.reserve(env.size()); + for (auto &KV : env) + env_vector.push_back(&KV); + std::sort(env_vector.begin(), env_vector.end(), + [](Environment::value_type *a, Environment::value_type *b) { + return a->first() < b->first(); + }); + + auto &strm = result.GetOutputStream(); + for (auto &KV : env_vector) + strm.Format("{0}={1}\n", KV->first(), KV->second); + + result.SetStatus(eReturnStatusSuccessFinishResult); + return result.Succeeded(); + } +}; + #pragma mark CommandObjectTargetVariable // "target variable" @@ -4876,6 +4911,9 @@ CommandObjectMultiwordTarget::CommandObjectMultiwordTarget( CommandObjectSP(new CommandObjectTargetList(interpreter))); LoadSubCommand("select", CommandObjectSP(new CommandObjectTargetSelect(interpreter))); + LoadSubCommand("show-launch-environment", + CommandObjectSP(new CommandObjectTargetShowLaunchEnvironment( + interpreter))); LoadSubCommand( "stop-hook", CommandObjectSP(new CommandObjectMultiwordTargetStopHooks(interpreter))); diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp index e1d0ca941108b..9e20ba76dccbd 100644 --- a/lldb/source/Core/ValueObject.cpp +++ b/lldb/source/Core/ValueObject.cpp @@ -764,7 +764,7 @@ bool ValueObject::IsCStringContainer(bool check_pointer) { return true; addr_t cstr_address = LLDB_INVALID_ADDRESS; AddressType cstr_address_type = eAddressTypeInvalid; - cstr_address = GetAddressOf(true, &cstr_address_type); + cstr_address = GetPointerValue(&cstr_address_type); return (cstr_address != LLDB_INVALID_ADDRESS); } @@ -3270,9 +3270,7 @@ ValueObjectSP ValueObject::Persist() { if (!persistent_state) return nullptr; - auto prefix = persistent_state->GetPersistentVariablePrefix(); - ConstString name = - persistent_state->GetNextPersistentVariableName(*target_sp, prefix); + ConstString name = persistent_state->GetNextPersistentVariableName(); ValueObjectSP const_result_sp = ValueObjectConstResult::Create(target_sp.get(), GetValue(), name); diff --git a/lldb/source/DataFormatters/FormatCache.cpp b/lldb/source/DataFormatters/FormatCache.cpp index f7e5c72f7781e..5e0965fcdae40 100644 --- a/lldb/source/DataFormatters/FormatCache.cpp +++ b/lldb/source/DataFormatters/FormatCache.cpp @@ -51,15 +51,6 @@ void FormatCache::Entry::Set(lldb::SyntheticChildrenSP synthetic_sp) { m_synthetic_sp = synthetic_sp; } 
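The new target show-launch-environment command defined above ties the three settings together (target.env-vars, target.inherit-env, target.unset-env-vars) and prints the merged result as NAME=VALUE lines sorted by name. A small, illustrative way to exercise it from the embedded script interpreter; the setting value below is invented for the example:

# Hedged sketch: driving the new command through SBDebugger::HandleCommand.
lldb.debugger.HandleCommand("settings set target.env-vars DEBUG_MODE=1")
lldb.debugger.HandleCommand("target show-launch-environment")
# Expected shape of the output: one NAME=VALUE pair per line, sorted by name,
# as produced by CommandObjectTargetShowLaunchEnvironment::DoExecute above.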
-FormatCache::FormatCache() - : m_map(), m_mutex() -#ifdef LLDB_CONFIGURATION_DEBUG - , - m_cache_hits(0), m_cache_misses(0) -#endif -{ -} - FormatCache::Entry &FormatCache::GetEntry(ConstString type) { auto i = m_map.find(type), e = m_map.end(); if (i != e) @@ -87,15 +78,11 @@ bool FormatCache::Get(ConstString type, ImplSP &format_impl_sp) { std::lock_guard guard(m_mutex); auto entry = GetEntry(type); if (entry.IsCached()) { -#ifdef LLDB_CONFIGURATION_DEBUG m_cache_hits++; -#endif entry.Get(format_impl_sp); return true; } -#ifdef LLDB_CONFIGURATION_DEBUG m_cache_misses++; -#endif format_impl_sp.reset(); return false; } diff --git a/lldb/source/DataFormatters/FormattersHelpers.cpp b/lldb/source/DataFormatters/FormattersHelpers.cpp index 96e93808c18e5..7944ff06eee53 100644 --- a/lldb/source/DataFormatters/FormattersHelpers.cpp +++ b/lldb/source/DataFormatters/FormattersHelpers.cpp @@ -142,3 +142,14 @@ lldb_private::formatters::GetArrayAddressOrPointerValue(ValueObject &valobj) { return data_addr; } + +lldb::ValueObjectSP +lldb_private::formatters::GetValueOfLibCXXCompressedPair(ValueObject &pair) { + ValueObjectSP value = + pair.GetChildMemberWithName(ConstString("__value_"), true); + if (!value) { + // pre-r300140 member name + value = pair.GetChildMemberWithName(ConstString("__first_"), true); + } + return value; +} diff --git a/lldb/source/DataFormatters/StringPrinter.cpp b/lldb/source/DataFormatters/StringPrinter.cpp index 92dd71d17b8c1..4515b67b2adfd 100644 --- a/lldb/source/DataFormatters/StringPrinter.cpp +++ b/lldb/source/DataFormatters/StringPrinter.cpp @@ -525,27 +525,33 @@ static bool ReadUTFBufferAndDumpToStream( if (!options.GetStream()) return false; - uint32_t sourceSize = options.GetSourceSize(); + uint32_t sourceSize; bool needs_zero_terminator = options.GetNeedsZeroTermination(); bool is_truncated = false; const auto max_size = process_sp->GetTarget().GetMaximumSizeOfStringSummary(); - if (!sourceSize) { + if (options.HasSourceSize()) { + sourceSize = options.GetSourceSize(); + if (!options.GetIgnoreMaxLength()) { + if (sourceSize > max_size) { + sourceSize = max_size; + is_truncated = true; + } + } + } else { sourceSize = max_size; needs_zero_terminator = true; - } else if (!options.GetIgnoreMaxLength()) { - if (sourceSize > max_size) { - sourceSize = max_size; - is_truncated = true; - } } const int bufferSPSize = sourceSize * type_width; lldb::DataBufferSP buffer_sp(new DataBufferHeap(bufferSPSize, 0)); - if (!buffer_sp->GetBytes()) + // Check if we got bytes. We never get any bytes if we have an empty + // string, but we still continue so that we end up actually printing + // an empty string (""). 
+ if (sourceSize != 0 && !buffer_sp->GetBytes()) return false; Status error; diff --git a/lldb/source/Expression/ExpressionVariable.cpp b/lldb/source/Expression/ExpressionVariable.cpp index 7c27c0f249ec4..d95f0745cf4ba 100644 --- a/lldb/source/Expression/ExpressionVariable.cpp +++ b/lldb/source/Expression/ExpressionVariable.cpp @@ -76,13 +76,3 @@ void PersistentExpressionState::RegisterExecutionUnit( } } } - -ConstString PersistentExpressionState::GetNextPersistentVariableName( - Target &target, llvm::StringRef Prefix) { - llvm::SmallString<64> name; - { - llvm::raw_svector_ostream os(name); - os << Prefix << target.GetNextPersistentVariableIndex(); - } - return ConstString(name); -} diff --git a/lldb/source/Expression/Materializer.cpp b/lldb/source/Expression/Materializer.cpp index 33c061effca47..8e96891257e4d 100644 --- a/lldb/source/Expression/Materializer.cpp +++ b/lldb/source/Expression/Materializer.cpp @@ -881,11 +881,9 @@ class EntityResultVariable : public Materializer::Entity { return; } - ConstString name = - m_delegate - ? m_delegate->GetName() - : persistent_state->GetNextPersistentVariableName( - *target_sp, persistent_state->GetPersistentVariablePrefix()); + ConstString name = m_delegate + ? m_delegate->GetName() + : persistent_state->GetNextPersistentVariableName(); lldb::ExpressionVariableSP ret = persistent_state->CreatePersistentVariable( exe_scope, name, m_type, map.GetByteOrder(), map.GetAddressByteSize()); diff --git a/lldb/source/Expression/UserExpression.cpp b/lldb/source/Expression/UserExpression.cpp index 0243cc0c37505..5bd2321e48ddc 100644 --- a/lldb/source/Expression/UserExpression.cpp +++ b/lldb/source/Expression/UserExpression.cpp @@ -259,6 +259,10 @@ UserExpression::Evaluate(ExecutionContext &exe_ctx, // If there is a fixed expression, try to parse it: if (!parse_success) { + // Delete the expression that failed to parse before attempting to parse + // the next expression. 
+ user_expression_sp.reset(); + execution_results = lldb::eExpressionParseError; if (fixed_expression && !fixed_expression->empty() && options.GetAutoApplyFixIts()) { diff --git a/lldb/source/Host/common/FileSystem.cpp b/lldb/source/Host/common/FileSystem.cpp index 220d7672cfb56..dcfa594597a1b 100644 --- a/lldb/source/Host/common/FileSystem.cpp +++ b/lldb/source/Host/common/FileSystem.cpp @@ -279,7 +279,7 @@ void FileSystem::Resolve(FileSpec &file_spec) { std::shared_ptr FileSystem::CreateDataBuffer(const llvm::Twine &path, uint64_t size, uint64_t offset) { - AddFile(path); + Collect(path); const bool is_volatile = !IsLocal(path); const ErrorOr external_path = GetExternalPath(path); @@ -417,7 +417,7 @@ static mode_t GetOpenMode(uint32_t permissions) { Expected FileSystem::Open(const FileSpec &file_spec, File::OpenOptions options, uint32_t permissions, bool should_close_fd) { - AddFile(file_spec.GetPath()); + Collect(file_spec.GetPath()); const int open_flags = GetOpenFlags(options); const mode_t open_mode = @@ -465,7 +465,11 @@ ErrorOr FileSystem::GetExternalPath(const FileSpec &file_spec) { return GetExternalPath(file_spec.GetPath()); } -void FileSystem::AddFile(const llvm::Twine &file) { +void FileSystem::Collect(const FileSpec &file_spec) { + Collect(file_spec.GetPath()); +} + +void FileSystem::Collect(const llvm::Twine &file) { if (m_collector && !llvm::sys::fs::is_directory(file)) { m_collector->addFile(file); } diff --git a/lldb/source/Host/macosx/objcxx/Host.mm b/lldb/source/Host/macosx/objcxx/Host.mm index 2475338a37fd5..eba3060f8ec63 100644 --- a/lldb/source/Host/macosx/objcxx/Host.mm +++ b/lldb/source/Host/macosx/objcxx/Host.mm @@ -1088,43 +1088,6 @@ static Status LaunchProcessPosixSpawn(const char *exe_path, return error; } -// posix_spawnattr_setbinpref_np appears to be an Apple extension per: -// http://www.unix.com/man-page/OSX/3/posix_spawnattr_setbinpref_np/ -#if !defined(__arm__) - - // Don't set the binpref if a shell was provided. After all, that's only - // going to affect what version of the shell - // is launched, not what fork of the binary is launched. We insert "arch - // --arch as part of the shell invocation - // to do that job on OSX. - - if (launch_info.GetShell() == FileSpec()) { - // We don't need to do this for ARM, and we really shouldn't now that we - // have multiple CPU subtypes and no posix_spawnattr call that allows us - // to set which CPU subtype to launch... 
- const ArchSpec &arch_spec = launch_info.GetArchitecture(); - cpu_type_t cpu = arch_spec.GetMachOCPUType(); - cpu_type_t sub = arch_spec.GetMachOCPUSubType(); - if (cpu != 0 && cpu != static_cast(UINT32_MAX) && - cpu != static_cast(LLDB_INVALID_CPUTYPE) && - !(cpu == 0x01000007 && sub == 8)) // If haswell is specified, don't try - // to set the CPU type or we will fail - { - size_t ocount = 0; - error.SetError(::posix_spawnattr_setbinpref_np(&attr, 1, &cpu, &ocount), - eErrorTypePOSIX); - if (error.Fail()) - LLDB_LOG(log, - "error: {0}, ::posix_spawnattr_setbinpref_np ( &attr, 1, " - "cpu_type = {1:x}, count => {2} )", - error, cpu, ocount); - - if (error.Fail() || ocount != 1) - return error; - } - } -#endif // !defined(__arm__) - const char *tmp_argv[2]; char *const *argv = const_cast( launch_info.GetArguments().GetConstArgumentVector()); diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp index 193b3bd829c54..68a0335682d3a 100644 --- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp +++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp @@ -245,6 +245,7 @@ DynamicLoaderDarwinKernel::SearchForKernelWithDebugHints(Process *process) { Status read_err; addr_t kernel_addresses_64[] = { + 0xfffffff000002010ULL, 0xfffffff000004010ULL, // newest arm64 devices 0xffffff8000004010ULL, // 2014-2015-ish arm64 devices 0xffffff8000002010ULL, // oldest arm64 devices diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp index 3faf6f238b233..698fea4c2d3cd 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangExpressionParser.cpp @@ -281,23 +281,22 @@ ClangExpressionParser::ClangExpressionParser( // We can't compile expressions without a target. So if the exe_scope is // null or doesn't have a target, then we just need to get out of here. I'll - // lldb_assert and not make any of the compiler objects since + // lldbassert and not make any of the compiler objects since // I can't return errors directly from the constructor. Further calls will // check if the compiler was made and // bag out if it wasn't. 
if (!exe_scope) { - lldb_assert(exe_scope, "Can't make an expression parser with a null scope.", - __FUNCTION__, __FILE__, __LINE__); + lldbassert(exe_scope && + "Can't make an expression parser with a null scope."); return; } lldb::TargetSP target_sp; target_sp = exe_scope->CalculateTarget(); if (!target_sp) { - lldb_assert(target_sp.get(), - "Can't make an expression parser with a null target.", - __FUNCTION__, __FILE__, __LINE__); + lldbassert(target_sp.get() && + "Can't make an expression parser with a null target."); return; } diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangPersistentVariables.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangPersistentVariables.cpp index 3cbedf80755a9..42afac9edb0d5 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangPersistentVariables.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangPersistentVariables.cpp @@ -108,3 +108,14 @@ ClangPersistentVariables::GetClangASTImporter() { } return m_ast_importer_sp; } + +ConstString +ClangPersistentVariables::GetNextPersistentVariableName(bool is_error) { + llvm::SmallString<64> name; + { + llvm::raw_svector_ostream os(name); + os << GetPersistentVariablePrefix(is_error) + << m_next_persistent_variable_id++; + } + return ConstString(name); +} diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangPersistentVariables.h b/lldb/source/Plugins/ExpressionParser/Clang/ClangPersistentVariables.h index 12268b6549aa3..f888b2d56e68c 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangPersistentVariables.h +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangPersistentVariables.h @@ -51,9 +51,7 @@ class ClangPersistentVariables : public PersistentExpressionState { void RemovePersistentVariable(lldb::ExpressionVariableSP variable) override; - llvm::StringRef GetPersistentVariablePrefix(bool is_error) const override { - return "$"; - } + ConstString GetNextPersistentVariableName(bool is_error = false) override; /// Returns the next file name that should be used for user expressions. std::string GetNextExprFileName() { @@ -80,6 +78,12 @@ class ClangPersistentVariables : public PersistentExpressionState { return m_hand_loaded_clang_modules; } +protected: + llvm::StringRef + GetPersistentVariablePrefix(bool is_error = false) const override { + return "$"; + } + private: /// The counter used by GetNextExprFileName. 
uint32_t m_next_user_file_id = 0; diff --git a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp index 6d781934c1740..b246fc374d1cc 100644 --- a/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp +++ b/lldb/source/Plugins/ExpressionParser/Clang/ClangUserExpression.cpp @@ -924,9 +924,7 @@ void ClangUserExpression::ClangUserExpressionHelper::CommitPersistentDecls() { } ConstString ClangUserExpression::ResultDelegate::GetName() { - auto prefix = m_persistent_state->GetPersistentVariablePrefix(); - return m_persistent_state->GetNextPersistentVariableName(*m_target_sp, - prefix); + return m_persistent_state->GetNextPersistentVariableName(false); } void ClangUserExpression::ResultDelegate::DidDematerialize( diff --git a/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp b/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp index aa99db418283d..34f88d2b4443d 100644 --- a/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp +++ b/lldb/source/Plugins/Instruction/ARM/EmulateInstructionARM.cpp @@ -605,9 +605,6 @@ static uint32_t CountITSize(uint32_t ITMask) { // First count the trailing zeros of the IT mask. uint32_t TZ = llvm::countTrailingZeros(ITMask); if (TZ > 3) { -#ifdef LLDB_CONFIGURATION_DEBUG - printf("Encoding error: IT Mask '0000'\n"); -#endif return 0; } return (4 - TZ); @@ -622,15 +619,9 @@ bool ITSession::InitIT(uint32_t bits7_0) { // A8.6.50 IT unsigned short FirstCond = Bits32(bits7_0, 7, 4); if (FirstCond == 0xF) { -#ifdef LLDB_CONFIGURATION_DEBUG - printf("Encoding error: IT FirstCond '1111'\n"); -#endif return false; } if (FirstCond == 0xE && ITCounter != 1) { -#ifdef LLDB_CONFIGURATION_DEBUG - printf("Encoding error: IT FirstCond '1110' && Mask != '1000'\n"); -#endif return false; } diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index 97084da5fffad..ecb577e0c531f 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -611,6 +611,15 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { "shared_ptr synthetic children", ConstString("^(std::__[[:alnum:]]+::)shared_ptr<.+>(( )?&)?$"), stl_synth_flags, true); + + ConstString libcxx_std_unique_ptr_regex( + "^std::__[[:alnum:]]+::unique_ptr<.+>(( )?&)?$"); + AddCXXSynthetic( + cpp_category_sp, + lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEndCreator, + "unique_ptr synthetic children", libcxx_std_unique_ptr_regex, + stl_synth_flags, true); + AddCXXSynthetic( cpp_category_sp, lldb_private::formatters::LibcxxSharedPtrSyntheticFrontEndCreator, @@ -715,6 +724,10 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { "libc++ std::weak_ptr summary provider", ConstString("^std::__[[:alnum:]]+::weak_ptr<.+>(( )?&)?$"), stl_summary_flags, true); + AddCXXSummary(cpp_category_sp, + lldb_private::formatters::LibcxxUniquePointerSummaryProvider, + "libc++ std::unique_ptr summary provider", + libcxx_std_unique_ptr_regex, stl_summary_flags, true); AddCXXSynthetic( cpp_category_sp, diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp index 7152ff407f291..84dd09a47d8ac 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp @@ -144,6 +144,43 @@ bool 
lldb_private::formatters::LibcxxSmartPointerSummaryProvider( return true; } +bool lldb_private::formatters::LibcxxUniquePointerSummaryProvider( + ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) { + ValueObjectSP valobj_sp(valobj.GetNonSyntheticValue()); + if (!valobj_sp) + return false; + + ValueObjectSP ptr_sp( + valobj_sp->GetChildMemberWithName(ConstString("__ptr_"), true)); + if (!ptr_sp) + return false; + + ptr_sp = GetValueOfLibCXXCompressedPair(*ptr_sp); + if (!ptr_sp) + return false; + + if (ptr_sp->GetValueAsUnsigned(0) == 0) { + stream.Printf("nullptr"); + return true; + } else { + bool print_pointee = false; + Status error; + ValueObjectSP pointee_sp = ptr_sp->Dereference(error); + if (pointee_sp && error.Success()) { + if (pointee_sp->DumpPrintableRepresentation( + stream, ValueObject::eValueObjectRepresentationStyleSummary, + lldb::eFormatInvalid, + ValueObject::PrintableRepresentationSpecialCases::eDisable, + false)) + print_pointee = true; + } + if (!print_pointee) + stream.Printf("ptr = 0x%" PRIx64, ptr_sp->GetValueAsUnsigned(0)); + } + + return true; +} + /* (lldb) fr var ibeg --raw --ptr-depth 1 (std::__1::__map_iteratorGetChildMemberWithName(ConstString("__ptr_"), true)); + if (!ptr_sp) + return false; + + m_compressed_pair_sp = GetValueOfLibCXXCompressedPair(*ptr_sp); + + return false; +} + +bool lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd:: + MightHaveChildren() { + return true; +} + +size_t lldb_private::formatters::LibcxxUniquePtrSyntheticFrontEnd:: + GetIndexOfChildWithName(ConstString name) { + if (name == "__value_") + return 0; + return UINT32_MAX; +} + bool lldb_private::formatters::LibcxxContainerSummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) { if (valobj.IsPointerType()) { diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h index a92e8be9abe95..ea5a7c1781783 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h @@ -43,6 +43,10 @@ bool LibcxxSmartPointerSummaryProvider( const TypeSummaryOptions &options); // libc++ std::shared_ptr<> and std::weak_ptr<> +// libc++ std::unique_ptr<> +bool LibcxxUniquePointerSummaryProvider(ValueObject &valobj, Stream &stream, + const TypeSummaryOptions &options); + bool LibcxxFunctionSummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options); // libc++ std::function<> @@ -107,6 +111,26 @@ class LibcxxSharedPtrSyntheticFrontEnd : public SyntheticChildrenFrontEnd { lldb::ByteOrder m_byte_order; }; +class LibcxxUniquePtrSyntheticFrontEnd : public SyntheticChildrenFrontEnd { +public: + LibcxxUniquePtrSyntheticFrontEnd(lldb::ValueObjectSP valobj_sp); + + size_t CalculateNumChildren() override; + + lldb::ValueObjectSP GetChildAtIndex(size_t idx) override; + + bool Update() override; + + bool MightHaveChildren() override; + + size_t GetIndexOfChildWithName(ConstString name) override; + + ~LibcxxUniquePtrSyntheticFrontEnd() override; + +private: + lldb::ValueObjectSP m_compressed_pair_sp; +}; + SyntheticChildrenFrontEnd * LibcxxBitsetSyntheticFrontEndCreator(CXXSyntheticChildren *, lldb::ValueObjectSP); @@ -115,6 +139,10 @@ SyntheticChildrenFrontEnd * LibcxxSharedPtrSyntheticFrontEndCreator(CXXSyntheticChildren *, lldb::ValueObjectSP); +SyntheticChildrenFrontEnd * +LibcxxUniquePtrSyntheticFrontEndCreator(CXXSyntheticChildren *, + lldb::ValueObjectSP); + SyntheticChildrenFrontEnd * 
LibcxxStdVectorSyntheticFrontEndCreator(CXXSyntheticChildren *, lldb::ValueObjectSP); diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp index 4c5940a45766d..0d5ae16a0b295 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxxList.cpp @@ -290,15 +290,6 @@ ValueObjectSP ForwardListFrontEnd::GetChildAtIndex(size_t idx) { m_element_type); } -static ValueObjectSP GetValueOfCompressedPair(ValueObject &pair) { - ValueObjectSP value = pair.GetChildMemberWithName(ConstString("__value_"), true); - if (! value) { - // pre-r300140 member name - value = pair.GetChildMemberWithName(ConstString("__first_"), true); - } - return value; -} - bool ForwardListFrontEnd::Update() { AbstractListFrontEnd::Update(); @@ -311,7 +302,7 @@ bool ForwardListFrontEnd::Update() { m_backend.GetChildMemberWithName(ConstString("__before_begin_"), true)); if (!impl_sp) return false; - impl_sp = GetValueOfCompressedPair(*impl_sp); + impl_sp = GetValueOfLibCXXCompressedPair(*impl_sp); if (!impl_sp) return false; m_head = impl_sp->GetChildMemberWithName(ConstString("__next_"), true).get(); @@ -332,7 +323,7 @@ size_t ListFrontEnd::CalculateNumChildren() { ValueObjectSP size_alloc( m_backend.GetChildMemberWithName(ConstString("__size_alloc_"), true)); if (size_alloc) { - ValueObjectSP value = GetValueOfCompressedPair(*size_alloc); + ValueObjectSP value = GetValueOfLibCXXCompressedPair(*size_alloc); if (value) { m_count = value->GetValueAsUnsigned(UINT32_MAX); } diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp index 78f58754cc319..b4af67ecee0dd 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibStdcpp.cpp @@ -259,6 +259,7 @@ bool lldb_private::formatters::LibStdcppStringSummaryProvider( if (error.Fail()) return false; options.SetSourceSize(size_of_data); + options.SetHasSourceSize(true); if (!StringPrinter::ReadStringAndDumpToStream< StringPrinter::StringElementType::UTF8>(options)) { @@ -319,6 +320,7 @@ bool lldb_private::formatters::LibStdcppWStringSummaryProvider( if (error.Fail()) return false; options.SetSourceSize(size_of_data); + options.SetHasSourceSize(true); options.SetPrefixToken("L"); switch (wchar_size) { diff --git a/lldb/source/Plugins/Language/ObjC/NSString.cpp b/lldb/source/Plugins/Language/ObjC/NSString.cpp index 65256dc7acbdd..7c4afb36b5883 100644 --- a/lldb/source/Plugins/Language/ObjC/NSString.cpp +++ b/lldb/source/Plugins/Language/ObjC/NSString.cpp @@ -170,6 +170,7 @@ bool lldb_private::formatters::NSStringSummaryProvider( options.SetStream(&stream); options.SetQuote('"'); options.SetSourceSize(explicit_length); + options.SetHasSourceSize(has_explicit_length); options.SetNeedsZeroTermination(false); options.SetIgnoreMaxLength(summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryUncapped); @@ -182,6 +183,7 @@ bool lldb_private::formatters::NSStringSummaryProvider( options.SetProcessSP(process_sp); options.SetStream(&stream); options.SetSourceSize(explicit_length); + options.SetHasSourceSize(has_explicit_length); options.SetNeedsZeroTermination(false); options.SetIgnoreMaxLength(summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryUncapped); @@ -199,6 +201,7 @@ bool lldb_private::formatters::NSStringSummaryProvider( options.SetStream(&stream); options.SetQuote('"'); options.SetSourceSize(explicit_length); + 
options.SetHasSourceSize(has_explicit_length); options.SetIgnoreMaxLength(summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryUncapped); options.SetLanguage(summary_options.GetLanguage()); @@ -221,6 +224,7 @@ bool lldb_private::formatters::NSStringSummaryProvider( options.SetStream(&stream); options.SetQuote('"'); options.SetSourceSize(explicit_length); + options.SetHasSourceSize(has_explicit_length); options.SetNeedsZeroTermination(!has_explicit_length); options.SetIgnoreMaxLength(summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryUncapped); @@ -241,6 +245,7 @@ bool lldb_private::formatters::NSStringSummaryProvider( options.SetStream(&stream); options.SetQuote('"'); options.SetSourceSize(explicit_length); + options.SetHasSourceSize(has_explicit_length); options.SetNeedsZeroTermination(!has_explicit_length); options.SetIgnoreMaxLength(summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryUncapped); @@ -263,6 +268,7 @@ bool lldb_private::formatters::NSStringSummaryProvider( options.SetProcessSP(process_sp); options.SetStream(&stream); options.SetSourceSize(explicit_length); + options.SetHasSourceSize(has_explicit_length); options.SetNeedsZeroTermination(!has_explicit_length); options.SetIgnoreMaxLength(summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryUncapped); @@ -286,6 +292,7 @@ bool lldb_private::formatters::NSStringSummaryProvider( options.SetProcessSP(process_sp); options.SetStream(&stream); options.SetSourceSize(explicit_length); + options.SetHasSourceSize(has_explicit_length); options.SetIgnoreMaxLength(summary_options.GetCapping() == TypeSummaryCapping::eTypeSummaryUncapped); options.SetLanguage(summary_options.GetLanguage()); diff --git a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp index 9b3dbb166b687..4a07c792eebba 100644 --- a/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp +++ b/lldb/source/Plugins/LanguageRuntime/ObjC/AppleObjCRuntime/AppleObjCRuntimeV2.cpp @@ -1175,6 +1175,28 @@ AppleObjCRuntimeV2::GetClassDescriptorFromISA(ObjCISA isa) { return class_descriptor_sp; } +static std::pair ObjCGetClassNameRaw( + AppleObjCRuntime::ObjCISA isa, + Process *process) { + StreamString expr_string; + std::string input = std::to_string(isa); + expr_string.Printf("(const char *)objc_debug_class_getNameRaw(%s)", + input.c_str()); + + ValueObjectSP result_sp; + EvaluateExpressionOptions eval_options; + eval_options.SetLanguage(lldb::eLanguageTypeObjC); + eval_options.SetResultIsInternal(true); + eval_options.SetGenerateDebugInfo(true); + eval_options.SetTimeout(process->GetUtilityExpressionTimeout()); + auto eval_result = process->GetTarget().EvaluateExpression( + expr_string.GetData(), + process->GetThreadList().GetSelectedThread()->GetSelectedFrame().get(), + result_sp, eval_options); + ConstString type_name(result_sp->GetSummaryAsCString()); + return std::make_pair(eval_result == eExpressionCompleted, type_name); +} + ObjCLanguageRuntime::ClassDescriptorSP AppleObjCRuntimeV2::GetClassDescriptor(ValueObject &valobj) { ClassDescriptorSP objc_class_sp; @@ -1191,33 +1213,43 @@ AppleObjCRuntimeV2::GetClassDescriptor(ValueObject &valobj) { // if we get an invalid VO (which might still happen when playing around with // pointers returned by the expression parser, don't consider this a valid // ObjC object) - if (valobj.GetCompilerType().IsValid()) { - addr_t isa_pointer = 
valobj.GetPointerValue(); + if (!valobj.GetCompilerType().IsValid()) + return objc_class_sp; + addr_t isa_pointer = valobj.GetPointerValue(); - // tagged pointer - if (IsTaggedPointer(isa_pointer)) { - return m_tagged_pointer_vendor_up->GetClassDescriptor(isa_pointer); - } else { - ExecutionContext exe_ctx(valobj.GetExecutionContextRef()); + // tagged pointer + if (IsTaggedPointer(isa_pointer)) + return m_tagged_pointer_vendor_up->GetClassDescriptor(isa_pointer); + ExecutionContext exe_ctx(valobj.GetExecutionContextRef()); - Process *process = exe_ctx.GetProcessPtr(); - if (process) { - Status error; - ObjCISA isa = process->ReadPointerFromMemory(isa_pointer, error); - if (isa != LLDB_INVALID_ADDRESS) { - objc_class_sp = GetClassDescriptorFromISA(isa); - if (isa && !objc_class_sp) { - Log *log(GetLogIfAnyCategoriesSet(LIBLLDB_LOG_PROCESS | - LIBLLDB_LOG_TYPES)); - LLDB_LOGF(log, - "0x%" PRIx64 - ": AppleObjCRuntimeV2::GetClassDescriptor() ISA was " - "not in class descriptor cache 0x%" PRIx64, - isa_pointer, isa); - } - } - } - } + Process *process = exe_ctx.GetProcessPtr(); + if (!process) + return objc_class_sp; + + Status error; + ObjCISA isa = process->ReadPointerFromMemory(isa_pointer, error); + if (isa == LLDB_INVALID_ADDRESS) + return objc_class_sp; + + objc_class_sp = GetClassDescriptorFromISA(isa); + + if (objc_class_sp) + return objc_class_sp; + else { + Log *log(GetLogIfAllCategoriesSet(LIBLLDB_LOG_PROCESS | + LIBLLDB_LOG_TYPES)); + LLDB_LOGF(log, + "0x%" PRIx64 + ": AppleObjCRuntimeV2::GetClassDescriptor() ISA was " + "not in class descriptor cache 0x%" PRIx64, + isa_pointer, isa); + } + + ClassDescriptorSP descriptor_sp(new ClassDescriptorV2(*this, isa, nullptr)); + auto resolved = ObjCGetClassNameRaw(isa, process); + if (resolved.first == true) { + AddClass(isa, descriptor_sp, resolved.second.AsCString()); + objc_class_sp = descriptor_sp; } return objc_class_sp; } diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp index aa1f8994ecb66..350043f8d4e9a 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.cpp @@ -58,6 +58,17 @@ PlatformDarwin::PlatformDarwin(bool is_host) /// inherited from by the plug-in instance. PlatformDarwin::~PlatformDarwin() {} +lldb_private::Status +PlatformDarwin::PutFile(const lldb_private::FileSpec &source, + const lldb_private::FileSpec &destination, uint32_t uid, + uint32_t gid) { + // Unconditionally unlink the destination. If it is an executable, + // simply opening it and truncating its contents would invalidate + // its cached code signature. 
+ Unlink(destination); + return PlatformPOSIX::PutFile(source, destination, uid, gid); +} + FileSpecList PlatformDarwin::LocateExecutableScriptingResources( Target *target, Module &module, Stream *feedback_stream) { FileSpecList file_list; @@ -1829,6 +1840,21 @@ lldb_private::Status PlatformDarwin::FindBundleBinaryInExecSearchPaths( return Status(); } +std::string PlatformDarwin::FindComponentInPath(llvm::StringRef path, + llvm::StringRef component) { + auto begin = llvm::sys::path::begin(path); + auto end = llvm::sys::path::end(path); + for (auto it = begin; it != end; ++it) { + if (it->contains(component)) { + llvm::SmallString<128> buffer; + llvm::sys::path::append(buffer, begin, ++it, + llvm::sys::path::Style::posix); + return buffer.str().str(); + } + } + return {}; +} + std::string PlatformDarwin::FindXcodeContentsDirectoryInPath(llvm::StringRef path) { auto begin = llvm::sys::path::begin(path); @@ -1959,3 +1985,15 @@ FileSpec PlatformDarwin::GetXcodeContentsDirectory() { }); return g_xcode_contents_path; } + +FileSpec PlatformDarwin::GetCurrentToolchainDirectory() { + if (FileSpec fspec = HostInfo::GetShlibDir()) + return FileSpec(FindComponentInPath(fspec.GetPath(), ".xctoolchain")); + return {}; +} + +FileSpec PlatformDarwin::GetCurrentCommandLineToolsDirectory() { + if (FileSpec fspec = HostInfo::GetShlibDir()) + return FileSpec(FindComponentInPath(fspec.GetPath(), "CommandLineTools")); + return {}; +} diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h index d385712db8e63..f6729c508f009 100644 --- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h +++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwin.h @@ -25,6 +25,11 @@ class PlatformDarwin : public PlatformPOSIX { ~PlatformDarwin() override; + lldb_private::Status PutFile(const lldb_private::FileSpec &source, + const lldb_private::FileSpec &destination, + uint32_t uid = UINT32_MAX, + uint32_t gid = UINT32_MAX) override; + // lldb_private::Platform functions lldb_private::Status ResolveSymbolFile(lldb_private::Target &target, @@ -100,6 +105,13 @@ class PlatformDarwin : public PlatformPOSIX { static lldb_private::FileSpec GetXcodeSDK(SDKType type); static lldb_private::FileSpec GetXcodeContentsDirectory(); + /// Return the toolchain directory the current LLDB instance is located in. + static lldb_private::FileSpec GetCurrentToolchainDirectory(); + + /// Return the command line tools directory the current LLDB instance is + /// located in.
+ static lldb_private::FileSpec GetCurrentCommandLineToolsDirectory(); + protected: struct CrashInfoAnnotations { uint64_t version; // unsigned long @@ -172,6 +184,8 @@ class PlatformDarwin : public PlatformPOSIX { const lldb_private::FileSpecList *module_search_paths_ptr, lldb::ModuleSP *old_module_sp_ptr, bool *did_create_ptr); + static std::string FindComponentInPath(llvm::StringRef path, + llvm::StringRef component); static std::string FindXcodeContentsDirectoryInPath(llvm::StringRef path); std::string m_developer_directory; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp index edbd408622f1e..fdf1397d7a69b 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationClient.cpp @@ -2797,12 +2797,10 @@ size_t GDBRemoteCommunicationClient::GetCurrentThreadIDs( thread_ids.push_back(1); } } else { -#if !defined(LLDB_CONFIGURATION_DEBUG) Log *log(ProcessGDBRemoteLog::GetLogIfAnyCategoryIsSet(GDBR_LOG_PROCESS | GDBR_LOG_PACKETS)); - LLDB_LOGF(log, "error: failed to get packet sequence mutex, not sending " - "packet 'qfThreadInfo'"); -#endif + LLDB_LOG(log, "error: failed to get packet sequence mutex, not sending " + "packet 'qfThreadInfo'"); sequence_mutex_unavailable = true; } return thread_ids.size(); diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp index ecbd30c10ae01..f9b24ad83de51 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp @@ -89,6 +89,7 @@ bool ScriptInterpreterLua::LoadScriptingModule( const char *filename, bool init_session, lldb_private::Status &error, StructuredData::ObjectSP *module_sp) { + FileSystem::Instance().Collect(filename); if (llvm::Error e = m_lua->LoadModule(filename)) { error.SetErrorStringWithFormatv("lua failed to import '{0}': {1}\n", filename, llvm::toString(std::move(e))); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 3e93ddbf18c8e..f59b70ac31d27 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -2772,6 +2772,7 @@ bool ScriptInterpreterPythonImpl::LoadScriptingModule( { FileSpec target_file(pathname); FileSystem::Instance().Resolve(target_file); + FileSystem::Instance().Collect(target_file); std::string basename(target_file.GetFilename().GetCString()); StreamString command_stream; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 1e149d89153a4..c98694fca6b5a 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -185,7 +185,7 @@ GetFileByIndex(const llvm::DWARFDebugLine::Prologue &prologue, size_t idx, // Otherwise ask for a relative path. 
std::string rel_path; - auto relative = llvm::DILineInfoSpecifier::FileLineInfoKind::Default; + auto relative = llvm::DILineInfoSpecifier::FileLineInfoKind::RawValue; if (!prologue.getFileNameByIndex(idx, compile_dir, relative, rel_path, style)) return {}; return std::move(rel_path); @@ -3737,6 +3737,7 @@ SymbolFileDWARF::CollectCallEdges(ModuleSP module, DWARFDIE function_die) { llvm::Optional call_origin; llvm::Optional call_target; addr_t return_pc = LLDB_INVALID_ADDRESS; + addr_t call_inst_pc = LLDB_INVALID_ADDRESS; DWARFAttributes attributes; const size_t num_attributes = child.GetAttributes(attributes); @@ -3765,6 +3766,12 @@ SymbolFileDWARF::CollectCallEdges(ModuleSP module, DWARFDIE function_die) { if (attr == DW_AT_call_return_pc) return_pc = form_value.Address(); + // Extract DW_AT_call_pc (the PC at the call/branch instruction). It + // should only ever be unavailable for non-tail calls, in which case use + // LLDB_INVALID_ADDRESS. + if (attr == DW_AT_call_pc) + call_inst_pc = form_value.Address(); + // Extract DW_AT_call_target (the location of the address of the indirect // call). if (attr == DW_AT_call_target) { @@ -3787,10 +3794,11 @@ SymbolFileDWARF::CollectCallEdges(ModuleSP module, DWARFDIE function_die) { continue; } - // Adjust the return PC. It needs to be fixed up if the main executable + // Adjust any PC forms. It needs to be fixed up if the main executable // contains a debug map (i.e. pointers to object files), because we need a // file address relative to the executable's text section. return_pc = FixupAddress(return_pc); + call_inst_pc = FixupAddress(call_inst_pc); // Extract call site parameters. CallSiteParameterArray parameters = @@ -3798,10 +3806,13 @@ SymbolFileDWARF::CollectCallEdges(ModuleSP module, DWARFDIE function_die) { std::unique_ptr edge; if (call_origin) { - LLDB_LOG(log, "CollectCallEdges: Found call origin: {0} (retn-PC: {1:x})", - call_origin->GetPubname(), return_pc); + LLDB_LOG(log, + "CollectCallEdges: Found call origin: {0} (retn-PC: {1:x}) " + "(call-PC: {2:x})", + call_origin->GetPubname(), return_pc, call_inst_pc); edge = std::make_unique(call_origin->GetMangledName(), - return_pc, std::move(parameters)); + return_pc, call_inst_pc, + std::move(parameters)); } else { if (log) { StreamString call_target_desc; @@ -3810,8 +3821,8 @@ SymbolFileDWARF::CollectCallEdges(ModuleSP module, DWARFDIE function_die) { LLDB_LOG(log, "CollectCallEdges: Found indirect call target: {0}", call_target_desc.GetString()); } - edge = std::make_unique(*call_target, return_pc, - std::move(parameters)); + edge = std::make_unique( + *call_target, return_pc, call_inst_pc, std::move(parameters)); } if (log && parameters.size()) { diff --git a/lldb/source/Symbol/Function.cpp b/lldb/source/Symbol/Function.cpp index e1d5a720bcc33..0b1f6a8c3a3d5 100644 --- a/lldb/source/Symbol/Function.cpp +++ b/lldb/source/Symbol/Function.cpp @@ -120,27 +120,36 @@ size_t InlineFunctionInfo::MemorySize() const { /// @name Call site related structures /// @{ -lldb::addr_t CallEdge::GetReturnPCAddress(Function &caller, - Target &target) const { +lldb::addr_t CallEdge::GetLoadAddress(lldb::addr_t unresolved_pc, + Function &caller, Target &target) { Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_STEP)); const Address &caller_start_addr = caller.GetAddressRange().GetBaseAddress(); ModuleSP caller_module_sp = caller_start_addr.GetModule(); if (!caller_module_sp) { - LLDB_LOG(log, "GetReturnPCAddress: cannot get Module for caller"); + LLDB_LOG(log, "GetLoadAddress: cannot get Module 
for caller"); return LLDB_INVALID_ADDRESS; } SectionList *section_list = caller_module_sp->GetSectionList(); if (!section_list) { - LLDB_LOG(log, "GetReturnPCAddress: cannot get SectionList for Module"); + LLDB_LOG(log, "GetLoadAddress: cannot get SectionList for Module"); return LLDB_INVALID_ADDRESS; } - Address return_pc_addr = Address(return_pc, section_list); - lldb::addr_t ret_addr = return_pc_addr.GetLoadAddress(&target); - return ret_addr; + Address the_addr = Address(unresolved_pc, section_list); + lldb::addr_t load_addr = the_addr.GetLoadAddress(&target); + return load_addr; +} + +lldb::addr_t CallEdge::GetReturnPCAddress(Function &caller, + Target &target) const { + return GetLoadAddress(return_pc, caller, target); +} + +lldb::addr_t CallEdge::GetCallInstPC(Function &caller, Target &target) const { + return GetLoadAddress(call_inst_pc, caller, target); } void DirectCallEdge::ParseSymbolFileAndResolve(ModuleList &images) { diff --git a/lldb/source/Target/ABI.cpp b/lldb/source/Target/ABI.cpp index cb7eca280a391..4320eb93adfc9 100644 --- a/lldb/source/Target/ABI.cpp +++ b/lldb/source/Target/ABI.cpp @@ -97,10 +97,8 @@ ValueObjectSP ABI::GetReturnValueObject(Thread &thread, CompilerType &ast_type, if (!persistent_expression_state) return {}; - auto prefix = persistent_expression_state->GetPersistentVariablePrefix(); ConstString persistent_variable_name = - persistent_expression_state->GetNextPersistentVariableName(target, - prefix); + persistent_expression_state->GetNextPersistentVariableName(); lldb::ValueObjectSP const_valobj_sp; diff --git a/lldb/source/Target/StackFrameList.cpp b/lldb/source/Target/StackFrameList.cpp index e8e72203e204c..1a75986a80cf4 100644 --- a/lldb/source/Target/StackFrameList.cpp +++ b/lldb/source/Target/StackFrameList.cpp @@ -236,13 +236,17 @@ void StackFrameList::GetOnlyConcreteFramesUpTo(uint32_t end_idx, m_frames.resize(num_frames); } +/// A sequence of calls that comprise some portion of a backtrace. Each frame +/// is represented as a pair of a callee (Function *) and an address within the +/// callee. +using CallSequence = std::vector>; + /// Find the unique path through the call graph from \p begin (with return PC /// \p return_pc) to \p end. On success this path is stored into \p path, and /// on failure \p path is unchanged. static void FindInterveningFrames(Function &begin, Function &end, ExecutionContext &exe_ctx, Target &target, - addr_t return_pc, - std::vector &path, + addr_t return_pc, CallSequence &path, ModuleList &images, Log *log) { LLDB_LOG(log, "Finding frames between {0} and {1}, retn-pc={2:x}", begin.GetDisplayName(), end.GetDisplayName(), return_pc); @@ -275,24 +279,27 @@ static void FindInterveningFrames(Function &begin, Function &end, // Fully explore the set of functions reachable from the first edge via tail // calls in order to detect ambiguous executions. 
struct DFS { - std::vector active_path = {}; - std::vector solution_path = {}; + CallSequence active_path = {}; + CallSequence solution_path = {}; llvm::SmallPtrSet visited_nodes = {}; bool ambiguous = false; Function *end; ModuleList &images; + Target &target; ExecutionContext &context; - DFS(Function *end, ModuleList &images, ExecutionContext &context) - : end(end), images(images), context(context) {} + DFS(Function *end, ModuleList &images, Target &target, + ExecutionContext &context) + : end(end), images(images), target(target), context(context) {} - void search(Function &first_callee, std::vector &path) { - dfs(first_callee); + void search(CallEdge &first_edge, Function &first_callee, + CallSequence &path) { + dfs(first_edge, first_callee); if (!ambiguous) path = std::move(solution_path); } - void dfs(Function &callee) { + void dfs(CallEdge &current_edge, Function &callee) { // Found a path to the target function. if (&callee == end) { if (solution_path.empty()) @@ -312,13 +319,16 @@ static void FindInterveningFrames(Function &begin, Function &end, } // Search the calls made from this callee. - active_path.push_back(&callee); + active_path.emplace_back(&callee, LLDB_INVALID_ADDRESS); for (const auto &edge : callee.GetTailCallingEdges()) { Function *next_callee = edge->GetCallee(images, context); if (!next_callee) continue; - dfs(*next_callee); + addr_t tail_call_pc = edge->GetCallInstPC(callee, target); + active_path.back().second = tail_call_pc; + + dfs(*edge, *next_callee); if (ambiguous) return; } @@ -326,7 +336,7 @@ static void FindInterveningFrames(Function &begin, Function &end, } }; - DFS(&end, images, exe_ctx).search(*first_callee, path); + DFS(&end, images, target, exe_ctx).search(*first_edge, *first_callee, path); } /// Given that \p next_frame will be appended to the frame list, synthesize @@ -379,7 +389,7 @@ void StackFrameList::SynthesizeTailCallFrames(StackFrame &next_frame) { // Try to find the unique sequence of (tail) calls which led from next_frame // to prev_frame. - std::vector path; + CallSequence path; addr_t return_pc = next_reg_ctx_sp->GetPC(); Target &target = *target_sp.get(); ModuleList &images = next_frame.CalculateTarget()->GetImages(); @@ -389,14 +399,17 @@ void StackFrameList::SynthesizeTailCallFrames(StackFrame &next_frame) { path, images, log); // Push synthetic tail call frames. - for (Function *callee : llvm::reverse(path)) { + for (auto calleeInfo : llvm::reverse(path)) { + Function *callee = calleeInfo.first; uint32_t frame_idx = m_frames.size(); uint32_t concrete_frame_idx = next_frame.GetConcreteFrameIndex(); addr_t cfa = LLDB_INVALID_ADDRESS; bool cfa_is_valid = false; - addr_t pc = - callee->GetAddressRange().GetBaseAddress().GetLoadAddress(&target); - constexpr bool behaves_like_zeroth_frame = false; + addr_t pc = calleeInfo.second; + // We do not want to subtract 1 from this PC, as it's the actual address + // of the tail-calling branch instruction. This address is provided by the + // compiler via DW_AT_call_pc. + constexpr bool behaves_like_zeroth_frame = true; SymbolContext sc; callee->CalculateSymbolContext(&sc); auto synth_frame = std::make_shared( @@ -404,7 +417,7 @@ void StackFrameList::SynthesizeTailCallFrames(StackFrame &next_frame) { cfa_is_valid, pc, StackFrame::Kind::Artificial, behaves_like_zeroth_frame, &sc); m_frames.push_back(synth_frame); - LLDB_LOG(log, "Pushed frame {0}", callee->GetDisplayName()); + LLDB_LOG(log, "Pushed frame {0} at {1:x}", callee->GetDisplayName(), pc); } // If any frames were created, adjust next_frame's index.
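The StackFrameList hunk above is the consumer of the new DW_AT_call_pc plumbing: the depth-first search over tail-calling edges no longer records only which function was tail-called, it pairs each callee with the address of the branch instruction that left it, and that address is later used verbatim as the artificial frame's PC (hence behaves_like_zeroth_frame = true, since no return-address adjustment applies). Below is a minimal, self-contained sketch of that bookkeeping under simplified assumptions; the Node and Edge types, the FindPath helper, and the example addresses are illustrative stand-ins rather than LLDB API, and the patch's ambiguity detection and visited-set pruning are omitted.

#include <cstdint>
#include <utility>
#include <vector>

struct Node;
struct Edge {
  Node *callee;          // function reached by this tail call
  uint64_t call_inst_pc; // address of the branch, cf. DW_AT_call_pc
};
struct Node {
  std::vector<Edge> tail_calls;
};

// Each entry pairs a callee with the PC of the tail-calling branch that
// leaves it, mirroring the CallSequence typedef introduced by the patch.
using CallSequence = std::vector<std::pair<Node *, uint64_t>>;

// Depth-first search from `from` to `target`; on success `path` holds the
// intervening functions, caller-first, each with the PC of the branch taken
// out of it.
static bool FindPath(Node &from, Node &target, CallSequence &path) {
  if (&from == &target)
    return true;
  path.emplace_back(&from, UINT64_MAX); // placeholder until an edge is taken
  for (const Edge &edge : from.tail_calls) {
    path.back().second = edge.call_inst_pc; // record the branch out of `from`
    if (FindPath(*edge.callee, target, path))
      return true;
  }
  path.pop_back(); // dead end: no outgoing edge reaches the target
  return false;
}

int main() {
  // func2 tail-calls func3 at 0x1000; func3 tail-calls sink at 0x2000.
  Node sink, func3, func2;
  func2.tail_calls = {{&func3, 0x1000}};
  func3.tail_calls = {{&sink, 0x2000}};
  CallSequence path;
  // On success, path is (func2, 0x1000), (func3, 0x2000).
  return FindPath(func2, sink, path) ? 0 : 1;
}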
diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 2bb53bcd4230f..e2c808120877c 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -113,6 +113,8 @@ Target::Target(Debugger &debugger, const ArchSpec &target_arch, target_arch.GetArchitectureName(), target_arch.GetTriple().getTriple().c_str()); } + + UpdateLaunchInfoFromProperties(); } Target::~Target() { @@ -3342,16 +3344,13 @@ enum { class TargetOptionValueProperties : public OptionValueProperties { public: - TargetOptionValueProperties(ConstString name) - : OptionValueProperties(name), m_target(nullptr), m_got_host_env(false) {} + TargetOptionValueProperties(ConstString name) : OptionValueProperties(name) {} // This constructor is used when creating TargetOptionValueProperties when it // is part of a new lldb_private::Target instance. It will copy all current // global property values as needed - TargetOptionValueProperties(Target *target, - const TargetPropertiesSP &target_properties_sp) - : OptionValueProperties(*target_properties_sp->GetValueProperties()), - m_target(target), m_got_host_env(false) {} + TargetOptionValueProperties(const TargetPropertiesSP &target_properties_sp) + : OptionValueProperties(*target_properties_sp->GetValueProperties()) {} const Property *GetPropertyAtIndex(const ExecutionContext *exe_ctx, bool will_modify, @@ -3359,9 +3358,6 @@ class TargetOptionValueProperties : public OptionValueProperties { // When getting the value for a key from the target options, we will always // try and grab the setting from the current target if there is one. Else // we just use the one from this instance. - if (idx == ePropertyEnvVars) - GetHostEnvironmentIfNeeded(); - if (exe_ctx) { Target *target = exe_ctx->GetTargetPtr(); if (target) { @@ -3374,41 +3370,6 @@ class TargetOptionValueProperties : public OptionValueProperties { } return ProtectedGetPropertyAtIndex(idx); } - - lldb::TargetSP GetTargetSP() { return m_target->shared_from_this(); } - -protected: - void GetHostEnvironmentIfNeeded() const { - if (!m_got_host_env) { - if (m_target) { - m_got_host_env = true; - const uint32_t idx = ePropertyInheritEnv; - if (GetPropertyAtIndexAsBoolean( - nullptr, idx, g_target_properties[idx].default_uint_value != 0)) { - PlatformSP platform_sp(m_target->GetPlatform()); - if (platform_sp) { - Environment env = platform_sp->GetEnvironment(); - OptionValueDictionary *env_dict = - GetPropertyAtIndexAsOptionValueDictionary(nullptr, - ePropertyEnvVars); - if (env_dict) { - const bool can_replace = false; - for (const auto &KV : env) { - // Don't allow existing keys to be replaced with ones we get - // from the platform environment - env_dict->SetValueForKey( - ConstString(KV.first()), - OptionValueSP(new OptionValueString(KV.second.c_str())), - can_replace); - } - } - } - } - } - } - } - Target *m_target; - mutable bool m_got_host_env; }; // TargetProperties @@ -3435,10 +3396,10 @@ TargetExperimentalProperties::TargetExperimentalProperties() // TargetProperties TargetProperties::TargetProperties(Target *target) - : Properties(), m_launch_info() { + : Properties(), m_launch_info(), m_target(target) { if (target) { m_collection_sp = std::make_shared( - target, Target::GetGlobalProperties()); + Target::GetGlobalProperties()); // Set callbacks to update launch_info whenever "settins set" updated any // of these properties @@ -3448,6 +3409,10 @@ TargetProperties::TargetProperties(Target *target) ePropertyRunArgs, [this] { RunArgsValueChangedCallback(); }); 
m_collection_sp->SetValueChangedCallback( ePropertyEnvVars, [this] { EnvVarsValueChangedCallback(); }); + m_collection_sp->SetValueChangedCallback( + ePropertyUnsetEnvVars, [this] { EnvVarsValueChangedCallback(); }); + m_collection_sp->SetValueChangedCallback( + ePropertyInheritEnv, [this] { EnvVarsValueChangedCallback(); }); m_collection_sp->SetValueChangedCallback( ePropertyInputPath, [this] { InputPathValueChangedCallback(); }); m_collection_sp->SetValueChangedCallback( @@ -3468,18 +3433,6 @@ TargetProperties::TargetProperties(Target *target) ConstString("Experimental settings - setting these won't produce " "errors if the setting is not present."), true, m_experimental_properties_up->GetValueProperties()); - - // Update m_launch_info once it was created - Arg0ValueChangedCallback(); - RunArgsValueChangedCallback(); - // EnvVarsValueChangedCallback(); // FIXME: cause segfault in - // Target::GetPlatform() - InputPathValueChangedCallback(); - OutputPathValueChangedCallback(); - ErrorPathValueChangedCallback(); - DetachOnErrorValueChangedCallback(); - DisableASLRValueChangedCallback(); - DisableSTDIOValueChangedCallback(); } else { m_collection_sp = std::make_shared(ConstString("target")); @@ -3498,6 +3451,18 @@ TargetProperties::TargetProperties(Target *target) TargetProperties::~TargetProperties() = default; +void TargetProperties::UpdateLaunchInfoFromProperties() { + Arg0ValueChangedCallback(); + RunArgsValueChangedCallback(); + EnvVarsValueChangedCallback(); + InputPathValueChangedCallback(); + OutputPathValueChangedCallback(); + ErrorPathValueChangedCallback(); + DetachOnErrorValueChangedCallback(); + DisableASLRValueChangedCallback(); + DisableSTDIOValueChangedCallback(); +} + bool TargetProperties::GetInjectLocalVariables( ExecutionContext *exe_ctx) const { const Property *exp_property = m_collection_sp->GetPropertyAtIndex( @@ -3639,19 +3604,43 @@ void TargetProperties::SetRunArguments(const Args &args) { m_launch_info.GetArguments() = args; } +Environment TargetProperties::ComputeEnvironment() const { + Environment env; + + if (m_target && + m_collection_sp->GetPropertyAtIndexAsBoolean( + nullptr, ePropertyInheritEnv, + g_target_properties[ePropertyInheritEnv].default_uint_value != 0)) { + if (auto platform_sp = m_target->GetPlatform()) { + Environment platform_env = platform_sp->GetEnvironment(); + for (const auto &KV : platform_env) + env[KV.first()] = KV.second; + } + } + + Args property_unset_env; + m_collection_sp->GetPropertyAtIndexAsArgs(nullptr, ePropertyUnsetEnvVars, + property_unset_env); + for (const auto &var : property_unset_env) + env.erase(var.ref()); + + Args property_env; + m_collection_sp->GetPropertyAtIndexAsArgs(nullptr, ePropertyEnvVars, + property_env); + for (const auto &KV : Environment(property_env)) + env[KV.first()] = KV.second; + + return env; +} + Environment TargetProperties::GetEnvironment() const { - // TODO: Get rid of the Args intermediate step - Args env; - const uint32_t idx = ePropertyEnvVars; - m_collection_sp->GetPropertyAtIndexAsArgs(nullptr, idx, env); - return Environment(env); + return ComputeEnvironment(); } void TargetProperties::SetEnvironment(Environment env) { // TODO: Get rid of the Args intermediate step const uint32_t idx = ePropertyEnvVars; m_collection_sp->SetPropertyAtIndexFromArgs(nullptr, idx, Args(env)); - m_launch_info.GetEnvironment() = std::move(env); } bool TargetProperties::GetSkipPrologue() const { @@ -3969,7 +3958,7 @@ void TargetProperties::RunArgsValueChangedCallback() { } void 
TargetProperties::EnvVarsValueChangedCallback() { - m_launch_info.GetEnvironment() = GetEnvironment(); + m_launch_info.GetEnvironment() = ComputeEnvironment(); } void TargetProperties::InputPathValueChangedCallback() { diff --git a/lldb/source/Target/TargetProperties.td b/lldb/source/Target/TargetProperties.td index ce08e44acb9bf..c8dd0a12315e0 100644 --- a/lldb/source/Target/TargetProperties.td +++ b/lldb/source/Target/TargetProperties.td @@ -79,8 +79,11 @@ let Definition = "target" in { DefaultStringValue<"">, Desc<"A list containing all the arguments to be passed to the executable when it is run. Note that this does NOT include the argv[0] which is in target.arg0.">; def EnvVars: Property<"env-vars", "Dictionary">, - DefaultUnsignedValue<16>, - Desc<"A list of all the environment variables to be passed to the executable's environment, and their values.">; + ElementType<"String">, + Desc<"A list of user provided environment variables to be passed to the executable's environment, and their values.">; + def UnsetEnvVars: Property<"unset-env-vars", "Array">, + ElementType<"String">, + Desc<"A list of environment variable names to be unset in the inferior's environment. This is most useful to unset some host environment variables when target.inherit-env is true. target.env-vars takes precedence over target.unset-env-vars.">; def InheritEnv: Property<"inherit-env", "Boolean">, DefaultTrue, Desc<"Inherit the environment from the process that is running LLDB.">; @@ -140,7 +143,7 @@ let Definition = "target" in { Desc<"Expressions that crash will show up in crash logs if the host system supports executable specific crash log strings and this setting is set to true.">; def TrapHandlerNames: Property<"trap-handler-names", "Array">, Global, - DefaultUnsignedValue<16>, + ElementType<"String">, Desc<"A list of trap handler function names, e.g. a common Unix user process one is _sigtramp.">; def DisplayRuntimeSupportValues: Property<"display-runtime-support-values", "Boolean">, DefaultFalse, @@ -164,7 +167,7 @@ let Definition = "process" in { DefaultFalse, Desc<"Disable reading and caching of memory in fixed-size units.">; def ExtraStartCommand: Property<"extra-startup-command", "Array">, - DefaultUnsignedValue<16>, + ElementType<"String">, Desc<"A list containing extra commands understood by the particular process plugin used. For instance, to turn on debugserver logging set this to 'QSetLogging:bitmask=LOG_DEFAULT;'">; def IgnoreBreakpointsInExpressions: Property<"ignore-breakpoints-in-expressions", "Boolean">, Global, diff --git a/lldb/source/Target/ThreadPlanStepOverRange.cpp b/lldb/source/Target/ThreadPlanStepOverRange.cpp index efffcb165018f..37795176119a5 100644 --- a/lldb/source/Target/ThreadPlanStepOverRange.cpp +++ b/lldb/source/Target/ThreadPlanStepOverRange.cpp @@ -171,6 +171,10 @@ bool ThreadPlanStepOverRange::ShouldStop(Event *event_ptr) { const SymbolContext &older_context = older_frame_sp->GetSymbolContext(eSymbolContextEverything); if (IsEquivalentContext(older_context)) { + // If we have the next-branch-breakpoint in the range, we can just + // rely on that breakpoint to trigger once we return to the range. 
+ if (m_next_branch_bp_sp) + return false; new_plan_sp = m_thread.QueueThreadPlanForStepOutNoShouldStop( false, nullptr, true, stop_others, eVoteNo, eVoteNoOpinion, 0, m_status, true); diff --git a/lldb/test/API/commands/expression/import-std-module/empty-module/TestEmptyStdModule.py b/lldb/test/API/commands/expression/import-std-module/empty-module/TestEmptyStdModule.py index 76e79df5cd1c8..2b1cb100a3251 100644 --- a/lldb/test/API/commands/expression/import-std-module/empty-module/TestEmptyStdModule.py +++ b/lldb/test/API/commands/expression/import-std-module/empty-module/TestEmptyStdModule.py @@ -15,6 +15,7 @@ class ImportStdModule(TestBase): # but we still add the libc++ category so that this test is only run in # test configurations where libc++ is actually supposed to be tested. @add_test_categories(["libc++"]) + @skipIfRemote @skipIf(compiler=no_match("clang")) def test(self): self.build() diff --git a/lldb/test/API/commands/expression/result_numbering/Makefile b/lldb/test/API/commands/expression/result_numbering/Makefile new file mode 100644 index 0000000000000..695335e068c0c --- /dev/null +++ b/lldb/test/API/commands/expression/result_numbering/Makefile @@ -0,0 +1,4 @@ +C_SOURCES := main.c +CFLAGS_EXTRAS := -std=c99 + +include Makefile.rules diff --git a/lldb/test/API/commands/expression/result_numbering/TestResultNumbering.py b/lldb/test/API/commands/expression/result_numbering/TestResultNumbering.py new file mode 100644 index 0000000000000..cd6b9c43775c7 --- /dev/null +++ b/lldb/test/API/commands/expression/result_numbering/TestResultNumbering.py @@ -0,0 +1,48 @@ +""" +Make sure running internal expressions doesn't +influence the result variable numbering. +""" + + + +import lldb +import lldbsuite.test.lldbutil as lldbutil +from lldbsuite.test.lldbtest import * + + +class TestExpressionResultNumbering(TestBase): + + mydir = TestBase.compute_mydir(__file__) + + NO_DEBUG_INFO_TESTCASE = True + + def test_sample_rename_this(self): + self.build() + self.main_source_file = lldb.SBFileSpec("main.c") + self.do_numbering_test() + + def do_numbering_test(self): + (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(self, + "Set a breakpoint here", self.main_source_file) + + bkpt = target.BreakpointCreateBySourceRegex("Add conditions to this breakpoint", + self.main_source_file) + self.assertEqual(bkpt.GetNumLocations(), 1, "Set the breakpoint") + + bkpt.SetCondition("call_me(value) < 6") + + # Get the number of the last expression: + result = thread.frames[0].EvaluateExpression("call_me(200)") + self.assertTrue(result.GetError().Success(), "Our expression succeeded") + name = result.GetName() + ordinal = int(name[1:]) + + process.Continue() + + # The condition evaluation had to run a 4 expressions, but we haven't + # run any user expressions. 
+ result = thread.frames[0].EvaluateExpression("call_me(200)") + self.assertTrue(result.GetError().Success(), "Our expression succeeded the second time") + after_name = result.GetName() + after_ordinal = int(after_name[1:]) + self.assertEqual(ordinal + 1, after_ordinal) diff --git a/lldb/test/API/commands/expression/result_numbering/main.c b/lldb/test/API/commands/expression/result_numbering/main.c new file mode 100644 index 0000000000000..0f5853c99fb1d --- /dev/null +++ b/lldb/test/API/commands/expression/result_numbering/main.c @@ -0,0 +1,18 @@ +#include + +int +call_me(int input) +{ + return input; +} + +int +main() +{ + int value = call_me(0); // Set a breakpoint here + while (value < 10) + { + printf("Add conditions to this breakpoint: %d.\n", value++); + } + return 0; +} diff --git a/lldb/test/API/commands/settings/TestSettings.py b/lldb/test/API/commands/settings/TestSettings.py index ffb194fda808f..c0cdc085f1297 100644 --- a/lldb/test/API/commands/settings/TestSettings.py +++ b/lldb/test/API/commands/settings/TestSettings.py @@ -204,10 +204,15 @@ def test_disassembler_settings(self): @skipIfDarwinEmbedded # debugserver on ios etc can't write files def test_run_args_and_env_vars(self): + self.do_test_run_args_and_env_vars(use_launchsimple=False) + + @skipIfDarwinEmbedded # debugserver on ios etc can't write files + def test_launchsimple_args_and_env_vars(self): + self.do_test_run_args_and_env_vars(use_launchsimple=True) + + def do_test_run_args_and_env_vars(self, use_launchsimple): """Test that run-args and env-vars are passed to the launched process.""" self.build() - exe = self.getBuildArtifact("a.out") - self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET) # Set the run-args and the env-vars. # And add hooks to restore the settings during tearDown(). @@ -218,7 +223,29 @@ def test_run_args_and_env_vars(self): self.addTearDownHook( lambda: self.runCmd("settings clear target.env-vars")) - self.runCmd("process launch --working-dir '{0}'".format(self.get_process_working_directory()), + exe = self.getBuildArtifact("a.out") + self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET) + + target = self.dbg.GetTargetAtIndex(0) + launch_info = target.GetLaunchInfo() + found_env_var = False + for i in range(0, launch_info.GetNumEnvironmentEntries()): + if launch_info.GetEnvironmentEntryAtIndex(i) == "MY_ENV_VAR=YES": + found_env_var = True + break + self.assertTrue(found_env_var, + "MY_ENV_VAR was not set in LaunchInfo object") + + self.expect( + 'target show-launch-environment', + substrs=["MY_ENV_VAR=YES"]) + + wd = self.get_process_working_directory() + if use_launchsimple: + process = target.LaunchSimple(None, None, wd) + self.assertTrue(process) + else: + self.runCmd("process launch --working-dir '{0}'".format(wd), RUN_SUCCEEDED) # Read the output file produced by running the program. @@ -233,11 +260,47 @@ def test_run_args_and_env_vars(self): "argv[3] matches", "Environment variable 'MY_ENV_VAR' successfully passed."]) + # Check that env-vars overrides unset-env-vars. + self.runCmd('settings set target.unset-env-vars MY_ENV_VAR') + + self.expect( + 'target show-launch-environment', + 'env-vars overrides unset-env-vars', + substrs=["MY_ENV_VAR=YES"]) + + wd = self.get_process_working_directory() + if use_launchsimple: + process = target.LaunchSimple(None, None, wd) + self.assertTrue(process) + else: + self.runCmd("process launch --working-dir '{0}'".format(wd), + RUN_SUCCEEDED) + + # Read the output file produced by running the program.
+ output = lldbutil.read_file_from_process_wd(self, "output2.txt") + + self.expect( + output, + exe=False, + substrs=[ + "Environment variable 'MY_ENV_VAR' successfully passed."]) + @skipIfRemote # it doesn't make sense to send host env to remote target + @skipIf(oslist=["windows"]) def test_pass_host_env_vars(self): """Test that the host env vars are passed to the launched process.""" self.build() + # Set some host environment variables now. + os.environ["MY_HOST_ENV_VAR1"] = "VAR1" + os.environ["MY_HOST_ENV_VAR2"] = "VAR2" + + # This is the function to unset the two env variables set above. + def unset_env_variables(): + os.environ.pop("MY_HOST_ENV_VAR1") + os.environ.pop("MY_HOST_ENV_VAR2") + self.addTearDownHook(unset_env_variables) + exe = self.getBuildArtifact("a.out") self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET) @@ -247,16 +310,33 @@ def test_pass_host_env_vars(self): "Default inherit-env is 'true'", startstr="target.inherit-env (boolean) = true") - # Set some host environment variables now. - os.environ["MY_HOST_ENV_VAR1"] = "VAR1" - os.environ["MY_HOST_ENV_VAR2"] = "VAR2" + self.expect( + 'target show-launch-environment', + 'Host environment is passed correctly', + substrs=['MY_HOST_ENV_VAR1=VAR1', 'MY_HOST_ENV_VAR2=VAR2']) + self.runCmd("process launch --working-dir '{0}'".format(self.get_process_working_directory()), + RUN_SUCCEEDED) - # This is the function to unset the two env variables set above. - def unset_env_variables(): - os.environ.pop("MY_HOST_ENV_VAR1") - os.environ.pop("MY_HOST_ENV_VAR2") + # Read the output file produced by running the program. + output = lldbutil.read_file_from_process_wd(self, "output1.txt") + + self.expect( + output, + exe=False, + substrs=[ + "The host environment variable 'MY_HOST_ENV_VAR1' successfully passed.", + "The host environment variable 'MY_HOST_ENV_VAR2' successfully passed."]) + + # Now test that we can prevent the inferior from inheriting the + # environment. + self.runCmd('settings set target.inherit-env false') + + self.expect( + 'target show-launch-environment', + 'target.inherit-env affects `target show-launch-environment`', + matching=False, + substrs = ['MY_HOST_ENV_VAR1=VAR1', 'MY_HOST_ENV_VAR2=VAR2']) - self.addTearDownHook(unset_env_variables) self.runCmd("process launch --working-dir '{0}'".format(self.get_process_working_directory()), RUN_SUCCEEDED) @@ -266,10 +346,42 @@ def unset_env_variables(): self.expect( output, exe=False, + matching=False, substrs=[ "The host environment variable 'MY_HOST_ENV_VAR1' successfully passed.", "The host environment variable 'MY_HOST_ENV_VAR2' successfully passed."]) + # Now test that we can unset variables from the inherited environment. + self.runCmd('settings set target.inherit-env true') + self.runCmd('settings set target.unset-env-vars MY_HOST_ENV_VAR1') + self.runCmd("process launch --working-dir '{0}'".format(self.get_process_working_directory()), + RUN_SUCCEEDED) + + # Read the output file produced by running the program. 
+ output = lldbutil.read_file_from_process_wd(self, "output1.txt") + + self.expect( + 'target show-launch-environment', + 'MY_HOST_ENV_VAR1 is unset, it shouldn\'t be in `target show-launch-environment`', + matching=False, + substrs = ['MY_HOST_ENV_VAR1=VAR1']) + self.expect( + 'target show-launch-environment', + 'MY_HOST_ENV_VAR2 shouldn be in `target show-launch-environment`', + substrs = ['MY_HOST_ENV_VAR2=VAR2']) + + self.expect( + output, + exe=False, + matching=False, + substrs=[ + "The host environment variable 'MY_HOST_ENV_VAR1' successfully passed."]) + self.expect( + output, + exe=False, + substrs=[ + "The host environment variable 'MY_HOST_ENV_VAR2' successfully passed."]) + @skipIfDarwinEmbedded # debugserver on ios etc can't write files def test_set_error_output_path(self): """Test that setting target.error/output-path for the launched process works.""" diff --git a/lldb/test/API/commands/statistics/basic/TestStats.py b/lldb/test/API/commands/statistics/basic/TestStats.py index 76bc224338488..be028199fade1 100644 --- a/lldb/test/API/commands/statistics/basic/TestStats.py +++ b/lldb/test/API/commands/statistics/basic/TestStats.py @@ -21,6 +21,20 @@ def test(self): self.expect("statistics dump", substrs=['expr evaluation successes : 1\n', 'expr evaluation failures : 0\n']) + self.expect("statistics enable") + # Doesn't parse. + self.expect("expr doesnt_exist", error=True, + substrs=["undeclared identifier 'doesnt_exist'"]) + # Doesn't successfully execute. + self.expect("expr int *i = nullptr; *i", error=True) + # Interpret an integer as an array with 3 elements is also a failure. + self.expect("expr -Z 3 -- 1", error=True, + substrs=["expression cannot be used with --element-count"]) + self.expect("statistics disable") + # We should have gotten 3 new failures and the previous success. + self.expect("statistics dump", substrs=['expr evaluation successes : 1\n', + 'expr evaluation failures : 3\n']) + # 'frame var' with disabled statistics shouldn't change stats. self.expect("frame var", substrs=['27']) diff --git a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/main.c b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/main.c index 7ec3ded67b74f..f6ccb031c7445 100644 --- a/lldb/test/API/functionalities/breakpoint/breakpoint_locations/main.c +++ b/lldb/test/API/functionalities/breakpoint/breakpoint_locations/main.c @@ -14,9 +14,9 @@ func_inlined (void) { static int func_inline_call_count = 0; printf ("Called func_inlined.\n"); - ++func_inline_call_count; + ++func_inline_call_count; // Set break point at this line. printf ("Returning func_inlined call count: %d.\n", func_inline_call_count); - return func_inline_call_count; // Set break point at this line. 
+ return func_inline_call_count; } extern int func_inlined (void); diff --git a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/hardware_breakpoint_on_multiple_threads/TestHWBreakMultiThread.py b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/hardware_breakpoint_on_multiple_threads/TestHWBreakMultiThread.py index f778b8e39e72c..a4a1d9effbe19 100644 --- a/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/hardware_breakpoint_on_multiple_threads/TestHWBreakMultiThread.py +++ b/lldb/test/API/functionalities/breakpoint/hardware_breakpoints/hardware_breakpoint_on_multiple_threads/TestHWBreakMultiThread.py @@ -32,19 +32,21 @@ def test_hw_break_set_disable_multi_thread_linux(self): self.setTearDownCleanup() self.break_multi_thread('disable', False) # llvm.org/PR44659 - # LLDB on darwin supports hardware breakpoints for arm, aarch64, x86_64 and - # i386 architectures. + # LLDB on darwin supports hardware breakpoints for x86_64 and i386 + # architectures. @skipUnlessDarwin @skipIfOutOfTreeDebugserver + @skipIfDarwinEmbedded def test_hw_break_set_delete_multi_thread_macos(self): self.build() self.setTearDownCleanup() self.break_multi_thread('delete') - # LLDB on darwin supports hardware breakpoints for arm, aarch64, x86_64 and - # i386 architectures. + # LLDB on darwin supports hardware breakpoints for x86_64 and i386 + # architectures. @skipUnlessDarwin @skipIfOutOfTreeDebugserver + @skipIfDarwinEmbedded def test_hw_break_set_disable_multi_thread_macos(self): self.build() self.setTearDownCleanup() diff --git a/lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py b/lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py index 74f2fbb0c1a01..61e4171131013 100644 --- a/lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py +++ b/lldb/test/API/functionalities/breakpoint/require_hw_breakpoints/TestRequireHWBreakpoints.py @@ -19,7 +19,6 @@ def supports_hw_breakpoints(self): CURRENT_EXECUTABLE_SET) self.runCmd("breakpoint set -b main --hardware") self.runCmd("run") - print(self.res.GetOutput()) if 'stopped' in self.res.GetOutput(): return 'Hardware breakpoints are supported' return None diff --git a/lldb/test/API/functionalities/data-formatter/cstring-utf8-summary/Makefile b/lldb/test/API/functionalities/data-formatter/cstring-utf8-summary/Makefile new file mode 100644 index 0000000000000..99998b20bcb05 --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/cstring-utf8-summary/Makefile @@ -0,0 +1,3 @@ +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/functionalities/data-formatter/cstring-utf8-summary/TestCstringUnicode.py b/lldb/test/API/functionalities/data-formatter/cstring-utf8-summary/TestCstringUnicode.py new file mode 100644 index 0000000000000..c05e9e9b06ba7 --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/cstring-utf8-summary/TestCstringUnicode.py @@ -0,0 +1,18 @@ +# coding=utf8 + +import lldb +from lldbsuite.test.lldbtest import * +import lldbsuite.test.lldbutil as lldbutil + + +class CstringUnicodeTestCase(TestBase): + + mydir = TestBase.compute_mydir(__file__) + NO_DEBUG_INFO_TESTCASE = True + + def test_cstring_unicode(self): + self.build() + lldbutil.run_to_source_breakpoint(self, "// break here", + lldb.SBFileSpec("main.cpp", False)) + self.expect_expr("s", result_summary='"🔥"') + self.expect_expr("(const char*)s", result_summary='"🔥"') diff --git 
a/lldb/test/API/functionalities/data-formatter/cstring-utf8-summary/main.cpp b/lldb/test/API/functionalities/data-formatter/cstring-utf8-summary/main.cpp new file mode 100644 index 0000000000000..c1e8bcf242f4f --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/cstring-utf8-summary/main.cpp @@ -0,0 +1,4 @@ +int main() { + const char *s = u8"🔥"; + return 0; // break here +} diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/TestDataFormatterNSString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/TestDataFormatterNSString.py index 4ef0a5957503f..5b323f5614b21 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/TestDataFormatterNSString.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/TestDataFormatterNSString.py @@ -76,8 +76,8 @@ def rdar11106605_commands(self): self.expect('frame variable hebrew', substrs=['לילה טוב']) def nsstring_data_formatter_commands(self): - self.expect('frame variable str0 str1 str2 str3 str4 str5 str6 str8 str9 str10 str11 label1 label2 processName str12', - substrs=[ + self.expect('frame variable empty str0 str1 str2 str3 str4 str5 str6 str8 str9 str10 str11 label1 label2 processName str12', + substrs=['(NSString *) empty = ', ' @""', # '(NSString *) str0 = ',' @"255"', '(NSString *) str1 = ', ' @"A rather short ASCII NSString object is here"', '(NSString *) str2 = ', ' @"A rather short UTF8 NSString object is here"', @@ -104,6 +104,8 @@ def nsstring_data_formatter_commands(self): self.expect('expr -d run-target -- path', substrs=['usr/blah/stuff']) self.expect('frame variable path', substrs=['usr/blah/stuff']) + self.expect('expr -d run-target -- empty_path', substrs=['@""']) + self.expect('frame variable empty_path', substrs=['@""']) def nsstring_withNULs_commands(self): """Check that the NSString formatter supports embedded NULs in the text""" diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/main.m b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/main.m index 576e091db1bc0..0787561e4da39 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/main.m +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/nsstring/main.m @@ -17,6 +17,7 @@ int main (int argc, const char * argv[]) NSAutoreleasePool * pool = [[NSAutoreleasePool alloc] init]; + NSString *empty = @""; NSString *str0 = [[NSNumber numberWithUnsignedLongLong:0xFF] stringValue]; NSString *str1 = [NSString stringWithCString:"A rather short ASCII NSString object is here" encoding:NSASCIIStringEncoding]; NSString *str2 = [NSString stringWithUTF8String:"A rather short UTF8 NSString object is here"]; @@ -69,6 +70,7 @@ int main (int argc, const char * argv[]) NSArray *components = @[@"usr", @"blah", @"stuff"]; NSString *path = [NSString pathWithComponents: components]; + NSString *empty_path = [empty stringByDeletingPathExtension]; const unichar someOfTheseAreNUL[] = {'a',' ', 'v','e','r','y',' ', 'm','u','c','h',' ','b','o','r','i','n','g',' ','t','a','s','k', diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/unique_ptr/Makefile b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/unique_ptr/Makefile new file mode 100644 index 0000000000000..7e57f13aea55a --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/unique_ptr/Makefile @@ -0,0 +1,6 @@ +CXX_SOURCES := 
main.cpp + +USE_LIBCPP := 1 + +CXXFLAGS_EXTRAS := -std=c++14 +include Makefile.rules diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/unique_ptr/TestDataFormatterLibcxxUniquePtr.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/unique_ptr/TestDataFormatterLibcxxUniquePtr.py new file mode 100644 index 0000000000000..b91e494258bf9 --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/unique_ptr/TestDataFormatterLibcxxUniquePtr.py @@ -0,0 +1,47 @@ +""" +Test lldb data formatter for libc++ std::unique_ptr. +""" + + + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + +class LibcxUniquePtrDataFormatterTestCase(TestBase): + + mydir = TestBase.compute_mydir(__file__) + + @add_test_categories(["libc++"]) + def test_with_run_command(self): + """Test that that file and class static variables display correctly.""" + self.build() + + (self.target, self.process, _, bkpt) = lldbutil.run_to_source_breakpoint(self, '// break here', + lldb.SBFileSpec("main.cpp", False)) + + self.expect("frame variable up_empty", + substrs=['(std::unique_ptr >) up_empty = nullptr {', + '__value_ = ', + '}']) + + self.expect("frame variable up_int", + substrs=['(std::unique_ptr >) up_int = 10 {', + '__value_ = ', + '}']) + + self.expect("frame variable up_int_ref", + substrs=['(std::unique_ptr > &) up_int_ref = 10: {', + '__value_ = ', + '}']) + + self.expect("frame variable up_int_ref_ref", + substrs=['(std::unique_ptr > &&) up_int_ref_ref = 10: {', + '__value_ = ', + '}']) + + self.expect("frame variable up_str", + substrs=['up_str = "hello" {', + '__value_ = ', + '}']) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/unique_ptr/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/unique_ptr/main.cpp new file mode 100644 index 0000000000000..4ccffe2a006d3 --- /dev/null +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/unique_ptr/main.cpp @@ -0,0 +1,13 @@ +#include +#include +#include + +int main() { + std::unique_ptr up_empty; + std::unique_ptr up_int = std::make_unique(10); + std::unique_ptr up_str = std::make_unique("hello"); + std::unique_ptr &up_int_ref = up_int; + std::unique_ptr &&up_int_ref_ref = std::make_unique(10); + + return 0; // break here +} diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/string/TestDataFormatterStdString.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/string/TestDataFormatterStdString.py index fa0e4d123984d..44d8ff26078a5 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/string/TestDataFormatterStdString.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/string/TestDataFormatterStdString.py @@ -60,8 +60,7 @@ def cleanup(): var_Q = self.frame().FindVariable('Q') var_uchar = self.frame().FindVariable('uchar') - # TODO: This is currently broken - # self.assertEqual(var_wempty.GetSummary(), 'L""', "wempty summary wrong") + self.assertEqual(var_wempty.GetSummary(), 'L""', "wempty summary wrong") self.assertEqual( var_s.GetSummary(), 'L"hello world! 
מזל טוב!"', "s summary wrong") diff --git a/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py b/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py index 40e29e614ad6d..8e84566d9f691 100644 --- a/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py +++ b/lldb/test/API/functionalities/inline-stepping/TestInlineStepping.py @@ -16,9 +16,6 @@ class TestInlineStepping(TestBase): @expectedFailureAll( compiler="icc", bugnumber="# Not really a bug. ICC combines two inlined functions.") - @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr32343") - @expectedFailureAll(archs=["aarch64"], oslist=["linux"], - bugnumber="llvm.org/pr44057") def test_with_python_api(self): """Test stepping over and into inlined functions.""" self.build() diff --git a/lldb/test/API/functionalities/inline-stepping/calling.cpp b/lldb/test/API/functionalities/inline-stepping/calling.cpp index 9982fbf42734f..49179ce7c9788 100644 --- a/lldb/test/API/functionalities/inline-stepping/calling.cpp +++ b/lldb/test/API/functionalities/inline-stepping/calling.cpp @@ -75,7 +75,7 @@ caller_trivial_1 () void caller_trivial_2 () { - inline_trivial_1 (); // In caller_trivial_2. + asm volatile ("nop"); inline_trivial_1 (); // In caller_trivial_2. inline_value += 1; // At increment in caller_trivial_2. } @@ -88,7 +88,7 @@ called_by_inline_trivial () void inline_trivial_1 () { - inline_trivial_2(); // In inline_trivial_1. + asm volatile ("nop"); inline_trivial_2(); // In inline_trivial_1. inline_value += 1; // At increment in inline_trivial_1. } diff --git a/lldb/test/API/functionalities/param_entry_vals/basic_entry_values_x86_64/Makefile b/lldb/test/API/functionalities/param_entry_vals/basic_entry_values_x86_64/Makefile index db8fa57abb910..ab505a6841262 100644 --- a/lldb/test/API/functionalities/param_entry_vals/basic_entry_values_x86_64/Makefile +++ b/lldb/test/API/functionalities/param_entry_vals/basic_entry_values_x86_64/Makefile @@ -1,3 +1,3 @@ CXX_SOURCES := main.cpp -CXXFLAGS_EXTRAS := -O2 -glldb -Xclang -femit-debug-entry-values +CXXFLAGS_EXTRAS := -O2 -glldb include Makefile.rules diff --git a/lldb/test/API/functionalities/process_crash_info/TestProcessCrashInfo.py b/lldb/test/API/functionalities/process_crash_info/TestProcessCrashInfo.py index d0f47de83eea4..6ef5018204fd8 100644 --- a/lldb/test/API/functionalities/process_crash_info/TestProcessCrashInfo.py +++ b/lldb/test/API/functionalities/process_crash_info/TestProcessCrashInfo.py @@ -69,6 +69,8 @@ def test_api(self): self.assertIn("pointer being freed was not allocated", stream.GetData()) + # dyld leaves permanent crash_info records when testing on device. 
+ @skipIfDarwinEmbedded def test_on_sane_process(self): """Test that lldb doesn't fetch the extended crash information dictionnary from a 'sane' stopped process.""" diff --git a/lldb/test/API/functionalities/tail_call_frames/unambiguous_sequence/main.cpp b/lldb/test/API/functionalities/tail_call_frames/unambiguous_sequence/main.cpp index c9ab74074f90e..559f8a6d66aa9 100644 --- a/lldb/test/API/functionalities/tail_call_frames/unambiguous_sequence/main.cpp +++ b/lldb/test/API/functionalities/tail_call_frames/unambiguous_sequence/main.cpp @@ -3,19 +3,28 @@ volatile int x; void __attribute__((noinline)) sink() { x++; //% self.filecheck("bt", "main.cpp", "-implicit-check-not=artificial") // CHECK: frame #0: 0x{{[0-9a-f]+}} a.out`sink() at main.cpp:[[@LINE-1]]:4 [opt] - // CHECK-NEXT: frame #1: 0x{{[0-9a-f]+}} a.out`func3{{.*}} [opt] [artificial] - // CHECK-NEXT: frame #2: 0x{{[0-9a-f]+}} a.out`func2{{.*}} [opt] - // CHECK-NEXT: frame #3: 0x{{[0-9a-f]+}} a.out`func1{{.*}} [opt] [artificial] + // CHECK-NEXT: frame #1: 0x{{[0-9a-f]+}} a.out`func3() at main.cpp:14:3 [opt] [artificial] + // CHECK-NEXT: frame #2: 0x{{[0-9a-f]+}} a.out`func2() {{.*}} [opt] + // CHECK-NEXT: frame #3: 0x{{[0-9a-f]+}} a.out`func1() at main.cpp:23:3 [opt] [artificial] // CHECK-NEXT: frame #4: 0x{{[0-9a-f]+}} a.out`main{{.*}} [opt] } -void __attribute__((noinline)) func3() { sink(); /* tail */ } +void __attribute__((noinline)) func3() { + x++; + sink(); /* tail */ +} -void __attribute__((disable_tail_calls, noinline)) func2() { func3(); /* regular */ } +void __attribute__((disable_tail_calls, noinline)) func2() { + func3(); /* regular */ +} -void __attribute__((noinline)) func1() { func2(); /* tail */ } +void __attribute__((noinline)) func1() { + x++; + func2(); /* tail */ +} int __attribute__((disable_tail_calls)) main() { + // DEBUG: self.runCmd("log enable lldb step -f /tmp/lldbstep.log") func1(); /* regular */ return 0; } diff --git a/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py b/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py index 5152c0010d102..e7cfa1ca14f27 100644 --- a/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py +++ b/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py @@ -1,6 +1,49 @@ -from lldbsuite.test import lldbinline from lldbsuite.test import decorators -lldbinline.MakeInlineTest(__file__, globals(), - lldbinline.expectedFailureAll(oslist=[ - "windows", "linux", "netbsd"])) +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil +from lldbsuite.test import lldbtest + + +class PlatformProcessCrashInfoTestCase(TestBase): + + mydir = TestBase.compute_mydir(__file__) + + @expectedFailureAll(oslist=["windows", "linux", "netbsd"]) + def test_thread_local(self): + # Set a breakpoint on the first instruction of the main function, + # before the TLS initialization has run. + self.build() + exe = self.getBuildArtifact("a.out") + + (target, process, _, _) = \ + lldbutil.run_to_source_breakpoint(self, "Set breakpoint here", + lldb.SBFileSpec("main.cpp")) + self.expect_expr("tl_local_int + 1", + result_type="int", result_value="323") + self.expect_expr("*tl_local_ptr + 2", + result_type="int", result_value="324") + self.expect_expr("tl_global_int", + result_type="int", result_value="123") + self.expect_expr("*tl_global_ptr", + result_type="int", result_value="45") + + # Now see if we emit the correct error when the TLS is not yet + # initialized. Let's set a breakpoint on the first instruction + # of main. 
+ main_module = target.FindModule(lldb.SBFileSpec(exe)) + main_address = main_module.FindSymbol("main").GetStartAddress() + main_bkpt = target.BreakpointCreateBySBAddress(main_address) + + process.Kill() + lldbutil.run_to_breakpoint_do_run(self, target, main_bkpt) + + self.expect("expr tl_local_int", error=True, + substrs=["couldn't get the value of variable tl_local_int", + "No TLS data currently exists for this thread"]) + self.expect("expr *tl_local_ptr", error=True, + substrs=["couldn't get the value of variable tl_local_ptr", + "No TLS data currently exists for this thread"]) + diff --git a/lldb/test/API/lang/cpp/thread_local/main.cpp b/lldb/test/API/lang/cpp/thread_local/main.cpp index 1855b7c5f3441..04c7fc0ed74de 100644 --- a/lldb/test/API/lang/cpp/thread_local/main.cpp +++ b/lldb/test/API/lang/cpp/thread_local/main.cpp @@ -3,15 +3,9 @@ thread_local int tl_global_int = 123; thread_local int *tl_global_ptr = &storage; int main(int argc, char **argv) { - //% self.expect("expr tl_local_int", error=True, substrs=["couldn't get the value of variable tl_local_int"]) - //% self.expect("expr *tl_local_ptr", error=True, substrs=["couldn't get the value of variable tl_local_ptr"]) thread_local int tl_local_int = 321; thread_local int *tl_local_ptr = nullptr; tl_local_ptr = &tl_local_int; tl_local_int++; - //% self.expect("expr tl_local_int + 1", substrs=["int", "= 323"]) - //% self.expect("expr *tl_local_ptr + 2", substrs=["int", "= 324"]) - //% self.expect("expr tl_global_int", substrs=["int", "= 123"]) - //% self.expect("expr *tl_global_ptr", substrs=["int", "= 45"]) - return 0; + return 0; // Set breakpoint here } diff --git a/lldb/test/API/lang/objc/hidden-ivars/Makefile b/lldb/test/API/lang/objc/hidden-ivars/Makefile index 0664769456eff..283e8a118fb16 100644 --- a/lldb/test/API/lang/objc/hidden-ivars/Makefile +++ b/lldb/test/API/lang/objc/hidden-ivars/Makefile @@ -4,4 +4,24 @@ OBJC_SOURCES := main.m LD_EXTRAS = -framework Foundation +all: a.out libInternalDefiner.dylib stripped + include Makefile.rules + +ifeq "$(MAKE_DSYM)" "YES" +stripped: a.out.dSYM +endif + +stripped: a.out libInternalDefiner.dylib + mkdir stripped + strip -Sx a.out -o stripped/a.out + strip -Sx libInternalDefiner.dylib -o stripped/libInternalDefiner.dylib +ifneq "$(CODESIGN)" "" + $(CODESIGN) -fs - stripped/a.out +endif +ifneq "$(CODESIGN)" "" + $(CODESIGN) -fs - stripped/libInternalDefiner.dylib +endif +ifeq "$(MAKE_DSYM)" "YES" + cp -r a.out.dSYM stripped/a.out.dSYM +endif diff --git a/lldb/test/API/lang/objc/hidden-ivars/TestHiddenIvars.py b/lldb/test/API/lang/objc/hidden-ivars/TestHiddenIvars.py index 03a325ac49c62..5930ffdc958aa 100644 --- a/lldb/test/API/lang/objc/hidden-ivars/TestHiddenIvars.py +++ b/lldb/test/API/lang/objc/hidden-ivars/TestHiddenIvars.py @@ -80,20 +80,11 @@ def test_frame_variable_across_modules(self): def common_setup(self, strip): if strip: - self.assertTrue(subprocess.call( - ['/usr/bin/strip', '-Sx', - self.getBuildArtifact('libInternalDefiner.dylib')]) == 0, - 'stripping dylib succeeded') - self.assertTrue(subprocess.call( - ['/bin/rm', '-rf', - self.getBuildArtifact('libInternalDefiner.dylib.dSYM')]) == 0, - 'remove dylib dSYM file succeeded') - self.assertTrue(subprocess.call(['/usr/bin/strip', '-Sx', - self.getBuildArtifact("a.out") - ]) == 0, - 'stripping a.out succeeded') + exe = self.getBuildArtifact("stripped/a.out") + else: + exe = self.getBuildArtifact("a.out") # Create a target by the debugger. 
- target = self.dbg.CreateTarget(self.getBuildArtifact("a.out")) + target = self.dbg.CreateTarget(exe) self.assertTrue(target, VALID_TARGET) # Create the breakpoint inside function 'main'. @@ -110,7 +101,6 @@ def common_setup(self, strip): None, environment, self.get_process_working_directory()) self.assertTrue(process, PROCESS_IS_VALID) - exe = self.getBuildArtifact("a.out") self.runCmd("file " + exe, CURRENT_EXECUTABLE_SET) # Break inside the foo function which takes a bar_ptr argument. diff --git a/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile b/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile index 0aaa021132e16..8b63215d6d9da 100644 --- a/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile +++ b/lldb/test/API/lang/objc/objc-ivar-stripped/Makefile @@ -3,7 +3,10 @@ LD_EXTRAS := -lobjc -framework Foundation all: a.out.stripped +include Makefile.rules + a.out.stripped: a.out.dSYM strip -o a.out.stripped a.out - -include Makefile.rules +ifneq "$(CODESIGN)" "" + $(CODESIGN) -fs - a.out.stripped +endif diff --git a/lldb/test/API/linux/builtin_trap/TestBuiltinTrap.py b/lldb/test/API/linux/builtin_trap/TestBuiltinTrap.py index 22de873e29fad..added4ef508a7 100644 --- a/lldb/test/API/linux/builtin_trap/TestBuiltinTrap.py +++ b/lldb/test/API/linux/builtin_trap/TestBuiltinTrap.py @@ -23,7 +23,7 @@ def setUp(self): # gcc generates incorrect linetable @expectedFailureAll(archs="arm", compiler="gcc", triple=".*-android") - @expectedFailureAll(oslist=['linux'], archs=['arm', 'aarch64']) + @expectedFailureAll(archs=['arm', 'aarch64']) @skipIfWindows def test_with_run_command(self): """Test that LLDB handles a function with __builtin_trap correctly.""" diff --git a/lldb/test/API/python_api/sbenvironment/TestSBEnvironment.py b/lldb/test/API/python_api/sbenvironment/TestSBEnvironment.py new file mode 100644 index 0000000000000..c1937f985e283 --- /dev/null +++ b/lldb/test/API/python_api/sbenvironment/TestSBEnvironment.py @@ -0,0 +1,125 @@ +"""Test the SBEnvironment APIs.""" + + + +from math import fabs +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + +class SBEnvironmentAPICase(TestBase): + + mydir = TestBase.compute_mydir(__file__) + NO_DEBUG_INFO_TESTCASE = True + + # We use this function to test both kind of accessors: + # . 
Get*AtIndex and GetEntries + def assertEqualEntries(self, env, entries): + self.assertEqual(env.GetNumValues(), len(entries)) + for i in range(env.GetNumValues()): + name = env.GetNameAtIndex(i) + value = env.GetValueAtIndex(i) + self.assertIn(name + "=" + value, entries) + + entries = env.GetEntries() + self.assertEqual(entries.GetSize(), len(entries)) + for i in range(entries.GetSize()): + (name, value) = entries.GetStringAtIndex(i).split("=") + self.assertIn(name + "=" + value, entries) + + + + @add_test_categories(['pyapi']) + def test_platform_environment(self): + env = self.dbg.GetSelectedPlatform().GetEnvironment() + # We assume at least PATH is set + self.assertNotEqual(env.Get("PATH"), None) + + + @add_test_categories(['pyapi']) + def test_launch_info(self): + target = self.dbg.CreateTarget("") + launch_info = target.GetLaunchInfo() + env = launch_info.GetEnvironment() + env_count = env.GetNumValues() + + env.Set("FOO", "bar", overwrite=True) + self.assertEqual(env.GetNumValues(), env_count + 1) + + # Make sure we only modify the copy of the launchInfo's environment + self.assertEqual(launch_info.GetEnvironment().GetNumValues(), env_count) + + launch_info.SetEnvironment(env, append=True) + self.assertEqual(launch_info.GetEnvironment().GetNumValues(), env_count + 1) + + # Make sure we can replace the launchInfo's environment + env.Clear() + env.Set("BAR", "foo", overwrite=True) + env.PutEntry("X=y") + launch_info.SetEnvironment(env, append=False) + self.assertEqualEntries(launch_info.GetEnvironment(), ["BAR=foo", "X=y"]) + + + @add_test_categories(['pyapi']) + def test_target_environment(self): + env = self.dbg.GetSelectedTarget().GetEnvironment() + # There is no target, so env should be empty + self.assertEqual(env.GetNumValues(), 0) + self.assertEqual(env.Get("PATH"), None) + + target = self.dbg.CreateTarget("") + env = target.GetEnvironment() + path = env.Get("PATH") + # Now there's a target, so at least PATH should exist + self.assertNotEqual(path, None) + + # Make sure we are getting a copy by modifying the env we just got + env.PutEntry("PATH=#" + path) + self.assertEqual(target.GetEnvironment().Get("PATH"), path) + + @add_test_categories(['pyapi']) + def test_creating_and_modifying_environment(self): + env = lldb.SBEnvironment() + + self.assertEqual(env.Get("FOO"), None) + self.assertEqual(env.Get("BAR"), None) + + # We also test empty values + self.assertTrue(env.Set("FOO", "", overwrite=False)) + env.Set("BAR", "foo", overwrite=False) + + self.assertEqual(env.Get("FOO"), "") + self.assertEqual(env.Get("BAR"), "foo") + + self.assertEqual(env.GetNumValues(), 2) + + self.assertEqualEntries(env, ["FOO=", "BAR=foo"]) + + # Make sure modifications work + self.assertFalse(env.Set("FOO", "bar", overwrite=False)) + self.assertEqual(env.Get("FOO"), "") + + env.PutEntry("FOO=bar") + self.assertEqual(env.Get("FOO"), "bar") + + self.assertEqualEntries(env, ["FOO=bar", "BAR=foo"]) + + # Make sure we can unset + self.assertTrue(env.Unset("FOO")) + self.assertFalse(env.Unset("FOO")) + self.assertEqual(env.Get("FOO"), None) + + # Test SetEntries + entries = lldb.SBStringList() + entries.AppendList(["X=x", "Y=y"], 2) + + env.SetEntries(entries, append=True) + self.assertEqualEntries(env, ["BAR=foo", "X=x", "Y=y"]) + + env.SetEntries(entries, append=False) + self.assertEqualEntries(env, ["X=x", "Y=y"]) + + # Test clear + env.Clear() + self.assertEqualEntries(env, []) diff --git a/lldb/test/API/tools/lldb-vscode/launch/TestVSCode_launch.py 
b/lldb/test/API/tools/lldb-vscode/launch/TestVSCode_launch.py index b63170ee6b8ca..0d20dd0987ac6 100644 --- a/lldb/test/API/tools/lldb-vscode/launch/TestVSCode_launch.py +++ b/lldb/test/API/tools/lldb-vscode/launch/TestVSCode_launch.py @@ -9,6 +9,7 @@ from lldbsuite.test.lldbtest import * from lldbsuite.test import lldbutil import lldbvscode_testcase +import time import os @@ -35,6 +36,31 @@ def test_default(self): self.assertTrue(program in lines[0], "make sure program path is in first argument") + @skipIfWindows + @skipIfRemote + def test_termination(self): + ''' + Tests the correct termination of lldb-vscode upon a 'disconnect' + request. + ''' + self.create_debug_adaptor() + # The underlying lldb-vscode process must be alive + self.assertEqual(self.vscode.process.poll(), None) + + # The lldb-vscode process should finish even though + # we didn't close the communication socket explicitly + self.vscode.request_disconnect() + + # Wait until the underlying lldb-vscode process dies. + # We need to do this because the popen.wait function in python2.7 + # doesn't have a timeout argument. + for _ in range(10): + time.sleep(1) + if self.vscode.process.poll() is not None: + break + # Check the return code + self.assertEqual(self.vscode.process.poll(), 0) + @skipIfWindows @skipIfRemote def test_stopOnEntry(self): diff --git a/lldb/test/Shell/ExecControl/StopHook/stop-hook.test b/lldb/test/Shell/ExecControl/StopHook/stop-hook.test index a06de6634ea19..98a77cac99bac 100644 --- a/lldb/test/Shell/ExecControl/StopHook/stop-hook.test +++ b/lldb/test/Shell/ExecControl/StopHook/stop-hook.test @@ -46,12 +46,12 @@ target stop-hook list run # Stopping inside of the stop hook range # CHECK: (lldb) run -# CHECK-NEXT: (void *) $0 = 0x +# CHECK-NEXT: (void *) ${{.*}} = 0x thread step-over # Stepping inside of the stop hook range # CHECK: (lldb) thread step-over -# CHECK-NEXT: (void *) $1 = 0x +# CHECK-NEXT: (void *) ${{.*}} = 0x # CHECK: ->{{.*}} // We should stop here after stepping. process continue diff --git a/lldb/test/Shell/Reproducer/Functionalities/TestDataFormatter.test b/lldb/test/Shell/Reproducer/Functionalities/TestDataFormatter.test index 7db8bc4b36cf1..d133c8d3b9f9e 100644 --- a/lldb/test/Shell/Reproducer/Functionalities/TestDataFormatter.test +++ b/lldb/test/Shell/Reproducer/Functionalities/TestDataFormatter.test @@ -1,4 +1,4 @@ -# UNSUPPORTED: system-windows, system-freebsd +# UNSUPPORTED: system-freebsd # This tests that data formatters continue to work when replaying a reproducer. diff --git a/lldb/test/Shell/Reproducer/Functionalities/TestExpressionEvaluation.test b/lldb/test/Shell/Reproducer/Functionalities/TestExpressionEvaluation.test index e2bcb2d96570c..f400cef07a24a 100644 --- a/lldb/test/Shell/Reproducer/Functionalities/TestExpressionEvaluation.test +++ b/lldb/test/Shell/Reproducer/Functionalities/TestExpressionEvaluation.test @@ -1,4 +1,4 @@ -# UNSUPPORTED: system-windows, system-freebsd +# UNSUPPORTED: system-freebsd # XFAIL: system-netbsd # Flaky diff --git a/lldb/test/Shell/Reproducer/Functionalities/TestImageList.test b/lldb/test/Shell/Reproducer/Functionalities/TestImageList.test index db319093f1749..ec8b36ea9576c 100644 --- a/lldb/test/Shell/Reproducer/Functionalities/TestImageList.test +++ b/lldb/test/Shell/Reproducer/Functionalities/TestImageList.test @@ -1,4 +1,4 @@ -# UNSUPPORTED: system-windows, system-freebsd +# UNSUPPORTED: system-freebsd # This tests that image list works when replaying. 
We arbitrarily assume # there's at least two entries and compare that they're identical. diff --git a/lldb/test/Shell/Reproducer/Functionalities/TestStepping.test b/lldb/test/Shell/Reproducer/Functionalities/TestStepping.test index 1dec9a077c7b1..ba9164f4b43a9 100644 --- a/lldb/test/Shell/Reproducer/Functionalities/TestStepping.test +++ b/lldb/test/Shell/Reproducer/Functionalities/TestStepping.test @@ -1,4 +1,4 @@ -# UNSUPPORTED: system-windows, system-freebsd +# UNSUPPORTED: system-freebsd # This tests that stepping continues to work when replaying a reproducer. diff --git a/lldb/test/Shell/Reproducer/Inputs/foo.lua b/lldb/test/Shell/Reproducer/Inputs/foo.lua new file mode 100644 index 0000000000000..8ed0c94cbba96 --- /dev/null +++ b/lldb/test/Shell/Reproducer/Inputs/foo.lua @@ -0,0 +1 @@ +print('95126') diff --git a/lldb/test/Shell/Reproducer/Inputs/foo.py b/lldb/test/Shell/Reproducer/Inputs/foo.py new file mode 100644 index 0000000000000..8ed0c94cbba96 --- /dev/null +++ b/lldb/test/Shell/Reproducer/Inputs/foo.py @@ -0,0 +1 @@ +print('95126') diff --git a/lldb/test/Shell/Reproducer/TestCaptureEnvOverride.test b/lldb/test/Shell/Reproducer/TestCaptureEnvOverride.test index a8e7bdec250e6..ef06bce8983fd 100644 --- a/lldb/test/Shell/Reproducer/TestCaptureEnvOverride.test +++ b/lldb/test/Shell/Reproducer/TestCaptureEnvOverride.test @@ -1,4 +1,3 @@ -# UNSUPPORTED: system-windows # This tests the LLDB_CAPTURE_REPRODUCER override. # RUN: %lldb -b -o 'reproducer status' --capture --capture-path %t.repro /bin/ls | FileCheck %s --check-prefix CAPTURE diff --git a/lldb/test/Shell/Reproducer/TestCrash.test b/lldb/test/Shell/Reproducer/TestCrash.test index cb0c09aad1414..1389a9b76ad39 100644 --- a/lldb/test/Shell/Reproducer/TestCrash.test +++ b/lldb/test/Shell/Reproducer/TestCrash.test @@ -1,4 +1,3 @@ -# UNSUPPORTED: system-windows # This tests that a reproducer is generated when LLDB crashes. # Start clean. diff --git a/lldb/test/Shell/Reproducer/TestDiscard.test b/lldb/test/Shell/Reproducer/TestDiscard.test index db9614aabb841..829aabbe2b03b 100644 --- a/lldb/test/Shell/Reproducer/TestDiscard.test +++ b/lldb/test/Shell/Reproducer/TestDiscard.test @@ -1,4 +1,3 @@ -# UNSUPPORTED: system-windows # This ensures that the reproducer properly cleans up after itself. # Build the inferior. diff --git a/lldb/test/Shell/Reproducer/TestDump.test b/lldb/test/Shell/Reproducer/TestDump.test index c193b806b5472..8300a97004bbf 100644 --- a/lldb/test/Shell/Reproducer/TestDump.test +++ b/lldb/test/Shell/Reproducer/TestDump.test @@ -1,4 +1,3 @@ -# UNSUPPORTED: system-windows # This tests the reproducer dump functionality. # Generate a reproducer. diff --git a/lldb/test/Shell/Reproducer/TestGDBRemoteRepro.test b/lldb/test/Shell/Reproducer/TestGDBRemoteRepro.test index 609c839292927..683a7e2f55297 100644 --- a/lldb/test/Shell/Reproducer/TestGDBRemoteRepro.test +++ b/lldb/test/Shell/Reproducer/TestGDBRemoteRepro.test @@ -1,4 +1,4 @@ -# UNSUPPORTED: system-windows, system-freebsd +# UNSUPPORTED: system-freebsd # This tests the replaying of GDB remote packets. # diff --git a/lldb/test/Shell/Reproducer/TestLuaImport.test b/lldb/test/Shell/Reproducer/TestLuaImport.test new file mode 100644 index 0000000000000..315cfd396028e --- /dev/null +++ b/lldb/test/Shell/Reproducer/TestLuaImport.test @@ -0,0 +1,11 @@ +# REQUIRES: lua +# UNSUPPORTED: system-windows +# Ensure that the reproducers know about imported Lua modules. 
+ +# RUN: rm -rf %t.repro +# RUN: %lldb -x -b --script-language lua --capture --capture-path %t.repro -o 'command script import %S/Inputs/foo.lua' -o 'reproducer generate' | FileCheck %s --check-prefix CAPTURE + +# CAPTURE: 95126 + +# RUN: %lldb -b -o 'reproducer dump -p files -f %t.repro' | FileCheck %s --check-prefix FILES +# FILES: foo.lua diff --git a/lldb/test/Shell/Reproducer/TestMultipleTargets.test b/lldb/test/Shell/Reproducer/TestMultipleTargets.test index ce1a5ecdd4c81..7859480e2d040 100644 --- a/lldb/test/Shell/Reproducer/TestMultipleTargets.test +++ b/lldb/test/Shell/Reproducer/TestMultipleTargets.test @@ -1,4 +1,4 @@ -# UNSUPPORTED: system-windows, system-freebsd +# UNSUPPORTED: system-freebsd # This tests the replaying with multiple targets. diff --git a/lldb/test/Shell/Reproducer/TestPythonImport.test b/lldb/test/Shell/Reproducer/TestPythonImport.test new file mode 100644 index 0000000000000..7bea97c91d986 --- /dev/null +++ b/lldb/test/Shell/Reproducer/TestPythonImport.test @@ -0,0 +1,11 @@ +# REQUIRES: python +# UNSUPPORTED: system-windows +# Ensure that the reproducers know about imported Python modules. + +# RUN: rm -rf %t.repro +# RUN: %lldb -x -b --capture --capture-path %t.repro -o 'command script import %S/Inputs/foo.py' -o 'reproducer generate' | FileCheck %s --check-prefix CAPTURE + +# CAPTURE: 95126 + +# RUN: %lldb -b -o 'reproducer dump -p files -f %t.repro' | FileCheck %s --check-prefix FILES +# FILES: foo.py diff --git a/lldb/test/Shell/Reproducer/TestRelativePath.test b/lldb/test/Shell/Reproducer/TestRelativePath.test index 2ee4bf0c96495..fa75187845502 100644 --- a/lldb/test/Shell/Reproducer/TestRelativePath.test +++ b/lldb/test/Shell/Reproducer/TestRelativePath.test @@ -1,4 +1,3 @@ -# UNSUPPORTED: system-windows # This tests relative capture paths. # RUN: mkdir -p %t diff --git a/lldb/test/Shell/Reproducer/TestReuseDirectory.test b/lldb/test/Shell/Reproducer/TestReuseDirectory.test index 31b71a0f26015..a3fecced2504b 100644 --- a/lldb/test/Shell/Reproducer/TestReuseDirectory.test +++ b/lldb/test/Shell/Reproducer/TestReuseDirectory.test @@ -1,4 +1,4 @@ -# UNSUPPORTED: system-windows, system-freebsd +# UNSUPPORTED: system-freebsd # Test that we can capture twice to the same directory without breaking the # reproducer functionality. diff --git a/lldb/test/Shell/Reproducer/TestSynchronous.test b/lldb/test/Shell/Reproducer/TestSynchronous.test index c62cbe81aff3b..f32ce8c57224f 100644 --- a/lldb/test/Shell/Reproducer/TestSynchronous.test +++ b/lldb/test/Shell/Reproducer/TestSynchronous.test @@ -1,5 +1,4 @@ # REQUIRES: python -# UNSUPPORTED: system-windows # Ensure that replay happens in synchronous mode. # RUN: rm -rf %t.repro diff --git a/lldb/test/Shell/Reproducer/TestWorkingDir.test b/lldb/test/Shell/Reproducer/TestWorkingDir.test index 707916bae560e..1d4c7694211af 100644 --- a/lldb/test/Shell/Reproducer/TestWorkingDir.test +++ b/lldb/test/Shell/Reproducer/TestWorkingDir.test @@ -1,5 +1,3 @@ -# XFAIL: system-windows - # This tests that the reproducer can deal with relative files. We create a # binary in a subdirectory and pass its relative path to LLDB. The subdirectory # is removed before replay so that it only exists in the reproducer's VFS. 
diff --git a/lldb/test/Shell/Reproducer/lit.local.cfg b/lldb/test/Shell/Reproducer/lit.local.cfg index 7f4022768c87d..30f97f28279d4 100644 --- a/lldb/test/Shell/Reproducer/lit.local.cfg +++ b/lldb/test/Shell/Reproducer/lit.local.cfg @@ -6,5 +6,8 @@ if 'LLVM_DISABLE_CRASH_REPORT' in config.environment: if 'LLDB_CAPTURE_REPRODUCER' in config.environment: del config.environment['LLDB_CAPTURE_REPRODUCER'] +if 'system-windows' in config.available_features: + config.unsupported = True + if 'lldb-repro' in config.available_features: config.unsupported = True diff --git a/lldb/tools/debugserver/debugserver.xcodeproj/project.pbxproj b/lldb/tools/debugserver/debugserver.xcodeproj/project.pbxproj index f4267b7633a27..1c7a55f7108a0 100644 --- a/lldb/tools/debugserver/debugserver.xcodeproj/project.pbxproj +++ b/lldb/tools/debugserver/debugserver.xcodeproj/project.pbxproj @@ -7,131 +7,165 @@ objects = { /* Begin PBXBuildFile section */ - 23562ED61D342A5A00AB2BD4 /* ActivityStore.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED51D342A5A00AB2BD4 /* ActivityStore.cpp */; }; - 23562ED71D342A5A00AB2BD4 /* ActivityStore.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED51D342A5A00AB2BD4 /* ActivityStore.cpp */; }; - 26CE05C5115C36590022F371 /* CFBundle.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD910D3EBFF6007E4CA2 /* CFBundle.cpp */; }; - 456F67641AD46CE9002850C2 /* CFBundle.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD910D3EBFF6007E4CA2 /* CFBundle.cpp */; }; - 26CE05C3115C36580022F371 /* CFString.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD9B0D3EC160007E4CA2 /* CFString.cpp */; }; - 456F67621AD46CE9002850C2 /* CFString.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD9B0D3EC160007E4CA2 /* CFString.cpp */; }; - 26CE05CF115C36F70022F371 /* CoreFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 26ACA3340D3E956300A2120B /* CoreFoundation.framework */; settings = {ATTRIBUTES = (Required, ); }; }; - 456F676B1AD46CE9002850C2 /* CoreFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 26ACA3340D3E956300A2120B /* CoreFoundation.framework */; settings = {ATTRIBUTES = (Required, ); }; }; - 26CE05B7115C363B0022F371 /* DNB.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D60C71334A0024798E /* DNB.cpp */; }; - 456F67551AD46CE9002850C2 /* DNB.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D60C71334A0024798E /* DNB.cpp */; }; - 264D5D581293835600ED4C01 /* DNBArch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 264D5D571293835600ED4C01 /* DNBArch.cpp */; }; - 456F67671AD46CE9002850C2 /* DNBArch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 264D5D571293835600ED4C01 /* DNBArch.cpp */; }; - 26CE05C1115C36510022F371 /* DNBArchImpl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2675D4220CCEB705000F49AF /* DNBArchImpl.cpp */; }; - 26CE05C2115C36550022F371 /* DNBArchImpl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637FB0C71334A0024798E /* DNBArchImpl.cpp */; }; - 456F67601AD46CE9002850C2 /* DNBArchImpl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2675D4220CCEB705000F49AF /* DNBArchImpl.cpp */; }; - 456F67611AD46CE9002850C2 /* DNBArchImpl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637FB0C71334A0024798E /* DNBArchImpl.cpp */; }; - 266B5ED11460A68200E43F0A /* DNBArchImplARM64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 266B5ECF1460A68200E43F0A /* DNBArchImplARM64.cpp */; }; - 456F67691AD46CE9002850C2 /* DNBArchImplARM64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 
266B5ECF1460A68200E43F0A /* DNBArchImplARM64.cpp */; }; - 26CE05C0115C364F0022F371 /* DNBArchImplI386.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637EA0C71334A0024798E /* DNBArchImplI386.cpp */; }; - 456F675F1AD46CE9002850C2 /* DNBArchImplI386.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637EA0C71334A0024798E /* DNBArchImplI386.cpp */; }; - 26CE05BF115C364D0022F371 /* DNBArchImplX86_64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26CF99A21142EB7400011AAB /* DNBArchImplX86_64.cpp */; }; - 456F675E1AD46CE9002850C2 /* DNBArchImplX86_64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26CF99A21142EB7400011AAB /* DNBArchImplX86_64.cpp */; }; - 26CE05B8115C363C0022F371 /* DNBBreakpoint.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D90C71334A0024798E /* DNBBreakpoint.cpp */; }; - 456F67571AD46CE9002850C2 /* DNBBreakpoint.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D90C71334A0024798E /* DNBBreakpoint.cpp */; }; - 26CE05B9115C363D0022F371 /* DNBDataRef.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DB0C71334A0024798E /* DNBDataRef.cpp */; }; - 456F67581AD46CE9002850C2 /* DNBDataRef.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DB0C71334A0024798E /* DNBDataRef.cpp */; }; - 26CE05A7115C360D0022F371 /* DNBError.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DE0C71334A0024798E /* DNBError.cpp */; }; - 456F67461AD46CE9002850C2 /* DNBError.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DE0C71334A0024798E /* DNBError.cpp */; }; - 26CE05BA115C363E0022F371 /* DNBLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E00C71334A0024798E /* DNBLog.cpp */; }; - 456F67591AD46CE9002850C2 /* DNBLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E00C71334A0024798E /* DNBLog.cpp */; }; - 26CE05BB115C363F0022F371 /* DNBRegisterInfo.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E20C71334A0024798E /* DNBRegisterInfo.cpp */; }; - 456F675A1AD46CE9002850C2 /* DNBRegisterInfo.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E20C71334A0024798E /* DNBRegisterInfo.cpp */; }; - 26CE05A8115C36170022F371 /* DNBThreadResumeActions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 260E7331114BFFE600D1DFB3 /* DNBThreadResumeActions.cpp */; }; - 456F67471AD46CE9002850C2 /* DNBThreadResumeActions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 260E7331114BFFE600D1DFB3 /* DNBThreadResumeActions.cpp */; }; - 23AE72E41D25DECF00945BCE /* DarwinLogCollector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AE72E21D25DECF00945BCE /* DarwinLogCollector.cpp */; }; - 23AE72E51D25DEE100945BCE /* DarwinLogCollector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AE72E21D25DECF00945BCE /* DarwinLogCollector.cpp */; }; - 49D404621E39260F00570CDC /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 49D404611E39260F00570CDC /* Foundation.framework */; }; - AFA3FCA11E39984900218D5E /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 49D404611E39260F00570CDC /* Foundation.framework */; }; - 456F67561AD46CE9002850C2 /* Genealogy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AFEC3363194A8B0B00FF05C6 /* Genealogy.cpp */; }; - AFEC3364194A8B0B00FF05C6 /* Genealogy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AFEC3363194A8B0B00FF05C6 /* Genealogy.cpp */; }; 23043C9D1D35DBEC00FC25CA /* JSON.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 233B4EA51D2DB54300E98261 /* JSON.cpp */; }; + 23043C9E1D35DBFA00FC25CA /* StringConvert.cpp in Sources */ = {isa = PBXBuildFile; 
fileRef = 233B4EA81D2DB96A00E98261 /* StringConvert.cpp */; }; + 2307CCCB1D4A5D630016ABC0 /* LogFilterExactMatch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 237821AE1D4917D20028B7A1 /* LogFilterExactMatch.cpp */; }; 233B4EA71D2DB54300E98261 /* JSON.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 233B4EA51D2DB54300E98261 /* JSON.cpp */; }; + 233B4EA91D2DB96A00E98261 /* StringConvert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 233B4EA81D2DB96A00E98261 /* StringConvert.cpp */; }; + 23562ED21D3424DF00AB2BD4 /* LogMessageOsLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED01D3424DF00AB2BD4 /* LogMessageOsLog.cpp */; }; + 23562ED31D3424DF00AB2BD4 /* LogMessageOsLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED01D3424DF00AB2BD4 /* LogMessageOsLog.cpp */; }; + 23562ED61D342A5A00AB2BD4 /* ActivityStore.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED51D342A5A00AB2BD4 /* ActivityStore.cpp */; }; + 23562ED71D342A5A00AB2BD4 /* ActivityStore.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED51D342A5A00AB2BD4 /* ActivityStore.cpp */; }; + 23562ED91D342B0000AB2BD4 /* LogMessage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED81D342B0000AB2BD4 /* LogMessage.cpp */; }; + 23562EDA1D342B0000AB2BD4 /* LogMessage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED81D342B0000AB2BD4 /* LogMessage.cpp */; }; + 237821B01D4917D20028B7A1 /* LogFilterExactMatch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 237821AE1D4917D20028B7A1 /* LogFilterExactMatch.cpp */; }; 23AC04C61D2F41A00072351D /* LogFilter.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AC04C41D2F41A00072351D /* LogFilter.cpp */; }; 23AC04C71D2F41A00072351D /* LogFilter.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AC04C41D2F41A00072351D /* LogFilter.cpp */; }; 23AC04CA1D2F42250072351D /* LogFilterChain.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AC04C81D2F42250072351D /* LogFilterChain.cpp */; }; 23AC04CB1D2F42250072351D /* LogFilterChain.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AC04C81D2F42250072351D /* LogFilterChain.cpp */; }; - 2307CCCB1D4A5D630016ABC0 /* LogFilterExactMatch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 237821AE1D4917D20028B7A1 /* LogFilterExactMatch.cpp */; }; - 237821B01D4917D20028B7A1 /* LogFilterExactMatch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 237821AE1D4917D20028B7A1 /* LogFilterExactMatch.cpp */; }; 23AC04CF1D2F58AF0072351D /* LogFilterRegex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AC04CD1D2F58AF0072351D /* LogFilterRegex.cpp */; }; 23AC04D01D2F58AF0072351D /* LogFilterRegex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AC04CD1D2F58AF0072351D /* LogFilterRegex.cpp */; }; - 23562ED91D342B0000AB2BD4 /* LogMessage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED81D342B0000AB2BD4 /* LogMessage.cpp */; }; - 23562EDA1D342B0000AB2BD4 /* LogMessage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED81D342B0000AB2BD4 /* LogMessage.cpp */; }; - 23562ED21D3424DF00AB2BD4 /* LogMessageOsLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED01D3424DF00AB2BD4 /* LogMessageOsLog.cpp */; }; - 23562ED31D3424DF00AB2BD4 /* LogMessageOsLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23562ED01D3424DF00AB2BD4 /* LogMessageOsLog.cpp */; }; + 23AE72E41D25DECF00945BCE /* DarwinLogCollector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23AE72E21D25DECF00945BCE /* DarwinLogCollector.cpp */; }; + 23AE72E51D25DEE100945BCE /* DarwinLogCollector.cpp in Sources */ = {isa = 
PBXBuildFile; fileRef = 23AE72E21D25DECF00945BCE /* DarwinLogCollector.cpp */; }; + 23D1B0291D497E8B00FF831B /* OsLogger.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23D1B0271D497E8B00FF831B /* OsLogger.cpp */; }; + 23D1B02A1D497E8B00FF831B /* OsLogger.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23D1B0271D497E8B00FF831B /* OsLogger.cpp */; }; + 264D5D581293835600ED4C01 /* DNBArch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 264D5D571293835600ED4C01 /* DNBArch.cpp */; }; + 266B5ED11460A68200E43F0A /* DNBArchImplARM64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 266B5ECF1460A68200E43F0A /* DNBArchImplARM64.cpp */; }; + 26CE05A7115C360D0022F371 /* DNBError.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DE0C71334A0024798E /* DNBError.cpp */; }; + 26CE05A8115C36170022F371 /* DNBThreadResumeActions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 260E7331114BFFE600D1DFB3 /* DNBThreadResumeActions.cpp */; }; + 26CE05A9115C36250022F371 /* debugserver.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A02918114AB9240029C479 /* debugserver.cpp */; }; + 26CE05AA115C36260022F371 /* RNBContext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68F7E0D104EC800665A9E /* RNBContext.cpp */; }; + 26CE05AB115C36270022F371 /* RNBServices.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EF8878A00D9C797C001831DA /* RNBServices.cpp */; }; + 26CE05AC115C36280022F371 /* RNBSocket.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68FB00D1054DA00665A9E /* RNBSocket.cpp */; }; + 26CE05AD115C36280022F371 /* RNBRemote.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68FD60D10574500665A9E /* RNBRemote.cpp */; }; + 26CE05AE115C36320022F371 /* dbgnub-mig.defs in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E80C71334A0024798E /* dbgnub-mig.defs */; settings = {ATTRIBUTES = (Client, Server, ); }; }; 26CE05B0115C36340022F371 /* MachException.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637EE0C71334A0024798E /* MachException.cpp */; }; - 456F674E1AD46CE9002850C2 /* MachException.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637EE0C71334A0024798E /* MachException.cpp */; }; 26CE05B1115C36350022F371 /* MachProcess.mm in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F00C71334A0024798E /* MachProcess.mm */; }; - 456F674F1AD46CE9002850C2 /* MachProcess.mm in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F00C71334A0024798E /* MachProcess.mm */; }; - 26CE05B6115C36390022F371 /* MachTask.mm in Sources */ = {isa = PBXBuildFile; fileRef = 26B67DE10EE9BC30006C8BC0 /* MachTask.mm */; }; - 456F67541AD46CE9002850C2 /* MachTask.mm in Sources */ = {isa = PBXBuildFile; fileRef = 26B67DE10EE9BC30006C8BC0 /* MachTask.mm */; }; 26CE05B2115C36360022F371 /* MachThread.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F20C71334A0024798E /* MachThread.cpp */; }; - 456F67501AD46CE9002850C2 /* MachThread.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F20C71334A0024798E /* MachThread.cpp */; }; 26CE05B3115C36370022F371 /* MachThreadList.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F40C71334A0024798E /* MachThreadList.cpp */; }; - 456F67511AD46CE9002850C2 /* MachThreadList.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F40C71334A0024798E /* MachThreadList.cpp */; }; 26CE05B4115C36380022F371 /* MachVMMemory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F60C71334A0024798E /* MachVMMemory.cpp */; }; - 456F67521AD46CE9002850C2 /* MachVMMemory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F60C71334A0024798E /* 
MachVMMemory.cpp */; }; 26CE05B5115C36380022F371 /* MachVMRegion.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F80C71334A0024798E /* MachVMRegion.cpp */; }; - 456F67531AD46CE9002850C2 /* MachVMRegion.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F80C71334A0024798E /* MachVMRegion.cpp */; }; - 23D1B0291D497E8B00FF831B /* OsLogger.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23D1B0271D497E8B00FF831B /* OsLogger.cpp */; }; - 23D1B02A1D497E8B00FF831B /* OsLogger.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 23D1B0271D497E8B00FF831B /* OsLogger.cpp */; }; + 26CE05B6115C36390022F371 /* MachTask.mm in Sources */ = {isa = PBXBuildFile; fileRef = 26B67DE10EE9BC30006C8BC0 /* MachTask.mm */; }; + 26CE05B7115C363B0022F371 /* DNB.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D60C71334A0024798E /* DNB.cpp */; }; + 26CE05B8115C363C0022F371 /* DNBBreakpoint.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D90C71334A0024798E /* DNBBreakpoint.cpp */; }; + 26CE05B9115C363D0022F371 /* DNBDataRef.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DB0C71334A0024798E /* DNBDataRef.cpp */; }; + 26CE05BA115C363E0022F371 /* DNBLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E00C71334A0024798E /* DNBLog.cpp */; }; + 26CE05BB115C363F0022F371 /* DNBRegisterInfo.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E20C71334A0024798E /* DNBRegisterInfo.cpp */; }; 26CE05BC115C36420022F371 /* PThreadEvent.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637FE0C71334A0024798E /* PThreadEvent.cpp */; }; - 456F675B1AD46CE9002850C2 /* PThreadEvent.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637FE0C71334A0024798E /* PThreadEvent.cpp */; }; 26CE05BD115C36430022F371 /* PThreadMutex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2672DBEE0EEF446700E92059 /* PThreadMutex.cpp */; }; - 456F675C1AD46CE9002850C2 /* PThreadMutex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2672DBEE0EEF446700E92059 /* PThreadMutex.cpp */; }; + 26CE05BE115C36440022F371 /* SysSignal.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C638010C71334A0024798E /* SysSignal.cpp */; }; + 26CE05BF115C364D0022F371 /* DNBArchImplX86_64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26CF99A21142EB7400011AAB /* DNBArchImplX86_64.cpp */; }; + 26CE05C0115C364F0022F371 /* DNBArchImplI386.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637EA0C71334A0024798E /* DNBArchImplI386.cpp */; }; + 26CE05C1115C36510022F371 /* DNBArchImpl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2675D4220CCEB705000F49AF /* DNBArchImpl.cpp */; }; + 26CE05C3115C36580022F371 /* CFString.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD9B0D3EC160007E4CA2 /* CFString.cpp */; }; + 26CE05C5115C36590022F371 /* CFBundle.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD910D3EBFF6007E4CA2 /* CFBundle.cpp */; }; + 26CE05CF115C36F70022F371 /* CoreFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 26ACA3340D3E956300A2120B /* CoreFoundation.framework */; settings = {ATTRIBUTES = (Required, ); }; }; 26CE05F1115C387C0022F371 /* PseudoTerminal.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AF67ABFF0D34604D0022D128 /* PseudoTerminal.cpp */; }; - 456F67651AD46CE9002850C2 /* PseudoTerminal.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AF67ABFF0D34604D0022D128 /* PseudoTerminal.cpp */; }; - 26CE05AA115C36260022F371 /* RNBContext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68F7E0D104EC800665A9E /* RNBContext.cpp */; }; + 456F67461AD46CE9002850C2 /* 
DNBError.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637DE0C71334A0024798E /* DNBError.cpp */; }; + 456F67471AD46CE9002850C2 /* DNBThreadResumeActions.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 260E7331114BFFE600D1DFB3 /* DNBThreadResumeActions.cpp */; }; + 456F67481AD46CE9002850C2 /* debugserver.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A02918114AB9240029C479 /* debugserver.cpp */; }; 456F67491AD46CE9002850C2 /* RNBContext.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68F7E0D104EC800665A9E /* RNBContext.cpp */; }; - 26CE05AD115C36280022F371 /* RNBRemote.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68FD60D10574500665A9E /* RNBRemote.cpp */; }; - 456F674C1AD46CE9002850C2 /* RNBRemote.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68FD60D10574500665A9E /* RNBRemote.cpp */; }; - 26CE05AB115C36270022F371 /* RNBServices.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EF8878A00D9C797C001831DA /* RNBServices.cpp */; }; 456F674A1AD46CE9002850C2 /* RNBServices.cpp in Sources */ = {isa = PBXBuildFile; fileRef = EF8878A00D9C797C001831DA /* RNBServices.cpp */; }; - 26CE05AC115C36280022F371 /* RNBSocket.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68FB00D1054DA00665A9E /* RNBSocket.cpp */; }; 456F674B1AD46CE9002850C2 /* RNBSocket.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68FB00D1054DA00665A9E /* RNBSocket.cpp */; }; - AF588449206077BD00A0CB5A /* SocketAddress.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D6631CA81E848FE9006A7B11 /* SocketAddress.cpp */; }; - D6631CA91E848FE9006A7B11 /* SocketAddress.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D6631CA81E848FE9006A7B11 /* SocketAddress.cpp */; }; + 456F674C1AD46CE9002850C2 /* RNBRemote.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A68FD60D10574500665A9E /* RNBRemote.cpp */; }; + 456F674D1AD46CE9002850C2 /* dbgnub-mig.defs in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E80C71334A0024798E /* dbgnub-mig.defs */; settings = {ATTRIBUTES = (Client, Server, ); }; }; + 456F674E1AD46CE9002850C2 /* MachException.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637EE0C71334A0024798E /* MachException.cpp */; }; + 456F674F1AD46CE9002850C2 /* MachProcess.mm in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F00C71334A0024798E /* MachProcess.mm */; }; + 456F67501AD46CE9002850C2 /* MachThread.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F20C71334A0024798E /* MachThread.cpp */; }; + 456F67511AD46CE9002850C2 /* MachThreadList.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F40C71334A0024798E /* MachThreadList.cpp */; }; + 456F67521AD46CE9002850C2 /* MachVMMemory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F60C71334A0024798E /* MachVMMemory.cpp */; }; + 456F67531AD46CE9002850C2 /* MachVMRegion.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637F80C71334A0024798E /* MachVMRegion.cpp */; }; + 456F67541AD46CE9002850C2 /* MachTask.mm in Sources */ = {isa = PBXBuildFile; fileRef = 26B67DE10EE9BC30006C8BC0 /* MachTask.mm */; }; + 456F67551AD46CE9002850C2 /* DNB.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D60C71334A0024798E /* DNB.cpp */; }; + 456F67561AD46CE9002850C2 /* Genealogy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AFEC3363194A8B0B00FF05C6 /* Genealogy.cpp */; }; + 456F67571AD46CE9002850C2 /* DNBBreakpoint.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637D90C71334A0024798E /* DNBBreakpoint.cpp */; }; + 456F67581AD46CE9002850C2 /* DNBDataRef.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 
26C637DB0C71334A0024798E /* DNBDataRef.cpp */; }; + 456F67591AD46CE9002850C2 /* DNBLog.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E00C71334A0024798E /* DNBLog.cpp */; }; + 456F675A1AD46CE9002850C2 /* DNBRegisterInfo.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E20C71334A0024798E /* DNBRegisterInfo.cpp */; }; + 456F675B1AD46CE9002850C2 /* PThreadEvent.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637FE0C71334A0024798E /* PThreadEvent.cpp */; }; + 456F675C1AD46CE9002850C2 /* PThreadMutex.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2672DBEE0EEF446700E92059 /* PThreadMutex.cpp */; }; + 456F675D1AD46CE9002850C2 /* SysSignal.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C638010C71334A0024798E /* SysSignal.cpp */; }; + 456F675E1AD46CE9002850C2 /* DNBArchImplX86_64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26CF99A21142EB7400011AAB /* DNBArchImplX86_64.cpp */; }; + 456F675F1AD46CE9002850C2 /* DNBArchImplI386.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C637EA0C71334A0024798E /* DNBArchImplI386.cpp */; }; + 456F67601AD46CE9002850C2 /* DNBArchImpl.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2675D4220CCEB705000F49AF /* DNBArchImpl.cpp */; }; + 456F67621AD46CE9002850C2 /* CFString.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD9B0D3EC160007E4CA2 /* CFString.cpp */; }; + 456F67641AD46CE9002850C2 /* CFBundle.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 2695DD910D3EBFF6007E4CA2 /* CFBundle.cpp */; }; + 456F67651AD46CE9002850C2 /* PseudoTerminal.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AF67ABFF0D34604D0022D128 /* PseudoTerminal.cpp */; }; + 456F67671AD46CE9002850C2 /* DNBArch.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 264D5D571293835600ED4C01 /* DNBArch.cpp */; }; + 456F67691AD46CE9002850C2 /* DNBArchImplARM64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 266B5ECF1460A68200E43F0A /* DNBArchImplARM64.cpp */; }; + 456F676B1AD46CE9002850C2 /* CoreFoundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 26ACA3340D3E956300A2120B /* CoreFoundation.framework */; settings = {ATTRIBUTES = (Required, ); }; }; + 49D404621E39260F00570CDC /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 49D404611E39260F00570CDC /* Foundation.framework */; }; AF48558C1D75126800D19C07 /* StdStringExtractor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AF48558B1D75126800D19C07 /* StdStringExtractor.cpp */; }; AF48558D1D75127500D19C07 /* StdStringExtractor.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AF48558B1D75126800D19C07 /* StdStringExtractor.cpp */; }; - 23043C9E1D35DBFA00FC25CA /* StringConvert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 233B4EA81D2DB96A00E98261 /* StringConvert.cpp */; }; - 233B4EA91D2DB96A00E98261 /* StringConvert.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 233B4EA81D2DB96A00E98261 /* StringConvert.cpp */; }; - 26CE05BE115C36440022F371 /* SysSignal.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C638010C71334A0024798E /* SysSignal.cpp */; }; - 456F675D1AD46CE9002850C2 /* SysSignal.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26C638010C71334A0024798E /* SysSignal.cpp */; }; - 26CE05AE115C36320022F371 /* dbgnub-mig.defs in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E80C71334A0024798E /* dbgnub-mig.defs */; settings = {ATTRIBUTES = (Client, Server, ); }; }; - 456F674D1AD46CE9002850C2 /* dbgnub-mig.defs in Sources */ = {isa = PBXBuildFile; fileRef = 26C637E80C71334A0024798E /* dbgnub-mig.defs */; settings = {ATTRIBUTES = (Client, 
Server, ); }; }; - 26CE05A9115C36250022F371 /* debugserver.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A02918114AB9240029C479 /* debugserver.cpp */; }; - 456F67481AD46CE9002850C2 /* debugserver.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 26A02918114AB9240029C479 /* debugserver.cpp */; }; + AF588449206077BD00A0CB5A /* SocketAddress.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D6631CA81E848FE9006A7B11 /* SocketAddress.cpp */; }; + AFA3FCA11E39984900218D5E /* Foundation.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 49D404611E39260F00570CDC /* Foundation.framework */; }; + AFEC3364194A8B0B00FF05C6 /* Genealogy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = AFEC3363194A8B0B00FF05C6 /* Genealogy.cpp */; }; + D6631CA91E848FE9006A7B11 /* SocketAddress.cpp in Sources */ = {isa = PBXBuildFile; fileRef = D6631CA81E848FE9006A7B11 /* SocketAddress.cpp */; }; /* End PBXBuildFile section */ /* Begin PBXFileReference section */ - 23562ED51D342A5A00AB2BD4 /* ActivityStore.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ActivityStore.cpp; sourceTree = ""; }; + 2307CCCC1D4A5DAE0016ABC0 /* CMakeLists.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = CMakeLists.txt; sourceTree = ""; }; + 233B4EA51D2DB54300E98261 /* JSON.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = JSON.cpp; sourceTree = ""; }; + 233B4EA61D2DB54300E98261 /* JSON.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = JSON.h; sourceTree = ""; }; + 233B4EA81D2DB96A00E98261 /* StringConvert.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = StringConvert.cpp; path = ../../../source/Host/common/StringConvert.cpp; sourceTree = ""; }; + 23562ECF1D34110D00AB2BD4 /* DarwinLogTypes.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DarwinLogTypes.h; sourceTree = ""; }; + 23562ED01D3424DF00AB2BD4 /* LogMessageOsLog.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogMessageOsLog.cpp; sourceTree = ""; }; + 23562ED11D3424DF00AB2BD4 /* LogMessageOsLog.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogMessageOsLog.h; sourceTree = ""; }; 23562ED41D3426DD00AB2BD4 /* ActivityStore.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ActivityStore.h; sourceTree = ""; }; + 23562ED51D342A5A00AB2BD4 /* ActivityStore.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ActivityStore.cpp; sourceTree = ""; }; + 23562ED81D342B0000AB2BD4 /* LogMessage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogMessage.cpp; sourceTree = ""; }; + 237821AD1D4917D20028B7A1 /* CMakeLists.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = CMakeLists.txt; sourceTree = ""; }; + 237821AE1D4917D20028B7A1 /* LogFilterExactMatch.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilterExactMatch.cpp; sourceTree = ""; }; + 237821AF1D4917D20028B7A1 /* LogFilterExactMatch.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilterExactMatch.h; sourceTree = ""; }; + 23AC04C41D2F41A00072351D /* LogFilter.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = 
sourcecode.cpp.cpp; path = LogFilter.cpp; sourceTree = ""; }; + 23AC04C51D2F41A00072351D /* LogFilter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilter.h; sourceTree = ""; }; + 23AC04C81D2F42250072351D /* LogFilterChain.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilterChain.cpp; sourceTree = ""; }; + 23AC04C91D2F42250072351D /* LogFilterChain.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilterChain.h; sourceTree = ""; }; + 23AC04CC1D2F42F10072351D /* DarwinLogInterfaces.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DarwinLogInterfaces.h; sourceTree = ""; }; + 23AC04CD1D2F58AF0072351D /* LogFilterRegex.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilterRegex.cpp; sourceTree = ""; }; + 23AC04CE1D2F58AF0072351D /* LogFilterRegex.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilterRegex.h; sourceTree = ""; }; + 23AC04D11D2F60130072351D /* LogMessage.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LogMessage.h; sourceTree = ""; }; + 23AE72E21D25DECF00945BCE /* DarwinLogCollector.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DarwinLogCollector.cpp; sourceTree = ""; }; + 23AE72E31D25DECF00945BCE /* DarwinLogCollector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DarwinLogCollector.h; sourceTree = ""; }; 23AE72E61D25DEFB00945BCE /* ActivityStreamSPI.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ActivityStreamSPI.h; sourceTree = ""; }; + 23CF6F5E1D28A3760088ADC9 /* DarwinLogEvent.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DarwinLogEvent.h; sourceTree = ""; }; + 23D1B0271D497E8B00FF831B /* OsLogger.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = OsLogger.cpp; sourceTree = ""; }; + 23D1B0281D497E8B00FF831B /* OsLogger.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = OsLogger.h; sourceTree = ""; }; + 260828DE0CBAF7F400F95054 /* DNBRuntimeAction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBRuntimeAction.h; sourceTree = ""; }; + 260E7331114BFFE600D1DFB3 /* DNBThreadResumeActions.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBThreadResumeActions.cpp; sourceTree = ""; }; + 260E7332114BFFE600D1DFB3 /* DNBThreadResumeActions.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBThreadResumeActions.h; sourceTree = ""; }; + 260FC7320E5B290400043FC9 /* debugnub-exports */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "debugnub-exports"; sourceTree = SOURCE_ROOT; }; + 26203D1C1641EFB200A662F7 /* com.apple.debugserver.applist.internal.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.applist.internal.plist; sourceTree = ""; }; + 26203D1D1641EFB200A662F7 /* com.apple.debugserver.internal.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.internal.plist; sourceTree = ""; }; + 26242C390DDBD33C0054A4CC /* 
debugserver-entitlements.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = "debugserver-entitlements.plist"; sourceTree = ""; }; + 264D5D571293835600ED4C01 /* DNBArch.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArch.cpp; sourceTree = ""; }; + 264F679A1B2F9EB200140093 /* JSONGenerator.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = JSONGenerator.h; sourceTree = ""; }; + 26593A060D4931CC001C9FE3 /* ChangeLog */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = ChangeLog; sourceTree = ""; }; + 266B5ECF1460A68200E43F0A /* DNBArchImplARM64.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArchImplARM64.cpp; sourceTree = ""; }; + 266B5ED01460A68200E43F0A /* DNBArchImplARM64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBArchImplARM64.h; sourceTree = ""; }; + 2672DBEE0EEF446700E92059 /* PThreadMutex.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PThreadMutex.cpp; sourceTree = ""; }; + 2675D4220CCEB705000F49AF /* DNBArchImpl.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; name = DNBArchImpl.cpp; path = arm/DNBArchImpl.cpp; sourceTree = ""; }; + 2675D4230CCEB705000F49AF /* DNBArchImpl.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; name = DNBArchImpl.h; path = arm/DNBArchImpl.h; sourceTree = ""; }; 2695DD910D3EBFF6007E4CA2 /* CFBundle.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CFBundle.cpp; sourceTree = ""; }; 2695DD920D3EBFF6007E4CA2 /* CFBundle.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CFBundle.h; sourceTree = ""; }; - 2695DD9B0D3EC160007E4CA2 /* CFString.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CFString.cpp; sourceTree = ""; }; 2695DD9A0D3EC160007E4CA2 /* CFString.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = CFString.h; sourceTree = ""; }; - 26C637E70C71334A0024798E /* CFUtils.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = CFUtils.h; sourceTree = ""; }; - 2307CCCC1D4A5DAE0016ABC0 /* CMakeLists.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = CMakeLists.txt; sourceTree = ""; }; - 237821AD1D4917D20028B7A1 /* CMakeLists.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = CMakeLists.txt; sourceTree = ""; }; - 26593A060D4931CC001C9FE3 /* ChangeLog */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = ChangeLog; sourceTree = ""; }; + 2695DD9B0D3EC160007E4CA2 /* CFString.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = CFString.cpp; sourceTree = ""; }; + 269E8DF8164B2ED200AD65F6 /* com.apple.debugserver.posix.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.posix.plist; sourceTree = ""; }; + 26A02918114AB9240029C479 /* debugserver.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = debugserver.cpp; sourceTree = ""; }; + 26A4BAED0D498B7D00A9BEAB /* com.apple.debugserver.plist */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.plist; sourceTree = ""; }; + 26A68F7D0D104EC800665A9E /* RNBContext.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBContext.h; sourceTree = ""; }; + 26A68F7E0D104EC800665A9E /* RNBContext.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBContext.cpp; sourceTree = ""; }; + 26A68FAF0D1054DA00665A9E /* RNBSocket.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBSocket.h; sourceTree = ""; }; + 26A68FB00D1054DA00665A9E /* RNBSocket.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBSocket.cpp; sourceTree = ""; }; + 26A68FD50D10574500665A9E /* RNBRemote.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBRemote.h; sourceTree = ""; }; + 26A68FD60D10574500665A9E /* RNBRemote.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBRemote.cpp; sourceTree = ""; }; + 26A8FE1E0D11A77B00203048 /* DNBTimer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBTimer.h; sourceTree = ""; }; 26ACA3340D3E956300A2120B /* CoreFoundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = CoreFoundation.framework; path = System/Library/Frameworks/CoreFoundation.framework; sourceTree = SDKROOT; }; + 26B67DE00EE9BC30006C8BC0 /* MachTask.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MachTask.h; sourceTree = ""; }; + 26B67DE10EE9BC30006C8BC0 /* MachTask.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MachTask.mm; sourceTree = ""; }; 26C637D60C71334A0024798E /* DNB.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = DNB.cpp; sourceTree = ""; }; 26C637D70C71334A0024798E /* DNB.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNB.h; sourceTree = ""; }; - 264D5D571293835600ED4C01 /* DNBArch.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArch.cpp; sourceTree = ""; }; 26C637D80C71334A0024798E /* DNBArch.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNBArch.h; sourceTree = ""; }; - 2675D4220CCEB705000F49AF /* DNBArchImpl.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; name = DNBArchImpl.cpp; path = arm/DNBArchImpl.cpp; sourceTree = ""; }; - 26C637FB0C71334A0024798E /* DNBArchImpl.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArchImpl.cpp; sourceTree = ""; }; - 2675D4230CCEB705000F49AF /* DNBArchImpl.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; name = DNBArchImpl.h; path = arm/DNBArchImpl.h; sourceTree = ""; }; - 26C637FC0C71334A0024798E /* DNBArchImpl.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNBArchImpl.h; sourceTree = ""; }; - 266B5ECF1460A68200E43F0A /* DNBArchImplARM64.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArchImplARM64.cpp; sourceTree = ""; }; - 266B5ED01460A68200E43F0A /* DNBArchImplARM64.h */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBArchImplARM64.h; sourceTree = ""; }; - 26C637EA0C71334A0024798E /* DNBArchImplI386.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArchImplI386.cpp; sourceTree = ""; }; - 26C637EB0C71334A0024798E /* DNBArchImplI386.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNBArchImplI386.h; sourceTree = ""; }; - 26CF99A21142EB7400011AAB /* DNBArchImplX86_64.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArchImplX86_64.cpp; sourceTree = ""; }; - 26CF99A31142EB7400011AAB /* DNBArchImplX86_64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBArchImplX86_64.h; sourceTree = ""; }; 26C637D90C71334A0024798E /* DNBBreakpoint.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = DNBBreakpoint.cpp; sourceTree = ""; }; 26C637DA0C71334A0024798E /* DNBBreakpoint.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNBBreakpoint.h; sourceTree = ""; }; 26C637DB0C71334A0024798E /* DNBDataRef.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = DNBDataRef.cpp; sourceTree = ""; }; @@ -143,42 +177,14 @@ 26C637E10C71334A0024798E /* DNBLog.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNBLog.h; sourceTree = ""; }; 26C637E20C71334A0024798E /* DNBRegisterInfo.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = DNBRegisterInfo.cpp; sourceTree = ""; }; 26C637E30C71334A0024798E /* DNBRegisterInfo.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNBRegisterInfo.h; sourceTree = ""; }; - 260828DE0CBAF7F400F95054 /* DNBRuntimeAction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBRuntimeAction.h; sourceTree = ""; }; - 260E7331114BFFE600D1DFB3 /* DNBThreadResumeActions.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBThreadResumeActions.cpp; sourceTree = ""; }; - 260E7332114BFFE600D1DFB3 /* DNBThreadResumeActions.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBThreadResumeActions.h; sourceTree = ""; }; - 26A8FE1E0D11A77B00203048 /* DNBTimer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBTimer.h; sourceTree = ""; }; - 23AE72E21D25DECF00945BCE /* DarwinLogCollector.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DarwinLogCollector.cpp; sourceTree = ""; }; - 23AE72E31D25DECF00945BCE /* DarwinLogCollector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DarwinLogCollector.h; sourceTree = ""; }; - 23CF6F5E1D28A3760088ADC9 /* DarwinLogEvent.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DarwinLogEvent.h; sourceTree = ""; }; - 23AC04CC1D2F42F10072351D /* DarwinLogInterfaces.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DarwinLogInterfaces.h; sourceTree = ""; }; - 23562ECF1D34110D00AB2BD4 /* DarwinLogTypes.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = DarwinLogTypes.h; sourceTree = ""; }; - 49D404611E39260F00570CDC /* 
Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; - AFEC3363194A8B0B00FF05C6 /* Genealogy.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Genealogy.cpp; sourceTree = ""; }; - AF0934BA18E12B92005A11FD /* Genealogy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Genealogy.h; sourceTree = ""; }; - AF0934BB18E12B92005A11FD /* GenealogySPI.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = GenealogySPI.h; sourceTree = ""; }; - 233B4EA51D2DB54300E98261 /* JSON.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = JSON.cpp; sourceTree = ""; }; - 233B4EA61D2DB54300E98261 /* JSON.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = JSON.h; sourceTree = ""; }; - 264F679A1B2F9EB200140093 /* JSONGenerator.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = JSONGenerator.h; sourceTree = ""; }; - 23AC04C41D2F41A00072351D /* LogFilter.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilter.cpp; sourceTree = ""; }; - 23AC04C51D2F41A00072351D /* LogFilter.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilter.h; sourceTree = ""; }; - 23AC04C81D2F42250072351D /* LogFilterChain.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilterChain.cpp; sourceTree = ""; }; - 23AC04C91D2F42250072351D /* LogFilterChain.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilterChain.h; sourceTree = ""; }; - 237821AE1D4917D20028B7A1 /* LogFilterExactMatch.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilterExactMatch.cpp; sourceTree = ""; }; - 237821AF1D4917D20028B7A1 /* LogFilterExactMatch.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilterExactMatch.h; sourceTree = ""; }; - 23AC04CD1D2F58AF0072351D /* LogFilterRegex.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogFilterRegex.cpp; sourceTree = ""; }; - 23AC04CE1D2F58AF0072351D /* LogFilterRegex.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogFilterRegex.h; sourceTree = ""; }; - 23562ED81D342B0000AB2BD4 /* LogMessage.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogMessage.cpp; sourceTree = ""; }; - 23AC04D11D2F60130072351D /* LogMessage.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = LogMessage.h; sourceTree = ""; }; - 23562ED01D3424DF00AB2BD4 /* LogMessageOsLog.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = LogMessageOsLog.cpp; sourceTree = ""; }; - 23562ED11D3424DF00AB2BD4 /* LogMessageOsLog.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LogMessageOsLog.h; sourceTree = ""; }; + 26C637E70C71334A0024798E /* CFUtils.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = CFUtils.h; sourceTree = ""; }; + 26C637E80C71334A0024798E /* dbgnub-mig.defs */ = {isa = 
PBXFileReference; explicitFileType = sourcecode.mig; fileEncoding = 30; path = "dbgnub-mig.defs"; sourceTree = ""; }; + 26C637EA0C71334A0024798E /* DNBArchImplI386.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArchImplI386.cpp; sourceTree = ""; }; + 26C637EB0C71334A0024798E /* DNBArchImplI386.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = DNBArchImplI386.h; sourceTree = ""; }; 26C637EE0C71334A0024798E /* MachException.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = MachException.cpp; sourceTree = ""; }; 26C637EF0C71334A0024798E /* MachException.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = MachException.h; sourceTree = ""; }; - 26C637F10C71334A0024798E /* MachProcess.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = MachProcess.h; sourceTree = ""; }; 26C637F00C71334A0024798E /* MachProcess.mm */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.objcpp; path = MachProcess.mm; sourceTree = ""; }; - 49F530111331519C008956F6 /* MachRegisterStatesI386.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MachRegisterStatesI386.h; sourceTree = ""; }; - 49F5301213316D7F008956F6 /* MachRegisterStatesX86_64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MachRegisterStatesX86_64.h; sourceTree = ""; }; - 26B67DE00EE9BC30006C8BC0 /* MachTask.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MachTask.h; sourceTree = ""; }; - 26B67DE10EE9BC30006C8BC0 /* MachTask.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = MachTask.mm; sourceTree = ""; }; + 26C637F10C71334A0024798E /* MachProcess.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = MachProcess.h; sourceTree = ""; }; 26C637F20C71334A0024798E /* MachThread.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = MachThread.cpp; sourceTree = ""; }; 26C637F30C71334A0024798E /* MachThread.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = MachThread.h; sourceTree = ""; }; 26C637F40C71334A0024798E /* MachThreadList.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = MachThreadList.cpp; sourceTree = ""; }; @@ -187,45 +193,35 @@ 26C637F70C71334A0024798E /* MachVMMemory.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = MachVMMemory.h; sourceTree = ""; }; 26C637F80C71334A0024798E /* MachVMRegion.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = MachVMRegion.cpp; sourceTree = ""; }; 26C637F90C71334A0024798E /* MachVMRegion.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = MachVMRegion.h; sourceTree = ""; }; - 23D1B0271D497E8B00FF831B /* OsLogger.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = OsLogger.cpp; sourceTree = ""; }; - 23D1B0281D497E8B00FF831B /* OsLogger.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = OsLogger.h; sourceTree = ""; }; 26C637FD0C71334A0024798E /* PThreadCondition.h */ = {isa = 
PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = PThreadCondition.h; sourceTree = ""; }; 26C637FE0C71334A0024798E /* PThreadEvent.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = PThreadEvent.cpp; sourceTree = ""; }; 26C637FF0C71334A0024798E /* PThreadEvent.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = PThreadEvent.h; sourceTree = ""; }; - 2672DBEE0EEF446700E92059 /* PThreadMutex.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PThreadMutex.cpp; sourceTree = ""; }; 26C638000C71334A0024798E /* PThreadMutex.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = PThreadMutex.h; sourceTree = ""; }; - AF67ABFF0D34604D0022D128 /* PseudoTerminal.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PseudoTerminal.cpp; sourceTree = ""; }; - AF67AC000D34604D0022D128 /* PseudoTerminal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PseudoTerminal.h; sourceTree = ""; }; - 26A68F7E0D104EC800665A9E /* RNBContext.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBContext.cpp; sourceTree = ""; }; - 26A68F7D0D104EC800665A9E /* RNBContext.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBContext.h; sourceTree = ""; }; - 26E6B9DA0D1329010037ECDD /* RNBDefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBDefs.h; sourceTree = ""; }; - 26A68FD60D10574500665A9E /* RNBRemote.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBRemote.cpp; sourceTree = ""; }; - 26A68FD50D10574500665A9E /* RNBRemote.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBRemote.h; sourceTree = ""; }; - EF8878A00D9C797C001831DA /* RNBServices.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBServices.cpp; sourceTree = ""; }; - EF88789F0D9C797C001831DA /* RNBServices.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBServices.h; sourceTree = ""; }; - 26A68FB00D1054DA00665A9E /* RNBSocket.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBSocket.cpp; sourceTree = ""; }; - 26A68FAF0D1054DA00665A9E /* RNBSocket.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBSocket.h; sourceTree = ""; }; - D6631CA81E848FE9006A7B11 /* SocketAddress.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = SocketAddress.cpp; path = ../../source/Host/common/SocketAddress.cpp; sourceTree = ""; }; - AF48558B1D75126800D19C07 /* StdStringExtractor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = StdStringExtractor.cpp; sourceTree = ""; }; - 233B4EA81D2DB96A00E98261 /* StringConvert.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = StringConvert.cpp; path = ../../../source/Host/common/StringConvert.cpp; sourceTree = ""; }; 26C638010C71334A0024798E /* SysSignal.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = SysSignal.cpp; sourceTree = ""; }; 
26C638020C71334A0024798E /* SysSignal.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = SysSignal.h; sourceTree = ""; }; 26C638050C71334A0024798E /* TTYState.cpp */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.cpp.cpp; path = TTYState.cpp; sourceTree = ""; }; 26C638060C71334A0024798E /* TTYState.h */ = {isa = PBXFileReference; fileEncoding = 30; lastKnownFileType = sourcecode.c.h; path = TTYState.h; sourceTree = ""; }; - 26203D1C1641EFB200A662F7 /* com.apple.debugserver.applist.internal.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.applist.internal.plist; sourceTree = ""; }; - EF88788B0D9C7558001831DA /* com.apple.debugserver.applist.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.applist.plist; sourceTree = ""; }; - 26203D1D1641EFB200A662F7 /* com.apple.debugserver.internal.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.internal.plist; sourceTree = ""; }; - 26A4BAED0D498B7D00A9BEAB /* com.apple.debugserver.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.plist; sourceTree = ""; }; - 269E8DF8164B2ED200AD65F6 /* com.apple.debugserver.posix.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.posix.plist; sourceTree = ""; }; - AF949ED620605DC2002A91F9 /* com.apple.internal.xpc.remote.debugserver.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = com.apple.internal.xpc.remote.debugserver.plist; sourceTree = ""; }; - 26C637E80C71334A0024798E /* dbgnub-mig.defs */ = {isa = PBXFileReference; explicitFileType = sourcecode.mig; fileEncoding = 30; path = "dbgnub-mig.defs"; sourceTree = ""; }; - 260FC7320E5B290400043FC9 /* debugnub-exports */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "debugnub-exports"; sourceTree = SOURCE_ROOT; }; 26CE0594115C31C20022F371 /* debugserver */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = debugserver; sourceTree = BUILT_PRODUCTS_DIR; }; - 26242C390DDBD33C0054A4CC /* debugserver-entitlements.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = "debugserver-entitlements.plist"; sourceTree = ""; }; - AF61C60418F75ABC00B48D9D /* debugserver-macosx-entitlements.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = "debugserver-macosx-entitlements.plist"; sourceTree = ""; }; + 26CF99A21142EB7400011AAB /* DNBArchImplX86_64.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = DNBArchImplX86_64.cpp; sourceTree = ""; }; + 26CF99A31142EB7400011AAB /* DNBArchImplX86_64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = DNBArchImplX86_64.h; sourceTree = ""; }; + 26E6B9DA0D1329010037ECDD /* RNBDefs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBDefs.h; sourceTree = ""; }; 456F67721AD46CE9002850C2 /* debugserver-nonui */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "debugserver-nonui"; sourceTree = BUILT_PRODUCTS_DIR; }; - 26A02918114AB9240029C479 /* debugserver.cpp */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = debugserver.cpp; sourceTree = ""; }; + 49D404611E39260F00570CDC /* Foundation.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Foundation.framework; path = System/Library/Frameworks/Foundation.framework; sourceTree = SDKROOT; }; + 49F530111331519C008956F6 /* MachRegisterStatesI386.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MachRegisterStatesI386.h; sourceTree = ""; }; + 49F5301213316D7F008956F6 /* MachRegisterStatesX86_64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = MachRegisterStatesX86_64.h; sourceTree = ""; }; 9457ECF61419864100DFE7D8 /* stack_logging.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = stack_logging.h; sourceTree = ""; }; + AF0934BA18E12B92005A11FD /* Genealogy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Genealogy.h; sourceTree = ""; }; + AF0934BB18E12B92005A11FD /* GenealogySPI.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = GenealogySPI.h; sourceTree = ""; }; + AF48558B1D75126800D19C07 /* StdStringExtractor.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = StdStringExtractor.cpp; sourceTree = ""; }; + AF61C60418F75ABC00B48D9D /* debugserver-macosx-entitlements.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = "debugserver-macosx-entitlements.plist"; sourceTree = ""; }; + AF67ABFF0D34604D0022D128 /* PseudoTerminal.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PseudoTerminal.cpp; sourceTree = ""; }; + AF67AC000D34604D0022D128 /* PseudoTerminal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PseudoTerminal.h; sourceTree = ""; }; + AF949ED620605DC2002A91F9 /* com.apple.internal.xpc.remote.debugserver.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = com.apple.internal.xpc.remote.debugserver.plist; sourceTree = ""; }; + AFEC3363194A8B0B00FF05C6 /* Genealogy.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Genealogy.cpp; sourceTree = ""; }; + D6631CA81E848FE9006A7B11 /* SocketAddress.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = SocketAddress.cpp; path = ../../source/Host/common/SocketAddress.cpp; sourceTree = ""; }; + EF88788B0D9C7558001831DA /* com.apple.debugserver.applist.plist */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text.plist.xml; path = com.apple.debugserver.applist.plist; sourceTree = ""; }; + EF88789F0D9C797C001831DA /* RNBServices.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RNBServices.h; sourceTree = ""; }; + EF8878A00D9C797C001831DA /* RNBServices.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = RNBServices.cpp; sourceTree = ""; }; /* End PBXFileReference section */ /* Begin PBXFrameworksBuildPhase section */ @@ -409,7 +405,6 @@ 2675D41C0CCEB6CF000F49AF /* arm */, 266B5ECE1460A68200E43F0A /* arm64 */, 26C637E90C71334A0024798E /* i386 */, - 26C637FA0C71334A0024798E /* ppc */, 26CF99A11142EB7400011AAB /* x86_64 */, 26C637E80C71334A0024798E /* dbgnub-mig.defs */, AFEC3363194A8B0B00FF05C6 /* Genealogy.cpp */, @@ -446,15 +441,6 
@@ path = i386; sourceTree = ""; }; - 26C637FA0C71334A0024798E /* ppc */ = { - isa = PBXGroup; - children = ( - 26C637FB0C71334A0024798E /* DNBArchImpl.cpp */, - 26C637FC0C71334A0024798E /* DNBArchImpl.h */, - ); - path = ppc; - sourceTree = ""; - }; 26CF99A11142EB7400011AAB /* x86_64 */ = { isa = PBXGroup; children = ( @@ -617,7 +603,6 @@ 26CE05BF115C364D0022F371 /* DNBArchImplX86_64.cpp in Sources */, 26CE05C0115C364F0022F371 /* DNBArchImplI386.cpp in Sources */, 26CE05C1115C36510022F371 /* DNBArchImpl.cpp in Sources */, - 26CE05C2115C36550022F371 /* DNBArchImpl.cpp in Sources */, 26CE05C5115C36590022F371 /* CFBundle.cpp in Sources */, 26CE05C3115C36580022F371 /* CFString.cpp in Sources */, 23562ED91D342B0000AB2BD4 /* LogMessage.cpp in Sources */, @@ -668,7 +653,6 @@ 456F67601AD46CE9002850C2 /* DNBArchImpl.cpp in Sources */, 23AC04C71D2F41A00072351D /* LogFilter.cpp in Sources */, 23043C9E1D35DBFA00FC25CA /* StringConvert.cpp in Sources */, - 456F67611AD46CE9002850C2 /* DNBArchImpl.cpp in Sources */, AF588449206077BD00A0CB5A /* SocketAddress.cpp in Sources */, 456F67621AD46CE9002850C2 /* CFString.cpp in Sources */, 23AC04CB1D2F42250072351D /* LogFilterChain.cpp in Sources */, @@ -863,6 +847,8 @@ "$(PROJECT_DIR)/resources/lldb-debugserver-Info.plist", "$(LLDB_ENERGY_LDFLAGS)", "$(LLDB_COMPRESSION_LDFLAGS)", + "-framework", + Security, ); OTHER_MIGFLAGS = "-I$(DERIVED_FILE_DIR)"; PRODUCT_NAME = debugserver; @@ -942,6 +928,8 @@ "$(PROJECT_DIR)/resources/lldb-debugserver-Info.plist", "$(LLDB_ENERGY_LDFLAGS)", "$(LLDB_COMPRESSION_LDFLAGS)", + "-framework", + Security, ); OTHER_MIGFLAGS = "-I$(DERIVED_FILE_DIR)"; PRODUCT_NAME = debugserver; @@ -1020,6 +1008,8 @@ "$(PROJECT_DIR)/resources/lldb-debugserver-Info.plist", "$(LLDB_ENERGY_LDFLAGS)", "$(LLDB_COMPRESSION_LDFLAGS)", + "-framework", + Security, ); OTHER_MIGFLAGS = "-I$(DERIVED_FILE_DIR)"; PRODUCT_NAME = debugserver; @@ -1147,9 +1137,7 @@ LLDB_ENERGY_CFLAGS = ""; "LLDB_ENERGY_CFLAGS[sdk=*.internal]" = "-DLLDB_ENERGY"; LLDB_ENERGY_LDFLAGS = "-lpmenergy -lpmsample"; - OTHER_CFLAGS = ( - "$(LLDB_ENERGY_CFLAGS)", - ); + OTHER_CFLAGS = "$(LLDB_ENERGY_CFLAGS)"; "OTHER_CFLAGS[sdk=iphoneos*][arch=*]" = ( "-Wparentheses", "-DOS_OBJECT_USE_OBJC=0", diff --git a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp index f99dbc48b128e..e5d4b05d987c1 100644 --- a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp +++ b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.cpp @@ -666,6 +666,112 @@ uint32_t DNBArchMachARM64::NumSupportedHardwareWatchpoints() { return g_num_supported_hw_watchpoints; } +uint32_t DNBArchMachARM64::NumSupportedHardwareBreakpoints() { + // Set the init value to something that will let us know that we need to + // autodetect how many breakpoints are supported dynamically... + static uint32_t g_num_supported_hw_breakpoints = UINT_MAX; + if (g_num_supported_hw_breakpoints == UINT_MAX) { + // Set this to zero in case we can't tell if there are any HW breakpoints + g_num_supported_hw_breakpoints = 0; + + size_t len; + uint32_t n = 0; + len = sizeof(n); + if (::sysctlbyname("hw.optional.breakpoint", &n, &len, NULL, 0) == 0) { + g_num_supported_hw_breakpoints = n; + DNBLogThreadedIf(LOG_THREAD, "hw.optional.breakpoint=%u", n); + } else { +// For AArch64 we would need to look at ID_AA64DFR0_EL1 but debugserver runs in +// EL0 so it can't access that reg. The kernel should have filled in the +// sysctls based on it though. 
+#if defined(__arm__) + uint32_t register_DBGDIDR; + + asm("mrc p14, 0, %0, c0, c0, 0" : "=r"(register_DBGDIDR)); + uint32_t numWRPs = bits(register_DBGDIDR, 31, 28); + // Zero is reserved for the WRP count, so don't increment it if it is zero + if (numWRPs > 0) + numWRPs++; + g_num_supported_hw_breakpoints = numWRPs; + DNBLogThreadedIf(LOG_THREAD, + "Number of supported hw breakpoint via asm(): %d", + g_num_supported_hw_breakpoints); +#endif + } + } + return g_num_supported_hw_breakpoints; +} + +uint32_t DNBArchMachARM64::EnableHardwareBreakpoint(nub_addr_t addr, + nub_size_t size, + bool also_set_on_task) { + DNBLogThreadedIf(LOG_WATCHPOINTS, + "DNBArchMachARM64::EnableHardwareBreakpoint(addr = " + "0x%8.8llx, size = %zu)", + (uint64_t)addr, size); + + const uint32_t num_hw_breakpoints = NumSupportedHardwareBreakpoints(); + + nub_addr_t aligned_bp_address = addr; + uint32_t control_value = 0; + + switch (size) { + case 2: + control_value = (0x3 << 5) | 7; + aligned_bp_address &= ~1; + break; + case 4: + control_value = (0xfu << 5) | 7; + aligned_bp_address &= ~3; + break; + }; + + // Read the debug state + kern_return_t kret = GetDBGState(false); + if (kret == KERN_SUCCESS) { + // Check to make sure we have the needed hardware support + uint32_t i = 0; + + for (i = 0; i < num_hw_breakpoints; ++i) { + if ((m_state.dbg.__bcr[i] & BCR_ENABLE) == 0) + break; // We found an available hw breakpoint slot (in i) + } + + // See if we found an available hw breakpoint slot above + if (i < num_hw_breakpoints) { + m_state.dbg.__bvr[i] = aligned_bp_address; + m_state.dbg.__bcr[i] = control_value; + + DNBLogThreadedIf(LOG_WATCHPOINTS, + "DNBArchMachARM64::EnableHardwareBreakpoint() " + "adding breakpoint on address 0x%llx with control " + "register value 0x%x", + (uint64_t)m_state.dbg.__bvr[i], + (uint32_t)m_state.dbg.__bcr[i]); + + // The kernel will set the MDE_ENABLE bit in the MDSCR_EL1 for us + // automatically, don't need to do it here. + kret = SetDBGState(also_set_on_task); + + DNBLogThreadedIf(LOG_WATCHPOINTS, + "DNBArchMachARM64::" + "EnableHardwareBreakpoint() " + "SetDBGState() => 0x%8.8x.", + kret); + + if (kret == KERN_SUCCESS) + return i; + } else { + DNBLogThreadedIf(LOG_WATCHPOINTS, + "DNBArchMachARM64::" + "EnableHardwareBreakpoint(): All " + "hardware resources (%u) are in use.", + num_hw_breakpoints); + } + } + return INVALID_NUB_HW_INDEX; +} + uint32_t DNBArchMachARM64::EnableHardwareWatchpoint(nub_addr_t addr, nub_size_t size, bool read, bool write, @@ -905,6 +1011,32 @@ bool DNBArchMachARM64::DisableHardwareWatchpoint_helper(uint32_t hw_index, return (kret == KERN_SUCCESS); } +bool DNBArchMachARM64::DisableHardwareBreakpoint(uint32_t hw_index, + bool also_set_on_task) { + kern_return_t kret = GetDBGState(false); + if (kret != KERN_SUCCESS) + return false; + + const uint32_t num_hw_points = NumSupportedHardwareBreakpoints(); + if (hw_index >= num_hw_points) + return false; + + m_disabled_breakpoints[hw_index].addr = m_state.dbg.__bvr[hw_index]; + m_disabled_breakpoints[hw_index].control = m_state.dbg.__bcr[hw_index]; + + m_state.dbg.__bcr[hw_index] = 0; + DNBLogThreadedIf(LOG_WATCHPOINTS, + "DNBArchMachARM64::" + "DisableHardwareBreakpoint( %u ) - WVR%u = " + "0x%8.8llx BCR%u = 0x%8.8llx", + hw_index, hw_index, (uint64_t)m_state.dbg.__bvr[hw_index], + hw_index, (uint64_t)m_state.dbg.__bcr[hw_index]); + + kret = SetDBGState(also_set_on_task); + + return (kret == KERN_SUCCESS); +} + // This is for checking the Byte Address Select bits in the DBRWCRn_EL1 control // register. 
// Returns -1 if the trailing bit patterns are not one of: diff --git a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.h b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.h index ea4efa48d0260..fafcb73837b72 100644 --- a/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.h +++ b/lldb/tools/debugserver/source/MacOSX/arm64/DNBArchImplARM64.h @@ -26,10 +26,12 @@ class DNBArchMachARM64 : public DNBArchProtocol { DNBArchMachARM64(MachThread *thread) : m_thread(thread), m_state(), m_disabled_watchpoints(), - m_watchpoint_hw_index(-1), m_watchpoint_did_occur(false), + m_disabled_breakpoints(), m_watchpoint_hw_index(-1), + m_watchpoint_did_occur(false), m_watchpoint_resume_single_step_enabled(false), m_saved_register_states() { m_disabled_watchpoints.resize(16); + m_disabled_breakpoints.resize(16); memset(&m_dbg_save, 0, sizeof(m_dbg_save)); } @@ -62,7 +64,13 @@ class DNBArchMachARM64 : public DNBArchProtocol { static const uint8_t *SoftwareBreakpointOpcode(nub_size_t byte_size); static uint32_t GetCPUType(); + virtual uint32_t NumSupportedHardwareBreakpoints(); virtual uint32_t NumSupportedHardwareWatchpoints(); + + virtual uint32_t EnableHardwareBreakpoint(nub_addr_t addr, nub_size_t size, + bool also_set_on_task); + virtual bool DisableHardwareBreakpoint(uint32_t hw_break_index, + bool also_set_on_task); virtual uint32_t EnableHardwareWatchpoint(nub_addr_t addr, nub_size_t size, bool read, bool write, bool also_set_on_task); @@ -229,10 +237,11 @@ class DNBArchMachARM64 : public DNBArchProtocol { State m_state; arm_debug_state64_t m_dbg_save; - // arm64 doesn't keep the disabled watchpoint values in the debug register - // context like armv7; + // arm64 doesn't keep the disabled watchpoint and breakpoint values in the + // debug register context like armv7; // we need to save them aside when we disable them temporarily. std::vector m_disabled_watchpoints; + std::vector m_disabled_breakpoints; // The following member variables should be updated atomically. int32_t m_watchpoint_hw_index; diff --git a/lldb/tools/lldb-vscode/lldb-vscode.cpp b/lldb/tools/lldb-vscode/lldb-vscode.cpp index 080ef5b40c01c..8c68dd0e70554 100644 --- a/lldb/tools/lldb-vscode/lldb-vscode.cpp +++ b/lldb/tools/lldb-vscode/lldb-vscode.cpp @@ -543,9 +543,6 @@ void request_attach(const llvm::json::Object &request) { return; } - const bool detatchOnError = GetBoolean(arguments, "detachOnError", false); - g_vsc.launch_info.SetDetachOnError(detatchOnError); - // Run any pre run LLDB commands the user specified in the launch.json g_vsc.RunPreRunCommands(); @@ -2821,7 +2818,7 @@ int main(int argc, char *argv[]) { } auto request_handlers = GetRequestHandlers(); uint32_t packet_idx = 0; - while (true) { + while (!g_vsc.sent_terminated_event) { std::string json = g_vsc.ReadJSON(); if (json.empty()) break; diff --git a/lldb/tools/lldb-vscode/package.json b/lldb/tools/lldb-vscode/package.json index dc27ab54900b3..2058edf685501 100644 --- a/lldb/tools/lldb-vscode/package.json +++ b/lldb/tools/lldb-vscode/package.json @@ -140,6 +140,11 @@ "description": "Commands executed just before the program is launched.", "default": [] }, + "launchCommands": { + "type": "array", + "description": "Custom commands that are executed instead of launching a process. A target will be created with the launch arguments prior to executing these commands. The commands may optionally create a new target and must perform a launch. 
A valid process must exist after these commands complete or the \"launch\" will fail.", + "default": [] + }, "stopCommands": { "type": "array", "description": "Commands executed each time the program stops.", diff --git a/lldb/unittests/Platform/PlatformDarwinTest.cpp b/lldb/unittests/Platform/PlatformDarwinTest.cpp index 06287c63227b6..20916f3cd1259 100644 --- a/lldb/unittests/Platform/PlatformDarwinTest.cpp +++ b/lldb/unittests/Platform/PlatformDarwinTest.cpp @@ -19,6 +19,7 @@ using namespace lldb_private; struct PlatformDarwinTester : public PlatformDarwin { public: + using PlatformDarwin::FindComponentInPath; using PlatformDarwin::FindXcodeContentsDirectoryInPath; static bool SDKSupportsModules(SDKType desired_type, const lldb_private::FileSpec &sdk_path) { @@ -132,3 +133,20 @@ TEST(PlatformDarwinTest, GetSDKNameForType) { EXPECT_EQ( "", PlatformDarwin::GetSDKNameForType(PlatformDarwin::SDKType::unknown)); } + +TEST(PlatformDarwinTest, FindComponentInPath) { + EXPECT_EQ("/path/to/foo", + PlatformDarwinTester::FindComponentInPath("/path/to/foo/", "foo")); + + EXPECT_EQ("/path/to/foo", + PlatformDarwinTester::FindComponentInPath("/path/to/foo", "foo")); + + EXPECT_EQ("/path/to/foobar", PlatformDarwinTester::FindComponentInPath( + "/path/to/foobar", "foo")); + + EXPECT_EQ("/path/to/foobar", PlatformDarwinTester::FindComponentInPath( + "/path/to/foobar", "bar")); + + EXPECT_EQ("", + PlatformDarwinTester::FindComponentInPath("/path/to/foo", "bar")); +} diff --git a/lldb/unittests/Utility/FileSpecTest.cpp b/lldb/unittests/Utility/FileSpecTest.cpp index c66edc4447978..690c5ae331ee2 100644 --- a/lldb/unittests/Utility/FileSpecTest.cpp +++ b/lldb/unittests/Utility/FileSpecTest.cpp @@ -441,3 +441,9 @@ TEST(FileSpecTest, Yaml) { EXPECT_EQ(deserialized.GetDirectory(), fs_windows.GetDirectory()); EXPECT_EQ(deserialized, fs_windows); } + +TEST(FileSpecTest, OperatorBool) { + EXPECT_FALSE(FileSpec()); + EXPECT_FALSE(FileSpec("")); + EXPECT_TRUE(FileSpec("/foo/bar")); +} diff --git a/lldb/utils/TableGen/LLDBPropertyDefEmitter.cpp b/lldb/utils/TableGen/LLDBPropertyDefEmitter.cpp index f36deeebf9065..e3522f2c7b2d3 100644 --- a/lldb/utils/TableGen/LLDBPropertyDefEmitter.cpp +++ b/lldb/utils/TableGen/LLDBPropertyDefEmitter.cpp @@ -35,8 +35,9 @@ static void emitProperty(Record *Property, raw_ostream &OS) { OS << ", "; // Emit the property type. + llvm::StringRef type = Property->getValueAsString("Type"); OS << "OptionValue::eType"; - OS << Property->getValueAsString("Type"); + OS << type; OS << ", "; // Emit the property's global value. @@ -46,11 +47,12 @@ static void emitProperty(Record *Property, raw_ostream &OS) { bool hasDefaultUnsignedValue = Property->getValue("HasDefaultUnsignedValue"); bool hasDefaultEnumValue = Property->getValue("HasDefaultEnumValue"); bool hasDefaultStringValue = Property->getValue("HasDefaultStringValue"); + bool hasElementType = Property->getValue("HasElementType"); // Guarantee that every property has a default value. assert((hasDefaultUnsignedValue || hasDefaultEnumValue || - hasDefaultStringValue) && - "Property must have a default value"); + hasDefaultStringValue || hasElementType) && + "Property must have a default value or an element type"); // Guarantee that no property has both a default unsigned value and a default // enum value, since they're bothed stored in the same field. 
@@ -72,11 +74,18 @@ static void emitProperty(Record *Property, raw_ostream &OS) { !(Property->getValueAsString("Type") == "Enum" && !hasDefaultEnumValue) && "Enum property must have a enum default value."); + // Guarantee that only arrays and dictionaries have an element type; + assert(((type != "Array" && type != "Dictionary") || hasElementType) && + "Only dictionaries and arrays can have an element type."); + // Emit the default uint value. if (hasDefaultUnsignedValue) { OS << std::to_string(Property->getValueAsInt("DefaultUnsignedValue")); } else if (hasDefaultEnumValue) { OS << Property->getValueAsString("DefaultEnumValue"); + } else if (hasElementType) { + OS << "OptionValue::eType"; + OS << Property->getValueAsString("ElementType"); } else { OS << "0"; } diff --git a/llvm-spirv/include/LLVMSPIRVOpts.h b/llvm-spirv/include/LLVMSPIRVOpts.h index 7ef9464455c7b..0848668033fbb 100644 --- a/llvm-spirv/include/LLVMSPIRVOpts.h +++ b/llvm-spirv/include/LLVMSPIRVOpts.h @@ -117,7 +117,7 @@ class TranslatorOpts { VersionNumber MaxVersion = VersionNumber::MaximumVersion; ExtensionsStatusMap ExtStatusMap; // SPIR-V to LLVM translation options - bool GenKernelArgNameMD; + bool GenKernelArgNameMD = false; std::unordered_map ExternalSpecialization; }; diff --git a/llvm-spirv/lib/SPIRV/SPIRVInternal.h b/llvm-spirv/lib/SPIRV/SPIRVInternal.h index 5fa18c1d56dda..0853281a2e89f 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVInternal.h +++ b/llvm-spirv/lib/SPIRV/SPIRVInternal.h @@ -57,6 +57,10 @@ using namespace SPIRV; using namespace llvm; +namespace llvm { +class IntrinsicInst; +} + namespace SPIRV { /// The LLVM/SPIR-V translator version used to fill the lower 16 bits of the @@ -936,6 +940,8 @@ template <> inline void SPIRVMap::init() { // Check if the module contains llvm.loop.* metadata bool hasLoopMetadata(const Module *M); +// check LLVM Intrinsics type(s) for validity +bool checkTypeForSPIRVExtendedInstLowering(IntrinsicInst *II, SPIRVModule *BM); } // namespace SPIRV #endif // SPIRV_SPIRVINTERNAL_H diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp index 3283646f305d7..30594a095aeaf 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp @@ -786,6 +786,44 @@ void SPIRVToLLVM::setLLVMLoopMetadata(const LoopInstType *LM, Metadata.push_back(llvm::MDNode::get(*Context, Parameters)); } } + if (LC & LoopControlPipelineEnableINTEL) { + Metadata.push_back(llvm::MDNode::get( + *Context, + getMetadataFromNameAndParameter("llvm.loop.intel.pipelining.enable", + LoopControlParameters[NumParam++]))); + assert(NumParam <= LoopControlParameters.size() && + "Missing loop control parameter!"); + } + if (LC & LoopControlLoopCoalesceINTEL) { + // If LoopCoalesce has no parameters + if (LoopControlParameters.empty()) { + Metadata.push_back(llvm::MDNode::get( + *Context, getMetadataFromName("llvm.loop.coalesce.enable"))); + } else { + Metadata.push_back(llvm::MDNode::get( + *Context, + getMetadataFromNameAndParameter("llvm.loop.coalesce.count", + LoopControlParameters[NumParam++]))); + } + assert(NumParam <= LoopControlParameters.size() && + "Missing loop control parameter!"); + } + if (LC & LoopControlMaxInterleavingINTEL) { + Metadata.push_back(llvm::MDNode::get( + *Context, + getMetadataFromNameAndParameter("llvm.loop.max_interleaving.count", + LoopControlParameters[NumParam++]))); + assert(NumParam <= LoopControlParameters.size() && + "Missing loop control parameter!"); + } + if (LC & LoopControlSpeculatedIterationsINTEL) { + 
Metadata.push_back(llvm::MDNode::get( + *Context, getMetadataFromNameAndParameter( + "llvm.loop.intel.speculated.iterations.count", + LoopControlParameters[NumParam++]))); + assert(NumParam <= LoopControlParameters.size() && + "Missing loop control parameter!"); + } llvm::MDNode *Node = llvm::MDNode::get(*Context, Metadata); // Set the first operand to refer itself @@ -2956,6 +2994,8 @@ void generateIntelFPGAAnnotation(const SPIRVEntry *E, Out << Literals[I] << ","; Out << Literals.back() << '}'; } + if (E->hasDecorate(DecorationForcePow2DepthINTEL, 0, &Result)) + Out << "{force_pow2_depth:" << Result << '}'; if (E->hasDecorate(DecorationUserSemantic)) Out << E->getDecorationStringLiteral(DecorationUserSemantic).front(); } @@ -3005,6 +3045,9 @@ void generateIntelFPGAAnnotationForStructMember( Out << Literals[I] << ","; Out << Literals.back() << '}'; } + if (E->hasMemberDecorate(DecorationForcePow2DepthINTEL, 0, MemberNumber, + &Result)) + Out << "{force_pow2_depth:" << Result << '}'; if (E->hasMemberDecorate(DecorationUserSemantic, 0, MemberNumber)) Out << E->getMemberDecorationStringLiteral(DecorationUserSemantic, diff --git a/llvm-spirv/lib/SPIRV/SPIRVUtil.cpp b/llvm-spirv/lib/SPIRV/SPIRVUtil.cpp index 386c957e10c20..af36354116b21 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVUtil.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVUtil.cpp @@ -51,6 +51,7 @@ #include "llvm/ADT/StringSwitch.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" @@ -1512,4 +1513,32 @@ bool hasLoopMetadata(const Module *M) { return false; } +// Returns true if type(s) and number of elements (if vector) is valid +bool checkTypeForSPIRVExtendedInstLowering(IntrinsicInst *II, SPIRVModule *BM) { + switch (II->getIntrinsicID()) { + case Intrinsic::fabs: + case Intrinsic::ceil: { + Type *Ty = II->getType(); + if (II->getArgOperand(0)->getType() != Ty) + return false; + int NumElems = 1; + if (Ty->isVectorTy()) { + NumElems = Ty->getVectorNumElements(); + Ty = cast(Ty)->getElementType(); + } + if ((!Ty->isFloatTy() && !Ty->isDoubleTy()) || + ((NumElems > 4) && (NumElems != 8) && (NumElems != 16))) { + BM->getErrorLog().checkError(false, SPIRVEC_InvalidFunctionCall, + II->getCalledValue()->getName().str(), "", + __FILE__, __LINE__); + return false; + } + break; + } + default: + break; + } + return true; +} + } // namespace SPIRV diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp index 4a140c6ebbc74..8b355974259eb 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp @@ -93,6 +93,11 @@ namespace SPIRV { cl::opt SPIRVMemToReg("spirv-mem2reg", cl::init(false), cl::desc("LLVM/SPIR-V translation enable mem2reg")); +cl::opt SPIRVAllowUnknownIntrinsics( + "spirv-allow-unknown-intrinsics", cl::init(false), + cl::desc("Unknown LLVM intrinsics will be translated as external function " + "calls in SPIR-V")); + static void foreachKernelArgMD( MDNode *MD, SPIRVFunction *BF, std::function @@ -478,7 +483,7 @@ SPIRVFunction *LLVMToSPIRV::transFunctionDecl(Function *F) { if (auto BF = getTranslatedValue(F)) return static_cast(BF); - if (F->isIntrinsic()) { + if (F->isIntrinsic() && !SPIRVAllowUnknownIntrinsics) { // We should not translate LLVM intrinsics as a function assert(none_of(F->user_begin(), F->user_end(), [this](User *U) { return getTranslatedValue(U); }) && @@ -851,6 +856,34 @@ 
LLVMToSPIRV::getLoopControl(const BranchInst *Branch, unsigned SafeLen = IVDep.getSafeLen(); for (auto &ArrayId : IVDep.getArrayVariables()) DependencyArrayParameters.emplace_back(ArrayId, SafeLen); + } else if (S == "llvm.loop.intel.pipelining.enable") { + BM->addExtension(ExtensionID::SPV_INTEL_fpga_loop_controls); + BM->addCapability(CapabilityFPGALoopControlsINTEL); + size_t I = getMDOperandAsInt(Node, 1); + Parameters.push_back(I); + LoopControl |= spv::LoopControlPipelineEnableINTEL; + } else if (S == "llvm.loop.coalesce.enable") { + BM->addExtension(ExtensionID::SPV_INTEL_fpga_loop_controls); + BM->addCapability(CapabilityFPGALoopControlsINTEL); + LoopControl |= spv::LoopControlLoopCoalesceINTEL; + } else if (S == "llvm.loop.coalesce.count") { + BM->addExtension(ExtensionID::SPV_INTEL_fpga_loop_controls); + BM->addCapability(CapabilityFPGALoopControlsINTEL); + size_t I = getMDOperandAsInt(Node, 1); + Parameters.push_back(I); + LoopControl |= spv::LoopControlLoopCoalesceINTEL; + } else if (S == "llvm.loop.max_interleaving.count") { + BM->addExtension(ExtensionID::SPV_INTEL_fpga_loop_controls); + BM->addCapability(CapabilityFPGALoopControlsINTEL); + size_t I = getMDOperandAsInt(Node, 1); + Parameters.push_back(I); + LoopControl |= spv::LoopControlMaxInterleavingINTEL; + } else if (S == "llvm.loop.intel.speculated.iterations.count") { + BM->addExtension(ExtensionID::SPV_INTEL_fpga_loop_controls); + BM->addCapability(CapabilityFPGALoopControlsINTEL); + size_t I = getMDOperandAsInt(Node, 1); + Parameters.push_back(I); + LoopControl |= spv::LoopControlSpeculatedIterationsINTEL; } } } @@ -1427,6 +1460,7 @@ tryParseIntelFPGAAnnotationString(StringRef AnnotatedCode) { .Case("max_replicates", DecorationMaxReplicatesINTEL) .Case("bank_bits", DecorationBankBitsINTEL) .Case("merge", DecorationMergeINTEL) + .Case("force_pow2_depth", DecorationForcePow2DepthINTEL) .Default(DecorationUserSemantic); if (Dec == DecorationUserSemantic) Value = AnnotatedCode.substr(From, To + 1); @@ -1494,6 +1528,7 @@ void addIntelFPGADecorations( // DecorationBankwidthINTEL // DecorationMaxPrivateCopiesINTEL // DecorationMaxReplicatesINTEL + // DecorationForcePow2DepthINTEL default: SPIRVWord Result = 0; StringRef(I.second).getAsInteger(10, Result); @@ -1548,6 +1583,7 @@ void addIntelFPGADecorationsForStructMember( // DecorationBankwidthINTEL // DecorationMaxPrivateCopiesINTEL // DecorationMaxReplicatesINTEL + // DecorationForcePow2DepthINTEL default: SPIRVWord Result = 0; StringRef(I.second).getAsInteger(10, Result); @@ -1590,6 +1626,24 @@ SPIRVValue *LLVMToSPIRV::transIntrinsicInst(IntrinsicInst *II, BM->getExtInstSetId(SPIRVEIS_OpenCL), OpenCLLIB::Sqrt, {transValue(II->getOperand(0), BB)}, BB); } + case Intrinsic::fabs: { + if (!checkTypeForSPIRVExtendedInstLowering(II, BM)) + break; + SPIRVWord ExtOp = OpenCLLIB::Fabs; + SPIRVType *STy = transType(II->getType()); + std::vector Ops(1, transValue(II->getArgOperand(0), BB)); + return BM->addExtInst(STy, BM->getExtInstSetId(SPIRVEIS_OpenCL), ExtOp, Ops, + BB); + } + case Intrinsic::ceil: { + if (!checkTypeForSPIRVExtendedInstLowering(II, BM)) + break; + SPIRVWord ExtOp = OpenCLLIB::Ceil; + SPIRVType *STy = transType(II->getType()); + std::vector Ops(1, transValue(II->getArgOperand(0), BB)); + return BM->addExtInst(STy, BM->getExtInstSetId(SPIRVEIS_OpenCL), ExtOp, Ops, + BB); + } case Intrinsic::ctlz: case Intrinsic::cttz: { SPIRVWord ExtOp = II->getIntrinsicID() == Intrinsic::ctlz ? 
OpenCLLIB::Clz @@ -1773,11 +1827,18 @@ SPIRVValue *LLVMToSPIRV::transIntrinsicInst(IntrinsicInst *II, case Intrinsic::dbg_label: return nullptr; default: - // Other LLVM intrinsics shouldn't get to SPIRV, because they - // can't be represented in SPIRV or not implemented yet. - BM->getErrorLog().checkError(false, SPIRVEC_InvalidFunctionCall, - II->getCalledValue()->getName().str(), "", - __FILE__, __LINE__); + if (SPIRVAllowUnknownIntrinsics) + return BM->addCallInst( + transFunctionDecl(II->getCalledFunction()), + transArguments(II, BB, + SPIRVEntry::createUnique(OpFunctionCall).get()), + BB); + else + // Other LLVM intrinsics shouldn't get to SPIRV, because they + // can't be represented in SPIRV or aren't implemented yet. + BM->getErrorLog().checkError(false, SPIRVEC_InvalidFunctionCall, + II->getCalledValue()->getName().str(), "", + __FILE__, __LINE__); } return nullptr; } diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDecorate.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDecorate.h index e976c51dbb8f0..9dea57d1404e6 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDecorate.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVDecorate.h @@ -153,6 +153,7 @@ class SPIRVDecorate : public SPIRVDecorateGeneric { case DecorationSimpleDualPortINTEL: case DecorationMergeINTEL: case DecorationBankBitsINTEL: + case DecorationForcePow2DepthINTEL: return getSet(ExtensionID::SPV_INTEL_fpga_memory_attributes); case DecorationReferencedIndirectlyINTEL: return getSet(ExtensionID::SPV_INTEL_function_pointers); @@ -252,6 +253,7 @@ class SPIRVMemberDecorate : public SPIRVDecorateGeneric { case DecorationSimpleDualPortINTEL: case DecorationMergeINTEL: case DecorationBankBitsINTEL: + case DecorationForcePow2DepthINTEL: return getSet(ExtensionID::SPV_INTEL_fpga_memory_attributes); case DecorationIOPipeStorageINTEL: return getSet(ExtensionID::SPV_INTEL_io_pipes); diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h index eed9a76cc032e..2fbd8d2603f1b 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVEnum.h @@ -363,6 +363,8 @@ template <> inline void SPIRVMap::init() { {CapabilityFPGAMemoryAttributesINTEL}); ADD_VEC_INIT(DecorationMergeINTEL, {CapabilityFPGAMemoryAttributesINTEL}); ADD_VEC_INIT(DecorationBankBitsINTEL, {CapabilityFPGAMemoryAttributesINTEL}); + ADD_VEC_INIT(DecorationForcePow2DepthINTEL, + {CapabilityFPGAMemoryAttributesINTEL}); ADD_VEC_INIT(DecorationReferencedIndirectlyINTEL, {CapabilityIndirectReferencesINTEL}); ADD_VEC_INIT(DecorationIOPipeStorageINTEL, {CapabilityIOPipeINTEL}); diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h index 5e59bd704bb07..f460ecea25fb0 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h @@ -410,6 +410,7 @@ inline bool isValid(spv::Decoration V) { case DecorationSinglepumpINTEL: case DecorationDoublepumpINTEL: case DecorationBankBitsINTEL: + case DecorationForcePow2DepthINTEL: case DecorationReferencedIndirectlyINTEL: return true; default: @@ -1065,6 +1066,10 @@ inline bool isValidLoopControlMask(SPIRVWord Mask) { ValidMask |= LoopControlInitiationIntervalINTEL; ValidMask |= LoopControlMaxConcurrencyINTEL; ValidMask |= LoopControlDependencyArrayINTEL; + ValidMask |= LoopControlPipelineEnableINTEL; + ValidMask |= LoopControlLoopCoalesceINTEL; + ValidMask |= LoopControlMaxInterleavingINTEL; + ValidMask |= LoopControlSpeculatedIterationsINTEL; return (Mask & 
~ValidMask) == 0; } diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h index 41c22c5342883..3d43fe1e10f9b 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h @@ -348,6 +348,7 @@ template <> inline void SPIRVMap::init() { add(DecorationSimpleDualPortINTEL, "SimpleDualPortINTEL"); add(DecorationMergeINTEL, "MergeINTEL"); add(DecorationBankBitsINTEL, "BankBitsINTEL"); + add(DecorationForcePow2DepthINTEL, "ForcePow2DepthINTEL"); add(DecorationReferencedIndirectlyINTEL, "ReferencedIndirectlyINTEL"); add(DecorationIOPipeStorageINTEL, "IOPipeStorageINTEL"); } diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/spirv.hpp b/llvm-spirv/lib/SPIRV/libSPIRV/spirv.hpp index be25d4754fb25..6f46dd7c975ee 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/spirv.hpp +++ b/llvm-spirv/lib/SPIRV/libSPIRV/spirv.hpp @@ -406,6 +406,7 @@ enum Decoration { DecorationSimpleDualPortINTEL = 5833, DecorationMergeINTEL = 5834, DecorationBankBitsINTEL = 5835, + DecorationForcePow2DepthINTEL = 5836, DecorationIOPipeStorageINTEL = 5944, DecorationMax = 0x7fffffff, }; @@ -504,6 +505,10 @@ enum LoopControlMask { LoopControlInitiationIntervalINTEL = 0x10000, LoopControlMaxConcurrencyINTEL = 0x20000, LoopControlDependencyArrayINTEL = 0x40000, + LoopControlPipelineEnableINTEL = 0x80000, + LoopControlLoopCoalesceINTEL = 0x100000, + LoopControlMaxInterleavingINTEL = 0x200000, + LoopControlSpeculatedIterationsINTEL = 0x400000, }; enum FunctionControlShift { diff --git a/llvm-spirv/test/AllowIntrinsics.ll b/llvm-spirv/test/AllowIntrinsics.ll new file mode 100644 index 0000000000000..98c6f9f2090c0 --- /dev/null +++ b/llvm-spirv/test/AllowIntrinsics.ll @@ -0,0 +1,38 @@ +; This test checks the translator command-line option that allows representing +; unknown LLVM intrinsics as external function calls in SPIR-V.
+; RUN: llvm-as %s -o %t.bc +; RUN: llvm-spirv -spirv-allow-unknown-intrinsics %t.bc -o %t.spv +; RUN: spirv-val %t.spv +; RUN: llvm-spirv %t.spv -to-text -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: llvm-spirv -r %t.spv -o %t.rev.bc +; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM + +; CHECK-LLVM: declare float @llvm.fma.f32(float, float, float) +; CHECK-SPIRV: LinkageAttributes "llvm.fma.f32" Import +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64" + +; Function Attrs: nounwind +define spir_func void @foo(float %a, float %b, float %c) #0 { +entry: + %0 = call float @llvm.fma.f32(float %a, float %b, float %c) + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.fma.f32(float, float, float) #1 + +attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } + +!opencl.enable.FP_CONTRACT = !{} +!opencl.spir.version = !{!0} +!opencl.ocl.version = !{!1} +!opencl.used.extensions = !{!2} +!opencl.used.optional.core.features = !{!3} +!opencl.compiler.options = !{!2} + +!0 = !{i32 1, i32 2} +!1 = !{i32 2, i32 0} +!2 = !{} +!3 = !{!"cl_doubles"} diff --git a/llvm-spirv/test/IntelFPGAMemoryAttributes.ll b/llvm-spirv/test/IntelFPGAMemoryAttributes.ll index 2ff855a1f5387..4c5ec817af931 100644 --- a/llvm-spirv/test/IntelFPGAMemoryAttributes.ll +++ b/llvm-spirv/test/IntelFPGAMemoryAttributes.ll @@ -1,3 +1,206 @@ +; LLVM IR generated by Intel SYCL Clang compiler (https://github.com/intel/llvm) + +; SYCL source code for this test: +; void numbanks_attr() { +; [[intelfpga::numbanks(16)]] int numbanks_var; +; +; [[intelfpga::numbanks(2)]] struct numbanks_st { +; int field; +; } s; +; s.field = 0; +; } +; +; template +; void templ_numbanks_attr() { +; [[intelfpga::numbanks(A)]] int templ_numbanks_var; +; +; [[intelfpga::numbanks(A)]] struct templ_numbanks_st { +; int field; +; } s; +; s.field = 0; +; } +; +; void register_attr() { +; [[intelfpga::register]] int register_var; +; +; [[intelfpga::register]] struct register_st { +; int field; +; } s; +; s.field = 0; +; } +; +; void memory_attr() { +; [[intelfpga::memory("MLAB")]] int memory_var[500]; +; +; [[intelfpga::memory("BLOCK_RAM")]] struct memory_st { +; int field[10][2]; +; } s; +; s.field[0][0] = {0}; +; } +; +; void bankwidth_attr() { +; [[intelfpga::bankwidth(8)]] int bankwidth_var; +; +; [[intelfpga::bankwidth(4)]] struct bankwidth_st { +; int field; +; } s; +; s.field = 0; +; } +; +; template +; void templ_bankwidth_attr() { +; [[intelfpga::bankwidth(A)]] int templ_bankwidth_var; +; +; [[intelfpga::bankwidth(A)]] struct templ_bankwidth_st { +; int field; +; } s; +; s.field = 0; +; } +; +; void private_copies_attr() { +; [[intelfpga::private_copies(4)]] int priv_copies_var; +; +; [[intelfpga::private_copies(2)]] struct priv_copies_st { +; int field; +; } s; +; s.field = 0; +; } +; +; template +; void templ_private_copies_attr() { +; [[intelfpga::private_copies(A)]] int templ_priv_copies_var; +; +; [[intelfpga::private_copies(A)]] struct templ_priv_copies_st { +; int field; +; } s; +; s.field = 0; +; } +; +; void singlepump_attr() { +; [[intelfpga::singlepump]] int singlepump_var; +; +; [[intelfpga::singlepump]] struct singlepump_st { +; int field; +; } s; +; s.field = 0; +; } +; +; 
void doublepump_attr() { +; [[intelfpga::doublepump]] int doublepump_var; +; +; [[intelfpga::doublepump]] struct doublepump_st { +; int field; +; } s; +; s.field = 0; +; } +; +; void merge_attr() { +; [[intelfpga::merge("foo", "depth")]] int merge_var; +; +; [[intelfpga::merge("bar", "width")]] struct merge_st { +; int field; +; } s; +; s.field = 0; +; } +; +; void max_replicates_attr() { +; [[intelfpga::max_replicates(4)]] int max_repl_var; +; +; [[intelfpga::max_replicates(2)]] struct max_repl_st { +; int field; +; } s; +; s.field = 0; +; } +; +; template +; void templ_max_replicates_attr() { +; [[intelfpga::max_replicates(A)]] int templ_max_repl_var; +; +; [[intelfpga::max_replicates(A)]] struct templ_max_repl_st { +; int field; +; } s; +; s.field = 0; +; } +; +; void simple_dual_port_attr() { +; [[intelfpga::simple_dual_port]] int simple_dual_port_var; +; +; [[intelfpga::simple_dual_port]] struct simple_dual_port_st { +; int field; +; } s; +; s.field = 0; +; } +; +; void bank_bits_attr() { +; [[intelfpga::numbanks(8), intelfpga::bank_bits(2, 1, 0)]] int bank_bits_var; +; +; [[intelfpga::bank_bits(2)]] struct bank_bits_st { +; int field; +; } s; +; s.field = 0; +; } +; +; template +; void templ_bank_bits_attr() { +; [[intelfpga::bank_bits(A, B)]] int templ_bank_bits_var; +; +; [[intelfpga::bank_bits(B)]] struct templ_bank_bits_st { +; int field; +; } s; +; s.field = 0; +; } +; +; void force_pow2_depth_attr() { +; [[intelfpga::force_pow2_depth(0)]] int fp2d_var; +; +; [[intelfpga::force_pow2_depth(1)]] struct fp2d_st { +; int field; +; } s; +; s.field = 0; +; } +; +; template +; void templ_force_pow2_depth_attr() { +; [[intelfpga::force_pow2_depth(A)]] int templ_fp2d_var; +; +; [[intelfpga::force_pow2_depth(A)]] struct templ_fp2d_st { +; int field; +; } s; +; s.field = 0; +; } +; +; template +; __attribute__((sycl_kernel)) void kernel_single_task(Func kernelFunc) { +; kernelFunc(); +; } +; +; int main() { +; kernel_single_task([]() { +; numbanks_attr(); +; templ_numbanks_attr<4>(); +; register_attr(); +; memory_attr(); +; bankwidth_attr(); +; templ_bankwidth_attr<16>(); +; private_copies_attr(); +; templ_private_copies_attr<8>(); +; singlepump_attr(); +; doublepump_attr(); +; merge_attr(); +; max_replicates_attr(); +; templ_max_replicates_attr<8>(); +; simple_dual_port_attr(); +; bank_bits_attr(); +; templ_bank_bits_attr<4, 5>(); +; force_pow2_depth_attr(); +; templ_force_pow2_depth_attr<1>(); +; }); +; return 0; +; } + +; LLVM IR compilation command: +; clang -cc1 -triple spir -disable-llvm-passes -fsycl-is-device -emit-llvm intel-fpga-local-var.cpp + ; RUN: llvm-as %s -o %t.bc ; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_fpga_memory_attributes -o %t.spv ; RUN: llvm-spirv %t.spv --spirv-ext=+SPV_INTEL_fpga_memory_attributes -to-text -o %t.spt @@ -8,199 +211,615 @@ ; RUN: llvm-spirv -spirv-text -r %t.spt -o %t.rev.bc ; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM -; + ; TODO: add a bunch of different tests for --spirv-ext option -; TODO: rewrite test to use a separate function for each attribute ; CHECK-SPIRV: Capability FPGAMemoryAttributesINTEL ; CHECK-SPIRV: Extension "SPV_INTEL_fpga_memory_attributes" -; CHECK-SPIRV: Decorate {{[0-9]+}} MemoryINTEL "DEFAULT" ; CHECK-SPIRV: Decorate {{[0-9]+}} RegisterINTEL -; CHECK-SPIRV: Decorate {{[0-9]+}} MemoryINTEL "BLOCK_RAM" +; CHECK-SPIRV: Decorate {{[0-9]+}} MemoryINTEL "DEFAULT" ; CHECK-SPIRV: Decorate {{[0-9]+}} NumbanksINTEL 2 ; CHECK-SPIRV: Decorate {{[0-9]+}} NumbanksINTEL 4 -; CHECK-SPIRV: Decorate {{[0-9]+}} 
BankwidthINTEL 8 -; CHECK-SPIRV: Decorate {{[0-9]+}} MaxPrivateCopiesINTEL 4 +; CHECK-SPIRV: Decorate {{[0-9]+}} BankwidthINTEL 16 +; CHECK-SPIRV: Decorate {{[0-9]+}} MaxPrivateCopiesINTEL 8 ; CHECK-SPIRV: Decorate {{[0-9]+}} SinglepumpINTEL ; CHECK-SPIRV: Decorate {{[0-9]+}} DoublepumpINTEL -; CHECK-SPIRV: Decorate {{[0-9]+}} MaxReplicatesINTEL 2 +; CHECK-SPIRV: Decorate {{[0-9]+}} MaxReplicatesINTEL 8 ; CHECK-SPIRV: Decorate {{[0-9]+}} SimpleDualPortINTEL +; CHECK-SPIRV: Decorate {{[0-9]+}} ForcePow2DepthINTEL 1 +; CHECK-SPIRV: Decorate {{[0-9]+}} MemoryINTEL "MLAB" +; CHECK-SPIRV: Decorate {{[0-9]+}} MemoryINTEL "BLOCK_RAM" +; CHECK-SPIRV: Decorate {{[0-9]+}} NumbanksINTEL 8 +; CHECK-SPIRV: Decorate {{[0-9]+}} NumbanksINTEL 16 +; CHECK-SPIRV: Decorate {{[0-9]+}} BankwidthINTEL 4 +; CHECK-SPIRV: Decorate {{[0-9]+}} BankwidthINTEL 8 +; CHECK-SPIRV: Decorate {{[0-9]+}} MaxPrivateCopiesINTEL 2 +; CHECK-SPIRV: Decorate {{[0-9]+}} MaxPrivateCopiesINTEL 4 +; CHECK-SPIRV: Decorate {{[0-9]+}} MaxReplicatesINTEL 2 +; CHECK-SPIRV: Decorate {{[0-9]+}} MaxReplicatesINTEL 4 ; CHECK-SPIRV: Decorate {{[0-9]+}} MergeINTEL "foo" "depth" +; CHECK-SPIRV: Decorate {{[0-9]+}} MergeINTEL "bar" "width" +; CHECK-SPIRV: Decorate {{[0-9]+}} BankBitsINTEL 2 +; CHECK-SPIRV: Decorate {{[0-9]+}} BankBitsINTEL 5 +; CHECK-SPIRV: Decorate {{[0-9]+}} BankBitsINTEL 4 5 ; CHECK-SPIRV: Decorate {{[0-9]+}} BankBitsINTEL 2 1 0 +; CHECK-SPIRV: Decorate {{[0-9]+}} ForcePow2DepthINTEL 0 -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknown-linux" +target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir" %class.anon = type { i8 } -%struct._ZTS7foo_two.foo_two = type { i32 } - -; CHECK-LLVM: [[STR1:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:4} -; CHECK-LLVM: [[STR2:@[0-9_.]+]] = {{.*}}{register:1} -; CHECK-LLVM: [[STR3:@[0-9_.]+]] = {{.*}}{memory:BLOCK_RAM} -; CHECK-LLVM: [[STR4:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{bankwidth:8} -; CHECK-LLVM: [[STR5:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{private_copies:4} -; CHECK-LLVM: [[STR6:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{pump:1} -; CHECK-LLVM: [[STR7:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{pump:2} -; CHECK-LLVM: [[STR8:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{merge:foo:depth} -; CHECK-LLVM: [[STR9:@[0-9_.]+]] = {{.*}}{max_replicates:2} -; CHECK-LLVM: [[STR10:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{simple_dual_port:1} -; CHECK-LLVM: [[STR11:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:8}{bank_bits:2,1,0} -; CHECK-LLVM: [[STR12:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:2} -@.str = private unnamed_addr constant [29 x i8] c"{memory:DEFAULT}{numbanks:4}\00", section "llvm.metadata" -@.str.1 = private unnamed_addr constant [13 x i8] c"test_var.cpp\00", section "llvm.metadata" -@.str.2 = private unnamed_addr constant [13 x i8] c"{register:1}\00", section "llvm.metadata" -@.str.3 = private unnamed_addr constant [19 x i8] c"{memory:BLOCK_RAM}\00", section "llvm.metadata" -@.str.4 = private unnamed_addr constant [30 x i8] c"{memory:DEFAULT}{bankwidth:8}\00", section "llvm.metadata" -@.str.5 = private unnamed_addr constant [35 x i8] c"{memory:DEFAULT}{private_copies:4}\00", section "llvm.metadata" -@.str.6 = private unnamed_addr constant [25 x i8] c"{memory:DEFAULT}{pump:1}\00", section "llvm.metadata" -@.str.7 = private unnamed_addr constant [25 x i8] c"{memory:DEFAULT}{pump:2}\00", section "llvm.metadata" -@.str.8 = private unnamed_addr 
constant [34 x i8] c"{memory:DEFAULT}{merge:foo:depth}\00", section "llvm.metadata" -@.str.9 = private unnamed_addr constant [19 x i8] c"{max_replicates:2}\00", section "llvm.metadata" -@.str.10 = private unnamed_addr constant [37 x i8] c"{memory:DEFAULT}{simple_dual_port:1}\00", section "llvm.metadata" -@.str.11 = private unnamed_addr constant [46 x i8] c"{memory:DEFAULT}{numbanks:8}{bank_bits:2,1,0}\00", section "llvm.metadata" -@.str.12 = private unnamed_addr constant [29 x i8] c"{memory:DEFAULT}{numbanks:2}\00", section "llvm.metadata" - -; Function Attrs: nounwind +%struct.numbanks_st = type { i32 } +%struct.templ_numbanks_st = type { i32 } +%struct.register_st = type { i32 } +%struct.memory_st = type { [10 x [2 x i32]] } +%struct.bankwidth_st = type { i32 } +%struct.templ_bankwidth_st = type { i32 } +%struct.priv_copies_st = type { i32 } +%struct.templ_priv_copies_st = type { i32 } +%struct.singlepump_st = type { i32 } +%struct.doublepump_st = type { i32 } +%struct.merge_st = type { i32 } +%struct.max_repl_st = type { i32 } +%struct.templ_max_repl_st = type { i32 } +%struct.simple_dual_port_st = type { i32 } +%struct.bank_bits_st = type { i32 } +%struct.templ_bank_bits_st = type { i32 } +%struct.fp2d_st = type { i32 } +%struct.templ_fp2d_st = type { i32 } + +; CHECK-LLVM: [[STR_NMB_VAR:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:16} +; CHECK-LLVM: [[STR_NMB_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:2} +; CHECK-LLVM: [[STR_NMB_TE1:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:4} +; CHECK-LLVM: [[STR_NMB_TE2:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:4} +; CHECK-LLVM: [[STR_REG_VAR:@[0-9_.]+]] = {{.*}}{register:1} +; CHECK-LLVM: [[STR_REG_SCT:@[0-9_.]+]] = {{.*}}{register:1} +; CHECK-LLVM: [[STR_MEM_VAR:@[0-9_.]+]] = {{.*}}{memory:MLAB} +; CHECK-LLVM: [[STR_MEM_SCT:@[0-9_.]+]] = {{.*}}{memory:BLOCK_RAM} +; CHECK-LLVM: [[STR_BWD_VAR:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{bankwidth:8} +; CHECK-LLVM: [[STR_BWD_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{bankwidth:4} +; CHECK-LLVM: [[STR_BWD_TE1:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{bankwidth:16} +; CHECK-LLVM: [[STR_BWD_TE2:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{bankwidth:16} +; CHECK-LLVM: [[STR_PRC_VAR:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{private_copies:4} +; CHECK-LLVM: [[STR_PRC_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{private_copies:2} +; CHECK-LLVM: [[STR_PRC_TE1:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{private_copies:8} +; CHECK-LLVM: [[STR_PRC_TE2:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{private_copies:8} +; CHECK-LLVM: [[STR_SNP_VAR:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{pump:1} +; CHECK-LLVM: [[STR_SNP_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{pump:1} +; CHECK-LLVM: [[STR_DBP_VAR:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{pump:2} +; CHECK-LLVM: [[STR_DBP_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{pump:2} +; CHECK-LLVM: [[STR_MRG_VAR:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{merge:foo:depth} +; CHECK-LLVM: [[STR_MRG_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{merge:bar:width} +; CHECK-LLVM: [[STR_MXR_VAR:@[0-9_.]+]] = {{.*}}{max_replicates:4} +; CHECK-LLVM: [[STR_MXR_SCT:@[0-9_.]+]] = {{.*}}{max_replicates:2} +; CHECK-LLVM: [[STR_MXR_TE1:@[0-9_.]+]] = {{.*}}{max_replicates:8} +; CHECK-LLVM: [[STR_MXR_TE2:@[0-9_.]+]] = {{.*}}{max_replicates:8} +; CHECK-LLVM: [[STR_SDP_VAR:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{simple_dual_port:1} +; CHECK-LLVM: [[STR_SDP_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{simple_dual_port:1} +; CHECK-LLVM: [[STR_BBT_VAR:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:8}{bank_bits:2,1,0} +; CHECK-LLVM: [[STR_BBT_SCT:@[0-9_.]+]] = 
{{.*}}{memory:DEFAULT}{numbanks:2}{bank_bits:2} +; CHECK-LLVM: [[STR_BBT_TE1:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:4}{bank_bits:4,5} +; CHECK-LLVM: [[STR_BBT_TE2:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:2}{bank_bits:5} +; CHECK-LLVM: [[STR_FP2_VAR:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{force_pow2_depth:0} +; CHECK-LLVM: [[STR_FP2_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{force_pow2_depth:1} +; CHECK-LLVM: [[STR_FP2_TE1:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{force_pow2_depth:1} +; CHECK-LLVM: [[STR_FP2_TE2:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{force_pow2_depth:1} +@.str = private unnamed_addr constant [42 x i8] c"{memory:DEFAULT}{sizeinfo:4}{numbanks:16}\00", section "llvm.metadata" +@.str.1 = private unnamed_addr constant [25 x i8] c"intel-fpga-local-var.cpp\00", section "llvm.metadata" +@.str.2 = private unnamed_addr constant [41 x i8] c"{memory:DEFAULT}{sizeinfo:4}{numbanks:2}\00", section "llvm.metadata" +@.str.3 = private unnamed_addr constant [41 x i8] c"{memory:DEFAULT}{sizeinfo:4}{numbanks:4}\00", section "llvm.metadata" +@.str.4 = private unnamed_addr constant [13 x i8] c"{register:1}\00", section "llvm.metadata" +@.str.5 = private unnamed_addr constant [30 x i8] c"{memory:MLAB}{sizeinfo:4,500}\00", section "llvm.metadata" +@.str.6 = private unnamed_addr constant [32 x i8] c"{memory:BLOCK_RAM}{sizeinfo:80}\00", section "llvm.metadata" +@.str.7 = private unnamed_addr constant [42 x i8] c"{memory:DEFAULT}{sizeinfo:4}{bankwidth:8}\00", section "llvm.metadata" +@.str.8 = private unnamed_addr constant [42 x i8] c"{memory:DEFAULT}{sizeinfo:4}{bankwidth:4}\00", section "llvm.metadata" +@.str.9 = private unnamed_addr constant [43 x i8] c"{memory:DEFAULT}{sizeinfo:4}{bankwidth:16}\00", section "llvm.metadata" +@.str.10 = private unnamed_addr constant [47 x i8] c"{memory:DEFAULT}{sizeinfo:4}{private_copies:4}\00", section "llvm.metadata" +@.str.11 = private unnamed_addr constant [47 x i8] c"{memory:DEFAULT}{sizeinfo:4}{private_copies:2}\00", section "llvm.metadata" +@.str.12 = private unnamed_addr constant [47 x i8] c"{memory:DEFAULT}{sizeinfo:4}{private_copies:8}\00", section "llvm.metadata" +@.str.13 = private unnamed_addr constant [37 x i8] c"{memory:DEFAULT}{sizeinfo:4}{pump:1}\00", section "llvm.metadata" +@.str.14 = private unnamed_addr constant [37 x i8] c"{memory:DEFAULT}{sizeinfo:4}{pump:2}\00", section "llvm.metadata" +@.str.15 = private unnamed_addr constant [46 x i8] c"{memory:DEFAULT}{sizeinfo:4}{merge:foo:depth}\00", section "llvm.metadata" +@.str.16 = private unnamed_addr constant [46 x i8] c"{memory:DEFAULT}{sizeinfo:4}{merge:bar:width}\00", section "llvm.metadata" +@.str.17 = private unnamed_addr constant [19 x i8] c"{max_replicates:4}\00", section "llvm.metadata" +@.str.18 = private unnamed_addr constant [19 x i8] c"{max_replicates:2}\00", section "llvm.metadata" +@.str.19 = private unnamed_addr constant [19 x i8] c"{max_replicates:8}\00", section "llvm.metadata" +@.str.20 = private unnamed_addr constant [49 x i8] c"{memory:DEFAULT}{sizeinfo:4}{simple_dual_port:1}\00", section "llvm.metadata" +@.str.21 = private unnamed_addr constant [58 x i8] c"{memory:DEFAULT}{sizeinfo:4}{numbanks:8}{bank_bits:2,1,0}\00", section "llvm.metadata" +@.str.22 = private unnamed_addr constant [54 x i8] c"{memory:DEFAULT}{sizeinfo:4}{numbanks:2}{bank_bits:2}\00", section "llvm.metadata" +@.str.23 = private unnamed_addr constant [56 x i8] c"{memory:DEFAULT}{sizeinfo:4}{numbanks:4}{bank_bits:4,5}\00", section "llvm.metadata" +@.str.24 = private unnamed_addr constant [54 x i8] 
c"{memory:DEFAULT}{sizeinfo:4}{numbanks:2}{bank_bits:5}\00", section "llvm.metadata" +@.str.25 = private unnamed_addr constant [49 x i8] c"{memory:DEFAULT}{sizeinfo:4}{force_pow2_depth:0}\00", section "llvm.metadata" +@.str.26 = private unnamed_addr constant [49 x i8] c"{memory:DEFAULT}{sizeinfo:4}{force_pow2_depth:1}\00", section "llvm.metadata" + +; Function Attrs: norecurse nounwind define spir_kernel void @_ZTSZ4mainE15kernel_function() #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !4 { entry: %0 = alloca %class.anon, align 1 %1 = bitcast %class.anon* %0 to i8* - call void @llvm.lifetime.start.p0i8(i64 1, i8* %1) #4 + call void @llvm.lifetime.start.p0i8(i64 1, i8* %1) #5 call spir_func void @"_ZZ4mainENK3$_0clEv"(%class.anon* %0) %2 = bitcast %class.anon* %0 to i8* - call void @llvm.lifetime.end.p0i8(i64 1, i8* %2) #4 + call void @llvm.lifetime.end.p0i8(i64 1, i8* %2) #5 ret void } -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 -; Function Attrs: inlinehint nounwind +; Function Attrs: inlinehint norecurse nounwind define internal spir_func void @"_ZZ4mainENK3$_0clEv"(%class.anon* %this) #2 align 2 { entry: - %this.addr = alloca %class.anon*, align 8 - store %class.anon* %this, %class.anon** %this.addr, align 8, !tbaa !5 - %this1 = load %class.anon*, %class.anon** %this.addr, align 8 - call spir_func void @_Z3foov() - call spir_func void @_Z3boov() - ret void -} - -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 - -; Function Attrs: nounwind -define spir_func void @_Z3foov() #3 { -entry: - %var_one = alloca i32, align 4 - %var_two = alloca i32, align 4 - %var_three = alloca i32, align 4 - %var_four = alloca i32, align 4 - %var_five = alloca i8, align 1 - %var_six = alloca i32, align 4 - %var_seven = alloca i32, align 4 - %var_eight = alloca i32, align 4 - %var_nine = alloca i32, align 4 - %var_ten = alloca i32, align 4 - %var_eleven = alloca i32, align 4 - %0 = bitcast i32* %var_one to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #4 - %var_one1 = bitcast i32* %var_one to i8* - ; CHECK-LLVM: call void @llvm.var.annotation(i8* %[[VAR1:[a-zA-Z0-9_]+]], i8* getelementptr inbounds ([29 x i8], [29 x i8]* [[STR1]], i32 0, i32 0), i8* undef, i32 undef) - call void @llvm.var.annotation(i8* %var_one1, i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), i32 2) - %1 = bitcast i32* %var_two to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #4 - %var_two2 = bitcast i32* %var_two to i8* - ; CHECK-LLVM: call void @llvm.var.annotation(i8* [[VAR2:%[a-zA-Z0-9_]+]], i8* getelementptr inbounds ([13 x i8], [13 x i8]* [[STR2]], i32 0, i32 0), i8* undef, i32 undef) - call void @llvm.var.annotation(i8* %var_two2, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), i32 3) - %2 = bitcast i32* %var_three to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* %2) #4 - %var_three3 = bitcast i32* %var_three to i8* - ; CHECK-LLVM: call void @llvm.var.annotation(i8* [[VAR3:%[a-zA-Z0-9_]+]], i8* getelementptr inbounds ([19 x i8], [19 x i8]* [[STR3]], i32 0, i32 0), i8* undef, i32 undef) - call void 
@llvm.var.annotation(i8* %var_three3, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), i32 4) - %3 = bitcast i32* %var_four to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #4 - %var_four4 = bitcast i32* %var_four to i8* - ; CHECK-LLVM: call void @llvm.var.annotation(i8* [[VAR4:%[a-zA-Z0-9_]+]], i8* getelementptr inbounds ([30 x i8], [30 x i8]* [[STR4]], i32 0, i32 0), i8* undef, i32 undef) - call void @llvm.var.annotation(i8* %var_four4, i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), i32 5) - call void @llvm.lifetime.start.p0i8(i64 1, i8* %var_five) #4 - ; CHECK-LLVM: call void @llvm.var.annotation(i8* [[VAR5:%[a-zA-Z0-9_]+]], i8* getelementptr inbounds ([35 x i8], [35 x i8]* [[STR5]], i32 0, i32 0), i8* undef, i32 undef) - call void @llvm.var.annotation(i8* %var_five, i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), i32 6) - %4 = bitcast i32* %var_six to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* %4) #4 - %var_six6 = bitcast i32* %var_six to i8* - ; CHECK-LLVM: call void @llvm.var.annotation(i8* [[VAR6:%[a-zA-Z0-9_]+]], i8* getelementptr inbounds ([25 x i8], [25 x i8]* [[STR6]], i32 0, i32 0), i8* undef, i32 undef) - call void @llvm.var.annotation(i8* %var_six6, i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), i32 7) - %5 = bitcast i32* %var_seven to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #4 - %var_seven7 = bitcast i32* %var_seven to i8* - ; CHECK-LLVM: call void @llvm.var.annotation(i8* [[VAR7:%[a-zA-Z0-9_]+]], i8* getelementptr inbounds ([25 x i8], [25 x i8]* [[STR7]], i32 0, i32 0), i8* undef, i32 undef) - call void @llvm.var.annotation(i8* %var_seven7, i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.7, i32 0, i32 0), i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), i32 8) - %6 = bitcast i32* %var_eight to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #4 - %var_eight8 = bitcast i32* %var_eight to i8* - ; CHECK-LLVM: call void @llvm.var.annotation(i8* [[VAR8:%[a-zA-Z0-9_]+]], i8* getelementptr inbounds ([34 x i8], [34 x i8]* [[STR8]], i32 0, i32 0), i8* undef, i32 undef) - call void @llvm.var.annotation(i8* %var_eight8, i8* getelementptr inbounds ([34 x i8], [34 x i8]* @.str.8, i32 0, i32 0), i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), i32 9) - %7 = bitcast i32* %var_nine to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* %7) #4 - %var_nine9 = bitcast i32* %var_nine to i8* - ; CHECK-LLVM: call void @llvm.var.annotation(i8* [[VAR9:%[a-zA-Z0-9_]+]], i8* getelementptr inbounds ([19 x i8], [19 x i8]* [[STR9]], i32 0, i32 0), i8* undef, i32 undef) - call void @llvm.var.annotation(i8* %var_nine9, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.9, i32 0, i32 0), i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), i32 10) - %8 = bitcast i32* %var_ten to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* %8) #4 - %var_ten10 = bitcast i32* %var_ten to i8* - ; CHECK-LLVM: call void @llvm.var.annotation(i8* [[VAR10:%[a-zA-Z0-9_]+]], i8* getelementptr inbounds ([37 x i8], [37 x i8]* [[STR10]], i32 0, i32 0), i8* undef, i32 undef) - call void 
@llvm.var.annotation(i8* %var_ten10, i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.10, i32 0, i32 0), i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), i32 11) - %9 = bitcast i32* %var_eleven to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* %9) #4 - %var_eleven11 = bitcast i32* %var_eleven to i8* - ; CHECK-LLVM: call void @llvm.var.annotation(i8* [[VAR11:%[a-zA-Z0-9_]+]], i8* getelementptr inbounds ([46 x i8], [46 x i8]* [[STR11]], i32 0, i32 0), i8* undef, i32 undef) - call void @llvm.var.annotation(i8* %var_eleven11, i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.11, i32 0, i32 0), i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), i32 12) - %10 = bitcast i32* %var_eleven to i8* - call void @llvm.lifetime.end.p0i8(i64 4, i8* %10) #4 - %11 = bitcast i32* %var_ten to i8* - call void @llvm.lifetime.end.p0i8(i64 4, i8* %11) #4 - %12 = bitcast i32* %var_nine to i8* - call void @llvm.lifetime.end.p0i8(i64 4, i8* %12) #4 - %13 = bitcast i32* %var_eight to i8* - call void @llvm.lifetime.end.p0i8(i64 4, i8* %13) #4 - %14 = bitcast i32* %var_seven to i8* - call void @llvm.lifetime.end.p0i8(i64 4, i8* %14) #4 - %15 = bitcast i32* %var_six to i8* - call void @llvm.lifetime.end.p0i8(i64 4, i8* %15) #4 - call void @llvm.lifetime.end.p0i8(i64 1, i8* %var_five) #4 - %16 = bitcast i32* %var_four to i8* - call void @llvm.lifetime.end.p0i8(i64 4, i8* %16) #4 - %17 = bitcast i32* %var_three to i8* - call void @llvm.lifetime.end.p0i8(i64 4, i8* %17) #4 - %18 = bitcast i32* %var_two to i8* - call void @llvm.lifetime.end.p0i8(i64 4, i8* %18) #4 - %19 = bitcast i32* %var_one to i8* - call void @llvm.lifetime.end.p0i8(i64 4, i8* %19) #4 - ret void -} - -; Function Attrs: nounwind -define dso_local spir_func void @_Z3boov() #3 { - %1 = alloca %struct._ZTS7foo_two.foo_two, align 4 - %2 = bitcast %struct._ZTS7foo_two.foo_two* %1 to i8* - call void @llvm.lifetime.start.p0i8(i64 44, i8* %2) #4 - %3 = bitcast %struct._ZTS7foo_two.foo_two* %1 to i8* - ; CHECK-LLVM: call void @llvm.var.annotation(i8* %[[VAR12:[a-zA-Z0-9_]+]], i8* getelementptr inbounds ([29 x i8], [29 x i8]* [[STR12]], i32 0, i32 0), i8* undef, i32 undef) - call void @llvm.var.annotation(i8* %3, i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.12, i32 0, i32 0), i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.1, i32 0, i32 0), i32 35) - %4 = bitcast %struct._ZTS7foo_two.foo_two* %1 to i8* - call void @llvm.lifetime.end.p0i8(i64 44, i8* %4) #4 - ret void -} - -; Function Attrs: nounwind + %this.addr = alloca %class.anon*, align 4 + store %class.anon* %this, %class.anon** %this.addr, align 4, !tbaa !5 + %this1 = load %class.anon*, %class.anon** %this.addr, align 4 + call spir_func void @_Z13numbanks_attrv() + call spir_func void @_Z19templ_numbanks_attrILi4EEvv() + call spir_func void @_Z13register_attrv() + call spir_func void @_Z11memory_attrv() + call spir_func void @_Z14bankwidth_attrv() + call spir_func void @_Z20templ_bankwidth_attrILi16EEvv() + call spir_func void @_Z19private_copies_attrv() + call spir_func void @_Z25templ_private_copies_attrILi8EEvv() + call spir_func void @_Z15singlepump_attrv() + call spir_func void @_Z15doublepump_attrv() + call spir_func void @_Z10merge_attrv() + call spir_func void @_Z19max_replicates_attrv() + call spir_func void @_Z25templ_max_replicates_attrILi8EEvv() + call spir_func void @_Z21simple_dual_port_attrv() + call spir_func void @_Z14bank_bits_attrv() + call spir_func void @_Z20templ_bank_bits_attrILi4ELi5EEvv() + call 
spir_func void @_Z21force_pow2_depth_attrv() + call spir_func void @_Z27templ_force_pow2_depth_attrILi1EEvv() + ret void +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 + +; Function Attrs: nounwind willreturn declare void @llvm.var.annotation(i8*, i8*, i8*, i32) #4 -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind } -attributes #2 = { inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind optnone noinline "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind } +; Function Attrs: norecurse nounwind +define spir_func void @_Z13numbanks_attrv() #3 { +entry: + %numbanks_var = alloca i32, align 4 + %s = alloca %struct.numbanks_st, align 4 + %0 = bitcast i32* %numbanks_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %numbanks_var1 = bitcast i32* %numbanks_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_NMB_VAR]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %numbanks_var1, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 2) + %1 = bitcast %struct.numbanks_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.numbanks_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_NMB_SCT]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 6) + %field = getelementptr inbounds %struct.numbanks_st, %struct.numbanks_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !9 + %2 = bitcast %struct.numbanks_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %numbanks_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define linkonce_odr spir_func void @_Z19templ_numbanks_attrILi4EEvv() #3 { +entry: + %templ_numbanks_var = alloca i32, align 4 + %s = alloca %struct.templ_numbanks_st, align 
4 + %0 = bitcast i32* %templ_numbanks_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %templ_numbanks_var1 = bitcast i32* %templ_numbanks_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_NMB_TE1]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %templ_numbanks_var1, i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 12) + %1 = bitcast %struct.templ_numbanks_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.templ_numbanks_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_NMB_TE2]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 16) + %field = getelementptr inbounds %struct.templ_numbanks_st, %struct.templ_numbanks_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !12 + %2 = bitcast %struct.templ_numbanks_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %templ_numbanks_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z13register_attrv() #3 { +entry: + %register_var = alloca i32, align 4 + %s = alloca %struct.register_st, align 4 + %0 = bitcast i32* %register_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %register_var1 = bitcast i32* %register_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_REG_VAR]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %register_var1, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 21) + %1 = bitcast %struct.register_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.register_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_REG_SCT]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 25) + %field = getelementptr inbounds %struct.register_st, %struct.register_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !14 + %2 = bitcast %struct.register_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %register_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z11memory_attrv() #3 { +entry: + %memory_var = alloca [500 x i32], align 4 + %s = alloca %struct.memory_st, align 4 + %0 = bitcast [500 x i32]* %memory_var to i8* + call void @llvm.lifetime.start.p0i8(i64 2000, i8* %0) #5 + %memory_var1 = bitcast [500 x i32]* %memory_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr 
inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_MEM_VAR]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %memory_var1, i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 30) + %1 = bitcast %struct.memory_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 80, i8* %1) #5 + %s2 = bitcast %struct.memory_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_MEM_SCT]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 34) + %field = getelementptr inbounds %struct.memory_st, %struct.memory_st* %s, i32 0, i32 0 + %arrayidx = getelementptr inbounds [10 x [2 x i32]], [10 x [2 x i32]]* %field, i64 0, i64 0 + %arrayidx3 = getelementptr inbounds [2 x i32], [2 x i32]* %arrayidx, i64 0, i64 0 + store i32 0, i32* %arrayidx3, align 4, !tbaa !16 + %2 = bitcast %struct.memory_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 80, i8* %2) #5 + %3 = bitcast [500 x i32]* %memory_var to i8* + call void @llvm.lifetime.end.p0i8(i64 2000, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z14bankwidth_attrv() #3 { +entry: + %bankwidth_var = alloca i32, align 4 + %s = alloca %struct.bankwidth_st, align 4 + %0 = bitcast i32* %bankwidth_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %bankwidth_var1 = bitcast i32* %bankwidth_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_BWD_VAR]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %bankwidth_var1, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.7, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 39) + %1 = bitcast %struct.bankwidth_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.bankwidth_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_BWD_SCT]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.8, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 43) + %field = getelementptr inbounds %struct.bankwidth_st, %struct.bankwidth_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !17 + %2 = bitcast %struct.bankwidth_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %bankwidth_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define linkonce_odr spir_func void @_Z20templ_bankwidth_attrILi16EEvv() #3 { +entry: + %templ_bankwidth_var = alloca i32, align 4 + %s = alloca %struct.templ_bankwidth_st, align 4 + %0 = bitcast i32* %templ_bankwidth_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %templ_bankwidth_var1 = bitcast i32* %templ_bankwidth_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_BWD_TE1]], 
i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %templ_bankwidth_var1, i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str.9, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 49) + %1 = bitcast %struct.templ_bankwidth_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.templ_bankwidth_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_BWD_TE2]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str.9, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 53) + %field = getelementptr inbounds %struct.templ_bankwidth_st, %struct.templ_bankwidth_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !19 + %2 = bitcast %struct.templ_bankwidth_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %templ_bankwidth_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z19private_copies_attrv() #3 { +entry: + %priv_copies_var = alloca i32, align 4 + %s = alloca %struct.priv_copies_st, align 4 + %0 = bitcast i32* %priv_copies_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %priv_copies_var1 = bitcast i32* %priv_copies_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_PRC_VAR]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %priv_copies_var1, i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.10, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 58) + %1 = bitcast %struct.priv_copies_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.priv_copies_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_PRC_SCT]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.11, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 62) + %field = getelementptr inbounds %struct.priv_copies_st, %struct.priv_copies_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !21 + %2 = bitcast %struct.priv_copies_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %priv_copies_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define linkonce_odr spir_func void @_Z25templ_private_copies_attrILi8EEvv() #3 { +entry: + %templ_priv_copies_var = alloca i32, align 4 + %s = alloca %struct.templ_priv_copies_st, align 4 + %0 = bitcast i32* %templ_priv_copies_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %templ_priv_copies_var1 = bitcast i32* %templ_priv_copies_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_PRC_TE1]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %templ_priv_copies_var1, i8* getelementptr inbounds ([47 x i8], [47 x i8]* 
@.str.12, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 68) + %1 = bitcast %struct.templ_priv_copies_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.templ_priv_copies_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_PRC_TE2]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.12, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 72) + %field = getelementptr inbounds %struct.templ_priv_copies_st, %struct.templ_priv_copies_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !23 + %2 = bitcast %struct.templ_priv_copies_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %templ_priv_copies_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z15singlepump_attrv() #3 { +entry: + %singlepump_var = alloca i32, align 4 + %s = alloca %struct.singlepump_st, align 4 + %0 = bitcast i32* %singlepump_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %singlepump_var1 = bitcast i32* %singlepump_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_SNP_VAR]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %singlepump_var1, i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.13, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 77) + %1 = bitcast %struct.singlepump_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.singlepump_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_SNP_SCT]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.13, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 81) + %field = getelementptr inbounds %struct.singlepump_st, %struct.singlepump_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !25 + %2 = bitcast %struct.singlepump_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %singlepump_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z15doublepump_attrv() #3 { +entry: + %doublepump_var = alloca i32, align 4 + %s = alloca %struct.doublepump_st, align 4 + %0 = bitcast i32* %doublepump_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %doublepump_var1 = bitcast i32* %doublepump_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_DBP_VAR]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %doublepump_var1, i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.14, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 86) + %1 = bitcast %struct.doublepump_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = 
bitcast %struct.doublepump_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_DBP_SCT]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.14, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 90) + %field = getelementptr inbounds %struct.doublepump_st, %struct.doublepump_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !27 + %2 = bitcast %struct.doublepump_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %doublepump_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z10merge_attrv() #3 { +entry: + %merge_var = alloca i32, align 4 + %s = alloca %struct.merge_st, align 4 + %0 = bitcast i32* %merge_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %merge_var1 = bitcast i32* %merge_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_MRG_VAR]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %merge_var1, i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.15, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 95) + %1 = bitcast %struct.merge_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.merge_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_MRG_SCT]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([46 x i8], [46 x i8]* @.str.16, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 99) + %field = getelementptr inbounds %struct.merge_st, %struct.merge_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !29 + %2 = bitcast %struct.merge_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %merge_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z19max_replicates_attrv() #3 { +entry: + %max_repl_var = alloca i32, align 4 + %s = alloca %struct.max_repl_st, align 4 + %0 = bitcast i32* %max_repl_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %max_repl_var1 = bitcast i32* %max_repl_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_MXR_VAR]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %max_repl_var1, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.17, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 104) + %1 = bitcast %struct.max_repl_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.max_repl_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_MXR_SCT]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.18, i32 0, 
i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 108) + %field = getelementptr inbounds %struct.max_repl_st, %struct.max_repl_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !31 + %2 = bitcast %struct.max_repl_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %max_repl_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define linkonce_odr spir_func void @_Z25templ_max_replicates_attrILi8EEvv() #3 { +entry: + %templ_max_repl_var = alloca i32, align 4 + %s = alloca %struct.templ_max_repl_st, align 4 + %0 = bitcast i32* %templ_max_repl_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %templ_max_repl_var1 = bitcast i32* %templ_max_repl_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_MXR_TE1]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %templ_max_repl_var1, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.19, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 114) + %1 = bitcast %struct.templ_max_repl_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.templ_max_repl_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_MXR_TE2]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.19, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 118) + %field = getelementptr inbounds %struct.templ_max_repl_st, %struct.templ_max_repl_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !33 + %2 = bitcast %struct.templ_max_repl_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %templ_max_repl_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z21simple_dual_port_attrv() #3 { +entry: + %simple_dual_port_var = alloca i32, align 4 + %s = alloca %struct.simple_dual_port_st, align 4 + %0 = bitcast i32* %simple_dual_port_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %simple_dual_port_var1 = bitcast i32* %simple_dual_port_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_SDP_VAR]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %simple_dual_port_var1, i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str.20, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 123) + %1 = bitcast %struct.simple_dual_port_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.simple_dual_port_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_SDP_SCT]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str.20, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 127) + %field = getelementptr inbounds 
%struct.simple_dual_port_st, %struct.simple_dual_port_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !35 + %2 = bitcast %struct.simple_dual_port_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %simple_dual_port_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z14bank_bits_attrv() #3 { +entry: + %bank_bits_var = alloca i32, align 4 + %s = alloca %struct.bank_bits_st, align 4 + %0 = bitcast i32* %bank_bits_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %bank_bits_var1 = bitcast i32* %bank_bits_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_BBT_VAR]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %bank_bits_var1, i8* getelementptr inbounds ([58 x i8], [58 x i8]* @.str.21, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 132) + %1 = bitcast %struct.bank_bits_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.bank_bits_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_BBT_SCT]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([54 x i8], [54 x i8]* @.str.22, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 136) + %field = getelementptr inbounds %struct.bank_bits_st, %struct.bank_bits_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !37 + %2 = bitcast %struct.bank_bits_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %bank_bits_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define linkonce_odr spir_func void @_Z20templ_bank_bits_attrILi4ELi5EEvv() #3 { +entry: + %templ_bank_bits_var = alloca i32, align 4 + %s = alloca %struct.templ_bank_bits_st, align 4 + %0 = bitcast i32* %templ_bank_bits_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %templ_bank_bits_var1 = bitcast i32* %templ_bank_bits_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_BBT_TE1]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %templ_bank_bits_var1, i8* getelementptr inbounds ([56 x i8], [56 x i8]* @.str.23, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 142) + %1 = bitcast %struct.templ_bank_bits_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.templ_bank_bits_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_BBT_TE2]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([54 x i8], [54 x i8]* @.str.24, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 146) + %field = getelementptr inbounds %struct.templ_bank_bits_st, %struct.templ_bank_bits_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !39 + %2 = bitcast %struct.templ_bank_bits_st* %s to i8* + call 
void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %templ_bank_bits_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z21force_pow2_depth_attrv() #3 { +entry: + %fp2d_var = alloca i32, align 4 + %s = alloca %struct.fp2d_st, align 4 + %0 = bitcast i32* %fp2d_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %fp2d_var1 = bitcast i32* %fp2d_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_FP2_VAR]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %fp2d_var1, i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str.25, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 151) + %1 = bitcast %struct.fp2d_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.fp2d_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_FP2_SCT]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str.26, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 155) + %field = getelementptr inbounds %struct.fp2d_st, %struct.fp2d_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !41 + %2 = bitcast %struct.fp2d_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %fp2d_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define linkonce_odr spir_func void @_Z27templ_force_pow2_depth_attrILi1EEvv() #3 { +entry: + %templ_fp2d_var = alloca i32, align 4 + %s = alloca %struct.templ_fp2d_st, align 4 + %0 = bitcast i32* %templ_fp2d_var to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %templ_fp2d_var1 = bitcast i32* %templ_fp2d_var to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_FP2_TE1]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %templ_fp2d_var1, i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str.26, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 161) + %1 = bitcast %struct.templ_fp2d_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #5 + %s2 = bitcast %struct.templ_fp2d_st* %s to i8* + ; CHECK-LLVM: call void @llvm.var.annotation(i8* %{{[a-zA-Z0-9_]+}}, i8* getelementptr inbounds ([{{[0-9]+}} x i8], [{{[0-9]+}} x i8]* [[STR_FP2_TE2]], i32 0, i32 0), i8* undef, i32 undef) + call void @llvm.var.annotation(i8* %s2, i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str.26, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.1, i32 0, i32 0), i32 165) + %field = getelementptr inbounds %struct.templ_fp2d_st, %struct.templ_fp2d_st* %s, i32 0, i32 0 + store i32 0, i32* %field, align 4, !tbaa !43 + %2 = bitcast %struct.templ_fp2d_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + %3 = bitcast i32* %templ_fp2d_var to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #5 + ret void +} + +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" 
"frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "sycl-module-id"="intel-fpga-local-var.cpp" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { inlinehint norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind willreturn } +attributes #5 = { nounwind } !llvm.module.flags = !{!0} !opencl.spir.version = !{!1} @@ -210,9 +829,45 @@ attributes #4 = { nounwind } !0 = !{i32 1, !"wchar_size", i32 4} !1 = !{i32 1, i32 2} !2 = !{i32 4, i32 100000} -!3 = !{!"clang version 9.0.0"} +!3 = !{!"clang version 11.0.0"} !4 = !{} !5 = !{!6, !6, i64 0} !6 = !{!"any pointer", !7, i64 0} !7 = !{!"omnipotent char", !8, i64 0} !8 = !{!"Simple C++ TBAA"} +!9 = !{!10, !11, i64 0} +!10 = !{!"_ZTSZ13numbanks_attrvE11numbanks_st", !11, i64 0} +!11 = !{!"int", !7, i64 0} +!12 = !{!13, !11, i64 0} +!13 = !{!"_ZTSZ19templ_numbanks_attrILi4EEvvE17templ_numbanks_st", !11, i64 0} +!14 = !{!15, !11, i64 0} +!15 = !{!"_ZTSZ13register_attrvE11register_st", !11, i64 0} +!16 = !{!11, !11, i64 0} +!17 = !{!18, !11, i64 0} +!18 = !{!"_ZTSZ14bankwidth_attrvE12bankwidth_st", !11, i64 0} +!19 = !{!20, !11, i64 0} +!20 = !{!"_ZTSZ20templ_bankwidth_attrILi16EEvvE18templ_bankwidth_st", !11, i64 0} +!21 = !{!22, !11, i64 0} +!22 = !{!"_ZTSZ19private_copies_attrvE14priv_copies_st", !11, i64 0} +!23 = !{!24, !11, i64 0} +!24 = !{!"_ZTSZ25templ_private_copies_attrILi8EEvvE20templ_priv_copies_st", !11, i64 0} +!25 = !{!26, !11, i64 0} +!26 = !{!"_ZTSZ15singlepump_attrvE13singlepump_st", !11, i64 0} +!27 = !{!28, !11, i64 0} +!28 = !{!"_ZTSZ15doublepump_attrvE13doublepump_st", !11, i64 0} +!29 = !{!30, !11, i64 0} +!30 = !{!"_ZTSZ10merge_attrvE8merge_st", !11, i64 0} +!31 = !{!32, !11, i64 0} +!32 = !{!"_ZTSZ19max_replicates_attrvE11max_repl_st", !11, i64 0} +!33 = !{!34, !11, i64 0} +!34 = !{!"_ZTSZ25templ_max_replicates_attrILi8EEvvE17templ_max_repl_st", !11, i64 0} +!35 = !{!36, !11, i64 0} +!36 = !{!"_ZTSZ21simple_dual_port_attrvE19simple_dual_port_st", !11, i64 0} +!37 = !{!38, !11, i64 0} +!38 = !{!"_ZTSZ14bank_bits_attrvE12bank_bits_st", !11, i64 0} +!39 = !{!40, !11, i64 0} +!40 = !{!"_ZTSZ20templ_bank_bits_attrILi4ELi5EEvvE18templ_bank_bits_st", !11, i64 0} +!41 = !{!42, !11, i64 0} +!42 = !{!"_ZTSZ21force_pow2_depth_attrvE7fp2d_st", !11, i64 0} +!43 = !{!44, !11, i64 0} +!44 = !{!"_ZTSZ27templ_force_pow2_depth_attrILi1EEvvE13templ_fp2d_st", !11, i64 0} diff --git a/llvm-spirv/test/IntelFPGAMemoryAttributesForStaticVar.ll 
b/llvm-spirv/test/IntelFPGAMemoryAttributesForStaticVar.ll index 6a33413e955d5..391a1453e821e 100644 --- a/llvm-spirv/test/IntelFPGAMemoryAttributesForStaticVar.ll +++ b/llvm-spirv/test/IntelFPGAMemoryAttributesForStaticVar.ll @@ -1,33 +1,44 @@ -; Source -; void foo(int a) { -; static int a_one [[intelfpga::numbanks(2)]]; -; a_one = a_one + a; -; } +; LLVM IR generated by Intel SYCL Clang compiler (https://github.com/intel/llvm) -; void bar(char b) { -; static char b_one [[intelfpga::memory("MLAB")]]; -; b_one = b_one + b; +; SYCL source code for this test: +; void numbanks_stat(int a) { +; static const int a_one [[intelfpga::numbanks(2)]] = 1; +; int a_two = a_one + a; ; } - -; void baz(int c) { -; static int c_one[[clang::annotate("foobarbaz")]]; -; c_one = c_one + c; +; +; void memory_stat(char b) { +; static const char b_one [[intelfpga::memory("MLAB")]] = 2; +; char b_two = b_one + b; ; } - +; +; void annotate_stat(int c) { +; static const int c_one [[clang::annotate("foobarbaz")]] = 3; +; int c_two = c_one + c; +; } +; +; void force_pow2_depth_stat(int fp2d) { +; static const int fp2d_stat [[intelfpga::force_pow2_depth(0)]] = 4; +; int fp2d_loc = fp2d_stat + fp2d; +; } +; ; template ; __attribute__((sycl_kernel)) void kernel_single_task(Func kernelFunc) { ; kernelFunc(); ; } - +; ; int main() { ; kernel_single_task([]() { -; foo(128); -; bar(42); -; baz(16); +; numbanks_stat(128); +; memory_stat(42); +; annotate_stat(16); +; force_pow2_depth_stat(25); ; }); ; return 0; ; } +; LLVM IR compilation command: +; clang -cc1 -triple spir -disable-llvm-passes -fsycl-is-device -emit-llvm intel-fpga-local-var.cpp + ; RUN: llvm-as %s -o %t.bc ; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_fpga_memory_attributes -o %t.spv ; RUN: llvm-spirv %t.spv -to-text -o - | FileCheck %s --check-prefix=CHECK-SPIRV @@ -37,107 +48,137 @@ ; CHECK-SPIRV: Capability FPGAMemoryAttributesINTEL ; CHECK-SPIRV: Extension "SPV_INTEL_fpga_memory_attributes" -; CHECK-SPIRV: Decorate {{[0-9]+}} UserSemantic "foobarbaz" ; CHECK-SPIRV: Decorate {{[0-9]+}} MemoryINTEL "DEFAULT" +; CHECK-SPIRV: Decorate {{[0-9]+}} UserSemantic "foobarbaz" ; CHECK-SPIRV: Decorate {{[0-9]+}} MemoryINTEL "MLAB" ; CHECK-SPIRV: Decorate {{[0-9]+}} NumbanksINTEL 2 +; CHECK-SPIRV: Decorate {{[0-9]+}} ForcePow2DepthINTEL 0 -target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknown-linux-sycldevice" +target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir" -%"class._ZTSZ4mainE3$_0.anon" = type { i8 } +%class.anon = type { i8 } -; CHECK-LLVM: [[STR:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:2} -; CHECK-LLVM: [[STR2:@[0-9_.]+]] = {{.*}}{memory:MLAB} -; CHECK-LLVM: [[STR3:@[0-9_.]+]] = {{.*}}foobarbaz +; CHECK-LLVM: [[STR_NMB_STAT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:2} +; CHECK-LLVM: [[STR_MEM_STAT:@[0-9_.]+]] = {{.*}}{memory:MLAB} +; CHECK-LLVM: [[STR_ANN_STAT:@[0-9_.]+]] = {{.*}}foobarbaz +; CHECK-LLVM: [[STR_FP2_STAT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{force_pow2_depth:0} ; CHECK-LLVM: @llvm.global.annotations -; CHECK-SAME: _ZZ3fooiE5a_one{{.*}}[[STR]]{{.*}}_ZZ3bariE5b_one{{.*}}[[STR2]]{{.*}}_ZZ3baziE5c_one{{.*}}[[STR3]] -@_ZZ3fooiE5a_one = internal addrspace(1) global i32 0, align 4 -@.str = private unnamed_addr constant [29 x i8] c"{memory:DEFAULT}{numbanks:2}\00", section "llvm.metadata" -@.str.1 = private unnamed_addr constant [9 x i8] c"test.cpp\00", section 
"llvm.metadata" -@_ZZ3barcE5b_one = internal addrspace(1) global i8 0, align 1 -@.str.2 = private unnamed_addr constant [14 x i8] c"{memory:MLAB}\00", section "llvm.metadata" -@_ZZ3baziE5c_one = internal addrspace(1) global i32 0, align 4 +; CHECK-LLVM-SAME: _ZZ13numbanks_statiE5a_one{{.*}}[[STR_NMB_STAT]]{{.*}}_ZZ11memory_statcE5b_one{{.*}}[[STR_MEM_STAT]]{{.*}}_ZZ13annotate_statiE5c_one{{.*}}[[STR_ANN_STAT]]{{.*}}_ZZ21force_pow2_depth_statiE9fp2d_stat{{.*}}[[STR_FP2_STAT]] +@_ZZ13numbanks_statiE5a_one = internal addrspace(1) constant i32 1, align 4 +@.str = private unnamed_addr constant [41 x i8] c"{memory:DEFAULT}{sizeinfo:4}{numbanks:2}\00", section "llvm.metadata" +@.str.1 = private unnamed_addr constant [28 x i8] c"intel-fpga-local-static.cpp\00", section "llvm.metadata" +@_ZZ11memory_statcE5b_one = internal addrspace(1) constant i8 2, align 1 +@.str.2 = private unnamed_addr constant [26 x i8] c"{memory:MLAB}{sizeinfo:1}\00", section "llvm.metadata" +@_ZZ13annotate_statiE5c_one = internal addrspace(1) constant i32 3, align 4 @.str.3 = private unnamed_addr constant [10 x i8] c"foobarbaz\00", section "llvm.metadata" -@llvm.global.annotations = appending global [3 x { i8 addrspace(1)*, i8*, i8*, i32 }] [{ i8 addrspace(1)*, i8*, i8*, i32 } { i8 addrspace(1)* bitcast (i32 addrspace(1)* @_ZZ3fooiE5a_one to i8 addrspace(1)*), i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.1, i32 0, i32 0), i32 2 }, { i8 addrspace(1)*, i8*, i8*, i32 } { i8 addrspace(1)* @_ZZ3barcE5b_one, i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.1, i32 0, i32 0), i32 7 }, { i8 addrspace(1)*, i8*, i8*, i32 } { i8 addrspace(1)* bitcast (i32 addrspace(1)* @_ZZ3baziE5c_one to i8 addrspace(1)*), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @.str.1, i32 0, i32 0), i32 12 }], section "llvm.metadata" +@_ZZ21force_pow2_depth_statiE9fp2d_stat = internal addrspace(1) constant i32 4, align 4 +@.str.4 = private unnamed_addr constant [49 x i8] c"{memory:DEFAULT}{sizeinfo:4}{force_pow2_depth:0}\00", section "llvm.metadata" +@llvm.global.annotations = appending global [4 x { i8*, i8*, i8*, i32 }] [{ i8*, i8*, i8*, i32 } { i8* addrspacecast (i8 addrspace(1)* bitcast (i32 addrspace(1)* @_ZZ13numbanks_statiE5a_one to i8 addrspace(1)*) to i8*), i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 2 }, { i8*, i8*, i8*, i32 } { i8* addrspacecast (i8 addrspace(1)* @_ZZ11memory_statcE5b_one to i8*), i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 7 }, { i8*, i8*, i8*, i32 } { i8* addrspacecast (i8 addrspace(1)* bitcast (i32 addrspace(1)* @_ZZ13annotate_statiE5c_one to i8 addrspace(1)*) to i8*), i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 12 }, { i8*, i8*, i8*, i32 } { i8* addrspacecast (i8 addrspace(1)* bitcast (i32 addrspace(1)* @_ZZ21force_pow2_depth_statiE9fp2d_stat to i8 addrspace(1)*) to i8*), i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 17 }], section "llvm.metadata" -; Function Attrs: nounwind +; 
Function Attrs: norecurse nounwind define spir_kernel void @_ZTSZ4mainE15kernel_function() #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !4 { entry: - %0 = alloca %"class._ZTSZ4mainE3$_0.anon", align 1 - %1 = bitcast %"class._ZTSZ4mainE3$_0.anon"* %0 to i8* + %0 = alloca %class.anon, align 1 + %1 = bitcast %class.anon* %0 to i8* call void @llvm.lifetime.start.p0i8(i64 1, i8* %1) #4 - %2 = addrspacecast %"class._ZTSZ4mainE3$_0.anon"* %0 to %"class._ZTSZ4mainE3$_0.anon" addrspace(4)* - call spir_func void @"_ZZ4mainENK3$_0clEv"(%"class._ZTSZ4mainE3$_0.anon" addrspace(4)* %2) - %3 = bitcast %"class._ZTSZ4mainE3$_0.anon"* %0 to i8* - call void @llvm.lifetime.end.p0i8(i64 1, i8* %3) #4 + call spir_func void @"_ZZ4mainENK3$_0clEv"(%class.anon* %0) + %2 = bitcast %class.anon* %0 to i8* + call void @llvm.lifetime.end.p0i8(i64 1, i8* %2) #4 ret void } -; Function Attrs: argmemonly nounwind +; Function Attrs: argmemonly nounwind willreturn declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 -; Function Attrs: inlinehint nounwind -define internal spir_func void @"_ZZ4mainENK3$_0clEv"(%"class._ZTSZ4mainE3$_0.anon" addrspace(4)* %this) #2 align 2 { +; Function Attrs: inlinehint norecurse nounwind +define internal spir_func void @"_ZZ4mainENK3$_0clEv"(%class.anon* %this) #2 align 2 { entry: - %this.addr = alloca %"class._ZTSZ4mainE3$_0.anon" addrspace(4)*, align 8 - store %"class._ZTSZ4mainE3$_0.anon" addrspace(4)* %this, %"class._ZTSZ4mainE3$_0.anon" addrspace(4)** %this.addr, align 8, !tbaa !5 - %this1 = load %"class._ZTSZ4mainE3$_0.anon" addrspace(4)*, %"class._ZTSZ4mainE3$_0.anon" addrspace(4)** %this.addr, align 8 - call spir_func void @_Z3fooi(i32 128) - call spir_func void @_Z3barc(i8 signext 42) - call spir_func void @_Z3bazi(i32 16) + %this.addr = alloca %class.anon*, align 4 + store %class.anon* %this, %class.anon** %this.addr, align 4, !tbaa !5 + %this1 = load %class.anon*, %class.anon** %this.addr, align 4 + call spir_func void @_Z13numbanks_stati(i32 128) + call spir_func void @_Z11memory_statc(i8 signext 42) + call spir_func void @_Z13annotate_stati(i32 16) + call spir_func void @_Z21force_pow2_depth_stati(i32 25) ret void } -; Function Attrs: argmemonly nounwind +; Function Attrs: argmemonly nounwind willreturn declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 -; CHECK-LLVM: void @_Z3fooi(i32 %a) -; Function Attrs: nounwind -define spir_func void @_Z3fooi(i32 %a) #3 { +; CHECK-LLVM: void @_Z13numbanks_stati(i32 %a) +; Function Attrs: norecurse nounwind +define spir_func void @_Z13numbanks_stati(i32 %a) #3 { entry: %a.addr = alloca i32, align 4 + %a_two = alloca i32, align 4 store i32 %a, i32* %a.addr, align 4, !tbaa !9 - %0 = load i32, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @_ZZ3fooiE5a_one to i32 addrspace(4)*), align 4, !tbaa !9 + %0 = bitcast i32* %a_two to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #4 %1 = load i32, i32* %a.addr, align 4, !tbaa !9 - %add = add nsw i32 %0, %1 - store i32 %add, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @_ZZ3fooiE5a_one to i32 addrspace(4)*), align 4, !tbaa !9 + %add = add nsw i32 1, %1 + store i32 %add, i32* %a_two, align 4, !tbaa !9 + %2 = bitcast i32* %a_two to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #4 ret void } -; CHECK-LLVM: void @_Z3barc(i8 signext %b) -; Function Attrs: nounwind -define spir_func void @_Z3barc(i8 signext %b) #3 { +; CHECK-LLVM: void @_Z11memory_statc(i8 signext %b) +; 
Function Attrs: norecurse nounwind +define spir_func void @_Z11memory_statc(i8 signext %b) #3 { entry: %b.addr = alloca i8, align 1 + %b_two = alloca i8, align 1 store i8 %b, i8* %b.addr, align 1, !tbaa !11 - %0 = load i8, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* @_ZZ3barcE5b_one to i8 addrspace(4)*), align 1, !tbaa !11 + call void @llvm.lifetime.start.p0i8(i64 1, i8* %b_two) #4 + %0 = load i8, i8* %b.addr, align 1, !tbaa !11 %conv = sext i8 %0 to i32 - %1 = load i8, i8* %b.addr, align 1, !tbaa !11 - %conv1 = sext i8 %1 to i32 - %add = add nsw i32 %conv, %conv1 - %conv2 = trunc i32 %add to i8 - store i8 %conv2, i8 addrspace(4)* addrspacecast (i8 addrspace(1)* @_ZZ3barcE5b_one to i8 addrspace(4)*), align 1, !tbaa !11 + %add = add nsw i32 2, %conv + %conv1 = trunc i32 %add to i8 + store i8 %conv1, i8* %b_two, align 1, !tbaa !11 + call void @llvm.lifetime.end.p0i8(i64 1, i8* %b_two) #4 ret void } -; CHECK-LLVM: void @_Z3bazi(i32 %c) -; Function Attrs: nounwind -define spir_func void @_Z3bazi(i32 %c) #3 { +; CHECK-LLVM: void @_Z13annotate_stati(i32 %c) +; Function Attrs: norecurse nounwind +define spir_func void @_Z13annotate_stati(i32 %c) #3 { entry: %c.addr = alloca i32, align 4 + %c_two = alloca i32, align 4 store i32 %c, i32* %c.addr, align 4, !tbaa !9 - %0 = load i32, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @_ZZ3baziE5c_one to i32 addrspace(4)*), align 4, !tbaa !9 + %0 = bitcast i32* %c_two to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #4 %1 = load i32, i32* %c.addr, align 4, !tbaa !9 - %add = add nsw i32 %0, %1 - store i32 %add, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @_ZZ3baziE5c_one to i32 addrspace(4)*), align 4, !tbaa !9 + %add = add nsw i32 3, %1 + store i32 %add, i32* %c_two, align 4, !tbaa !9 + %2 = bitcast i32* %c_two to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #4 + ret void +} + +; CHECK-LLVM: void @_Z21force_pow2_depth_stati(i32 %fp2d) +; Function Attrs: norecurse nounwind +define spir_func void @_Z21force_pow2_depth_stati(i32 %fp2d) #3 { +entry: + %fp2d.addr = alloca i32, align 4 + %fp2d_loc = alloca i32, align 4 + store i32 %fp2d, i32* %fp2d.addr, align 4, !tbaa !9 + %0 = bitcast i32* %fp2d_loc to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #4 + %1 = load i32, i32* %fp2d.addr, align 4, !tbaa !9 + %add = add nsw i32 4, %1 + store i32 %add, i32* %fp2d_loc, align 4, !tbaa !9 + %2 = bitcast i32* %fp2d_loc to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #4 ret void } -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind } -attributes #2 = { inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" 
"less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "sycl-module-id"="intel-fpga-local-static.cpp" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { inlinehint norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #4 = { nounwind } !llvm.module.flags = !{!0} @@ -148,7 +189,7 @@ attributes #4 = { nounwind } !0 = !{i32 1, !"wchar_size", i32 4} !1 = !{i32 1, i32 2} !2 = !{i32 4, i32 100000} -!3 = !{!"clang version 9.0.0"} +!3 = !{!"clang version 11.0.0"} !4 = !{} !5 = !{!6, !6, i64 0} !6 = !{!"any pointer", !7, i64 0} diff --git a/llvm-spirv/test/IntelFPGAMemoryAttributesForStruct.ll b/llvm-spirv/test/IntelFPGAMemoryAttributesForStruct.ll index 3f741e9c241e0..7d0cde557d927 100644 --- a/llvm-spirv/test/IntelFPGAMemoryAttributesForStruct.ll +++ b/llvm-spirv/test/IntelFPGAMemoryAttributesForStruct.ll @@ -1,3 +1,186 @@ +; LLVM IR generated by Intel SYCL Clang compiler (https://github.com/intel/llvm) + +; SYCL source code for this test: +; void field_numbanks_attr() { +; struct numbanks_st { +; [[intelfpga::numbanks(4)]] int field; +; } s; +; s.field = 0; +; } +; +; template +; void templ_field_numbanks_attr() { +; struct templ_numbanks_st { +; [[intelfpga::numbanks(A)]] int field; +; } s; +; s.field = 0; +; } +; +; void field_register_attr() { +; struct register_st { +; [[intelfpga::register]] int field; +; } s; +; s.field = 0; +; } +; +; void field_memory_attr() { +; struct memory_st { +; [[intelfpga::memory("MLAB")]] int field; +; } s; +; s.field = 0; +; } +; +; void field_bankwidth_attr() { +; struct bankwidth_st { +; [[intelfpga::bankwidth(8)]] int field; +; } s; +; s.field = 0; +; } +; +; template +; void templ_field_bankwidth_attr() { +; struct templ_bankwidth_st { +; [[intelfpga::bankwidth(A)]] int field; +; } s; +; s.field = 0; +; } +; +; void field_private_copies_attr() { +; struct private_copies_st { +; [[intelfpga::private_copies(4)]] int field; +; } s; +; s.field = 0; +; } +; +; template +; void templ_field_private_copies_attr() { +; struct templ_private_copies_st { +; 
[[intelfpga::private_copies(A)]] int field;
+; } s;
+; s.field = 0;
+; }
+;
+; void field_singlepump_attr() {
+; struct singlepump_st {
+; [[intelfpga::singlepump]] int field;
+; } s;
+; s.field = 0;
+; }
+;
+; void field_doublepump_attr() {
+; struct doublepump_st {
+; [[intelfpga::doublepump]] int field;
+; } s;
+; s.field = 0;
+; }
+;
+; void field_merge_attr() {
+; struct merge_st {
+; [[intelfpga::merge("foobar", "width")]] int field;
+; } s;
+; s.field = 0;
+; }
+;
+; void field_max_replicates_attr() {
+; struct max_replicates_st {
+; [[intelfpga::max_replicates(4)]] int field;
+; } s;
+; s.field = 0;
+; }
+;
+; template <int A>
+; void templ_field_max_replicates_attr() {
+; struct templ_max_replicates_st {
+; [[intelfpga::max_replicates(A)]] int field;
+; } s;
+; s.field = 0;
+; }
+;
+; void field_simple_dual_port_attr() {
+; struct simple_dual_port_st {
+; [[intelfpga::simple_dual_port]] int field;
+; } s;
+; s.field = 0;
+; }
+;
+; void field_bank_bits_attr() {
+; struct bank_bits_st {
+; [[intelfpga::bank_bits(42,41,40)]] int field;
+; } s;
+; s.field = 0;
+; }
+;
+; template <int A, int B>
+; void templ_field_bank_bits_attr() {
+; struct templ_bank_bits_st {
+; [[intelfpga::bank_bits(A, B)]] int field;
+; } s;
+; s.field = 0;
+; }
+;
+; void field_force_pow2_depth_attr() {
+; struct force_pow2_depth_st {
+; [[intelfpga::force_pow2_depth(0)]] int field;
+; } s;
+; s.field = 0;
+; }
+;
+; template <int A>
+; void templ_field_force_pow2_depth_attr() {
+; struct templ_force_pow2_depth_st {
+; [[intelfpga::force_pow2_depth(A)]] int field;
+; } s;
+; s.field = 0;
+; }
+;
+; void field_addrspace_cast() {
+; struct state {
+; [[intelfpga::numbanks(2)]] int mem[8];
+;
+; // The initialization code is not relevant to this example.
+; // It prevents the compiler from optimizing away access to this struct.
+; state() {
+; for (auto i = 0; i < 8; i++) {
+; mem[i] = i;
+; }
+; }
+; } state_var;
+; state_var.mem[0] = 42;
+; }
+;
+; template <typename name, typename Func>
+; __attribute__((sycl_kernel)) void kernel_single_task(Func kernelFunc) {
+; kernelFunc();
+; }
+;
+; int main() {
+; kernel_single_task<class kernel_function>([]() {
+; field_numbanks_attr();
+; templ_field_numbanks_attr<8>();
+; field_register_attr();
+; field_memory_attr();
+; field_bankwidth_attr();
+; templ_field_bankwidth_attr<4>();
+; field_private_copies_attr();
+; templ_field_private_copies_attr<2>();
+; field_singlepump_attr();
+; field_doublepump_attr();
+; field_merge_attr();
+; field_max_replicates_attr();
+; templ_field_max_replicates_attr<2>();
+; field_simple_dual_port_attr();
+; field_bank_bits_attr();
+; templ_field_bank_bits_attr<2,3>();
+; field_force_pow2_depth_attr();
+; templ_field_force_pow2_depth_attr<1>();
+; field_addrspace_cast();
+; });
+; return 0;
+; }
+
+; LLVM IR compilation command:
+; clang -cc1 -triple spir -disable-llvm-passes -fsycl-is-device -emit-llvm intel-fpga-local-var.cpp
+
 ; RUN: llvm-as %s -o %t.bc
 ; RUN: llvm-spirv %t.bc --spirv-ext=+SPV_INTEL_fpga_memory_attributes -o %t.spv
 ; RUN: llvm-spirv %t.spv --spirv-ext=+SPV_INTEL_fpga_memory_attributes -to-text -o %t.spt
@@ -8,246 +191,517 @@
 ; RUN: llvm-spirv -spirv-text -r %t.spt -o %t.rev.bc
 ; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM
-;
+
 ; TODO: add a bunch of different tests for --spirv-ext option
 ; CHECK-SPIRV: Capability FPGAMemoryAttributesINTEL
 ; CHECK-SPIRV: Extension "SPV_INTEL_fpga_memory_attributes"
-; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 1 RegisterINTEL
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 RegisterINTEL
 ; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 MemoryINTEL "DEFAULT"
-; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 3 MemoryINTEL "DEFAULT"
-; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 2 MemoryINTEL "MLAB"
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 MemoryINTEL "MLAB"
 ; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 NumbanksINTEL 2
 ; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 NumbanksINTEL 4
-; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 3 BankwidthINTEL 8
-; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 4 MaxPrivateCopiesINTEL 4
-; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 5 SinglepumpINTEL
-; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 6 DoublepumpINTEL
-; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 8 MaxReplicatesINTEL 4
-; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 9 SimpleDualPortINTEL
-; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 7 MergeINTEL "foobar" "width"
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 NumbanksINTEL 8
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 BankwidthINTEL 4
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 BankwidthINTEL 8
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 MaxPrivateCopiesINTEL 2
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 MaxPrivateCopiesINTEL 4
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 SinglepumpINTEL
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 DoublepumpINTEL
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 MaxReplicatesINTEL 2
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 MaxReplicatesINTEL 4
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 SimpleDualPortINTEL
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 MergeINTEL "foobar" "width"
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 BankBitsINTEL 2 3
 ; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 BankBitsINTEL 42 41 40
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 ForcePow2DepthINTEL 0
+; CHECK-SPIRV: MemberDecorate {{[0-9]+}} 0 ForcePow2DepthINTEL 1
-target datalayout = 
"e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" -target triple = "spir64-unknown-linux" +target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir" %class.anon = type { i8 } -%struct.foo = type { i32, i32, i32, i32, i8, i32, i32, i32, i32, i32 } -%struct.s = type { i32 } - -%struct._ZTSZ20field_addrspace_castvE5state.state = type { [8 x i32] } - -; CHECK-LLVM: [[STR1:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:4} -; CHECK-LLVM: [[STR2:@[0-9_.]+]] = {{.*}}{register:1} -; CHECK-LLVM: [[STR3:@[0-9_.]+]] = {{.*}}{memory:MLAB} -; CHECK-LLVM: [[STR4:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{bankwidth:8} -; CHECK-LLVM: [[STR5:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{private_copies:4} -; CHECK-LLVM: [[STR6:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{pump:1} -; CHECK-LLVM: [[STR7:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{pump:2} -; CHECK-LLVM: [[STR8:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{merge:foobar:width} -; CHECK-LLVM: [[STR9:@[0-9_.]+]] = {{.*}}{max_replicates:4} -; CHECK-LLVM: [[STR10:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{simple_dual_port:1} -; CHECK-LLVM: [[STR12:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:8}{bank_bits:42,41,40} -; CHECK-LLVM: [[STR11:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:2} -@.str = private unnamed_addr constant [29 x i8] c"{memory:DEFAULT}{numbanks:4}\00", section "llvm.metadata" -@.str.1 = private unnamed_addr constant [16 x i8] c"test_struct.cpp\00", section "llvm.metadata" -@.str.2 = private unnamed_addr constant [13 x i8] c"{register:1}\00", section "llvm.metadata" -@.str.3 = private unnamed_addr constant [14 x i8] c"{memory:MLAB}\00", section "llvm.metadata" -@.str.4 = private unnamed_addr constant [30 x i8] c"{memory:DEFAULT}{bankwidth:8}\00", section "llvm.metadata" -@.str.5 = private unnamed_addr constant [35 x i8] c"{memory:DEFAULT}{private_copies:4}\00", section "llvm.metadata" -@.str.6 = private unnamed_addr constant [25 x i8] c"{memory:DEFAULT}{pump:1}\00", section "llvm.metadata" -@.str.7 = private unnamed_addr constant [25 x i8] c"{memory:DEFAULT}{pump:2}\00", section "llvm.metadata" -@.str.8 = private unnamed_addr constant [37 x i8] c"{memory:DEFAULT}{merge:foobar:width}\00", section "llvm.metadata" -@.str.9 = private unnamed_addr constant [19 x i8] c"{max_replicates:4}\00", section "llvm.metadata" -@.str.10 = private unnamed_addr constant [37 x i8] c"{memory:DEFAULT}{simple_dual_port:1}\00", section "llvm.metadata" -@.str.11 = private unnamed_addr constant [29 x i8] c"{memory:DEFAULT}{numbanks:2}\00", section "llvm.metadata" -@.str.12 = private unnamed_addr constant [49 x i8] c"{memory:DEFAULT}{numbanks:8}{bank_bits:42,41,40}\00", section "llvm.metadata" - -; Function Attrs: nounwind +%struct.numbanks_st = type { i32 } +%struct.templ_numbanks_st = type { i32 } +%struct.register_st = type { i32 } +%struct.memory_st = type { i32 } +%struct.bankwidth_st = type { i32 } +%struct.templ_bankwidth_st = type { i32 } +%struct.private_copies_st = type { i32 } +%struct.templ_private_copies_st = type { i32 } +%struct.singlepump_st = type { i32 } +%struct.doublepump_st = type { i32 } +%struct.merge_st = type { i32 } +%struct.max_replicates_st = type { i32 } +%struct.templ_max_replicates_st = type { i32 } +%struct.simple_dual_port_st = type { i32 } +%struct.bank_bits_st = type { i32 } +%struct.templ_bank_bits_st = type { i32 } +%struct.force_pow2_depth_st = type { i32 } +%struct.templ_force_pow2_depth_st = type { i32 } +%struct.state = type { [8 x i32] } + +; CHECK-LLVM: 
[[STR_NMB_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:4} +; CHECK-LLVM: [[STR_NMB_STE:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:8} +; CHECK-LLVM: [[STR_REG_SCT:@[0-9_.]+]] = {{.*}}{register:1} +; CHECK-LLVM: [[STR_MEM_SCT:@[0-9_.]+]] = {{.*}}{memory:MLAB} +; CHECK-LLVM: [[STR_BWD_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{bankwidth:8} +; CHECK-LLVM: [[STR_BWD_STE:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{bankwidth:4} +; CHECK-LLVM: [[STR_PRC_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{private_copies:4} +; CHECK-LLVM: [[STR_PRC_STE:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{private_copies:2} +; CHECK-LLVM: [[STR_SNP_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{pump:1} +; CHECK-LLVM: [[STR_DBP_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{pump:2} +; CHECK-LLVM: [[STR_MRG_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{merge:foobar:width} +; CHECK-LLVM: [[STR_MXR_SCT:@[0-9_.]+]] = {{.*}}{max_replicates:4} +; CHECK-LLVM: [[STR_MXR_STE:@[0-9_.]+]] = {{.*}}{max_replicates:2} +; CHECK-LLVM: [[STR_SDP_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{simple_dual_port:1} +; CHECK-LLVM: [[STR_BBT_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:8}{bank_bits:42,41,40} +; CHECK-LLVM: [[STR_BBT_STE:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:4}{bank_bits:2,3} +; CHECK-LLVM: [[STR_FP2_SCT:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{force_pow2_depth:0} +; CHECK-LLVM: [[STR_FP2_STE:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{force_pow2_depth:1} +; CHECK-LLVM: [[STR_NMB_ASC:@[0-9_.]+]] = {{.*}}{memory:DEFAULT}{numbanks:2} +@.str = private unnamed_addr constant [41 x i8] c"{memory:DEFAULT}{sizeinfo:4}{numbanks:4}\00", section "llvm.metadata" +@.str.1 = private unnamed_addr constant [28 x i8] c"intel-fpga-local-struct.cpp\00", section "llvm.metadata" +@.str.2 = private unnamed_addr constant [41 x i8] c"{memory:DEFAULT}{sizeinfo:4}{numbanks:8}\00", section "llvm.metadata" +@.str.3 = private unnamed_addr constant [13 x i8] c"{register:1}\00", section "llvm.metadata" +@.str.4 = private unnamed_addr constant [26 x i8] c"{memory:MLAB}{sizeinfo:4}\00", section "llvm.metadata" +@.str.5 = private unnamed_addr constant [42 x i8] c"{memory:DEFAULT}{sizeinfo:4}{bankwidth:8}\00", section "llvm.metadata" +@.str.6 = private unnamed_addr constant [42 x i8] c"{memory:DEFAULT}{sizeinfo:4}{bankwidth:4}\00", section "llvm.metadata" +@.str.7 = private unnamed_addr constant [47 x i8] c"{memory:DEFAULT}{sizeinfo:4}{private_copies:4}\00", section "llvm.metadata" +@.str.8 = private unnamed_addr constant [47 x i8] c"{memory:DEFAULT}{sizeinfo:4}{private_copies:2}\00", section "llvm.metadata" +@.str.9 = private unnamed_addr constant [37 x i8] c"{memory:DEFAULT}{sizeinfo:4}{pump:1}\00", section "llvm.metadata" +@.str.10 = private unnamed_addr constant [37 x i8] c"{memory:DEFAULT}{sizeinfo:4}{pump:2}\00", section "llvm.metadata" +@.str.11 = private unnamed_addr constant [49 x i8] c"{memory:DEFAULT}{sizeinfo:4}{merge:foobar:width}\00", section "llvm.metadata" +@.str.12 = private unnamed_addr constant [19 x i8] c"{max_replicates:4}\00", section "llvm.metadata" +@.str.13 = private unnamed_addr constant [19 x i8] c"{max_replicates:2}\00", section "llvm.metadata" +@.str.14 = private unnamed_addr constant [49 x i8] c"{memory:DEFAULT}{sizeinfo:4}{simple_dual_port:1}\00", section "llvm.metadata" +@.str.15 = private unnamed_addr constant [61 x i8] c"{memory:DEFAULT}{sizeinfo:4}{numbanks:8}{bank_bits:42,41,40}\00", section "llvm.metadata" +@.str.16 = private unnamed_addr constant [56 x i8] c"{memory:DEFAULT}{sizeinfo:4}{numbanks:4}{bank_bits:2,3}\00", section "llvm.metadata" +@.str.17 = 
private unnamed_addr constant [49 x i8] c"{memory:DEFAULT}{sizeinfo:4}{force_pow2_depth:0}\00", section "llvm.metadata" +@.str.18 = private unnamed_addr constant [49 x i8] c"{memory:DEFAULT}{sizeinfo:4}{force_pow2_depth:1}\00", section "llvm.metadata" +@.str.19 = private unnamed_addr constant [43 x i8] c"{memory:DEFAULT}{sizeinfo:4,8}{numbanks:2}\00", section "llvm.metadata" + +; Function Attrs: norecurse nounwind define spir_kernel void @_ZTSZ4mainE15kernel_function() #0 !kernel_arg_addr_space !4 !kernel_arg_access_qual !4 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !4 { entry: %0 = alloca %class.anon, align 1 %1 = bitcast %class.anon* %0 to i8* - call void @llvm.lifetime.start.p0i8(i64 1, i8* %1) #4 + call void @llvm.lifetime.start.p0i8(i64 1, i8* %1) #5 call spir_func void @"_ZZ4mainENK3$_0clEv"(%class.anon* %0) %2 = bitcast %class.anon* %0 to i8* - call void @llvm.lifetime.end.p0i8(i64 1, i8* %2) #4 + call void @llvm.lifetime.end.p0i8(i64 1, i8* %2) #5 ret void } -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #1 -; Function Attrs: inlinehint nounwind +; Function Attrs: inlinehint norecurse nounwind define internal spir_func void @"_ZZ4mainENK3$_0clEv"(%class.anon* %this) #2 align 2 { entry: - %this.addr = alloca %class.anon*, align 8 - store %class.anon* %this, %class.anon** %this.addr, align 8, !tbaa !5 - %this1 = load %class.anon*, %class.anon** %this.addr, align 8 - call spir_func void @_Z3barv() - call spir_func void @_Z8bankbitsv() + %this.addr = alloca %class.anon*, align 4 + store %class.anon* %this, %class.anon** %this.addr, align 4, !tbaa !5 + %this1 = load %class.anon*, %class.anon** %this.addr, align 4 + call spir_func void @_Z19field_numbanks_attrv() + call spir_func void @_Z25templ_field_numbanks_attrILi8EEvv() + call spir_func void @_Z19field_register_attrv() + call spir_func void @_Z17field_memory_attrv() + call spir_func void @_Z20field_bankwidth_attrv() + call spir_func void @_Z26templ_field_bankwidth_attrILi4EEvv() + call spir_func void @_Z25field_private_copies_attrv() + call spir_func void @_Z31templ_field_private_copies_attrILi2EEvv() + call spir_func void @_Z21field_singlepump_attrv() + call spir_func void @_Z21field_doublepump_attrv() + call spir_func void @_Z16field_merge_attrv() + call spir_func void @_Z25field_max_replicates_attrv() + call spir_func void @_Z31templ_field_max_replicates_attrILi2EEvv() + call spir_func void @_Z27field_simple_dual_port_attrv() + call spir_func void @_Z20field_bank_bits_attrv() + call spir_func void @_Z26templ_field_bank_bits_attrILi2ELi3EEvv() + call spir_func void @_Z27field_force_pow2_depth_attrv() + call spir_func void @_Z33templ_field_force_pow2_depth_attrILi1EEvv() + call spir_func void @_Z20field_addrspace_castv() ret void } -; Function Attrs: argmemonly nounwind -declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #1 -; Function Attrs: nounwind -define spir_func void @_Z3barv() #3 { +; Function Attrs: norecurse nounwind +define spir_func void @_Z19field_numbanks_attrv() #3 { entry: - %s1 = alloca %struct.foo, align 4 - %0 = bitcast %struct.foo* %s1 to i8* - call void @llvm.lifetime.start.p0i8(i64 20, i8* %0) #4 - ; CHECK-LLVM: %[[FIELD1:.*]] = getelementptr inbounds %struct.foo, %struct.foo* %{{[a-zA-Z0-9]+}}, 
i32 0, i32 0 - ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FIELD1]]{{.*}}[[STR1]] - %f1 = getelementptr inbounds %struct.foo, %struct.foo* %s1, i32 0, i32 0 - %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %f1, i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.1, i32 0, i32 0), i32 2) + %s = alloca %struct.numbanks_st, align 4 + %0 = bitcast %struct.numbanks_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_NMB_SCT:.*]] = getelementptr inbounds %struct.numbanks_st, %struct.numbanks_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_NMB_SCT]]{{.*}}[[STR_NMB_SCT]] + %field = getelementptr inbounds %struct.numbanks_st, %struct.numbanks_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 3) store i32 0, i32* %1, align 4, !tbaa !9 - ; CHECK-LLVM: %[[FIELD2:.*]] = getelementptr inbounds %struct.foo, %struct.foo* %{{[a-zA-Z0-9]+}}, i32 0, i32 1 - ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FIELD2]]{{.*}}[[STR2]] - %f2 = getelementptr inbounds %struct.foo, %struct.foo* %s1, i32 0, i32 1 - %2 = call i32* @llvm.ptr.annotation.p0i32(i32* %f2, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.1, i32 0, i32 0), i32 3) - store i32 0, i32* %2, align 4, !tbaa !12 - ; CHECK-LLVM: %[[FIELD3:.*]] = getelementptr inbounds %struct.foo, %struct.foo* %{{[a-zA-Z0-9]+}}, i32 0, i32 2 - ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FIELD3]]{{.*}}[[STR3]] - %f3 = getelementptr inbounds %struct.foo, %struct.foo* %s1, i32 0, i32 2 - %3 = call i32* @llvm.ptr.annotation.p0i32(i32* %f3, i8* getelementptr inbounds ([14 x i8], [14 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.1, i32 0, i32 0), i32 4) - store i32 0, i32* %3, align 4, !tbaa !13 - ; CHECK-LLVM: %[[FIELD4:.*]] = getelementptr inbounds %struct.foo, %struct.foo* %{{[a-zA-Z0-9]+}}, i32 0, i32 3 - ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FIELD4]]{{.*}}[[STR4]] - %f4 = getelementptr inbounds %struct.foo, %struct.foo* %s1, i32 0, i32 3 - %4 = call i32* @llvm.ptr.annotation.p0i32(i32* %f4, i8* getelementptr inbounds ([30 x i8], [30 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.1, i32 0, i32 0), i32 5) - store i32 0, i32* %4, align 4, !tbaa !14 - ; CHECK-LLVM: %[[FIELD5:.*]] = getelementptr inbounds %struct.foo, %struct.foo* %{{[a-zA-Z0-9]+}}, i32 0, i32 4 - ; CHECK-LLVM: call i8* @llvm.ptr.annotation.p0i8{{.*}}%[[FIELD5]]{{.*}}[[STR5]] - %f5 = getelementptr inbounds %struct.foo, %struct.foo* %s1, i32 0, i32 4 - %5 = call i8* @llvm.ptr.annotation.p0i8(i8* %f5, i8* getelementptr inbounds ([35 x i8], [35 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.1, i32 0, i32 0), i32 6) - store i8 0, i8* %5, align 4, !tbaa !15 - ; CHECK-LLVM: %[[FIELD6:.*]] = getelementptr inbounds %struct.foo, %struct.foo* %{{[a-zA-Z0-9]+}}, i32 0, i32 5 - ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FIELD6]]{{.*}}[[STR6]] - %f6 = getelementptr inbounds %struct.foo, %struct.foo* %s1, i32 0, i32 5 - %6 = call i32* @llvm.ptr.annotation.p0i32(i32* %f6, i8* getelementptr inbounds ([25 x 
i8], [25 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.1, i32 0, i32 0), i32 7) - store i32 0, i32* %6, align 4, !tbaa !16 - ; CHECK-LLVM: %[[FIELD7:.*]] = getelementptr inbounds %struct.foo, %struct.foo* %{{[a-zA-Z0-9]+}}, i32 0, i32 6 - ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FIELD7]]{{.*}}[[STR7]] - %f7 = getelementptr inbounds %struct.foo, %struct.foo* %s1, i32 0, i32 6 - %7 = call i32* @llvm.ptr.annotation.p0i32(i32* %f7, i8* getelementptr inbounds ([25 x i8], [25 x i8]* @.str.7, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.1, i32 0, i32 0), i32 8) - store i32 0, i32* %7, align 4, !tbaa !17 - ; CHECK-LLVM: %[[FIELD8:.*]] = getelementptr inbounds %struct.foo, %struct.foo* %{{[a-zA-Z0-9]+}}, i32 0, i32 7 - ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FIELD8]]{{.*}}[[STR8]] - %f8 = getelementptr inbounds %struct.foo, %struct.foo* %s1, i32 0, i32 7 - %8 = call i32* @llvm.ptr.annotation.p0i32(i32* %f8, i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.8, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.1, i32 0, i32 0), i32 9) - store i32 0, i32* %8, align 4, !tbaa !18 - ; CHECK-LLVM: %[[FIELD9:.*]] = getelementptr inbounds %struct.foo, %struct.foo* %{{[a-zA-Z0-9]+}}, i32 0, i32 8 - ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FIELD9]]{{.*}}[[STR9]] - %f9 = getelementptr inbounds %struct.foo, %struct.foo* %s1, i32 0, i32 8 - %9 = call i32* @llvm.ptr.annotation.p0i32(i32* %f9, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.9, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.1, i32 0, i32 0), i32 10) - store i32 0, i32* %9, align 4, !tbaa !19 - ; CHECK-LLVM: %[[FIELD10:.*]] = getelementptr inbounds %struct.foo, %struct.foo* %{{[a-zA-Z0-9]+}}, i32 0, i32 9 - ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FIELD10]]{{.*}}[[STR10]] - %f10 = getelementptr inbounds %struct.foo, %struct.foo* %s1, i32 0, i32 9 - %10 = call i32* @llvm.ptr.annotation.p0i32(i32* %f10, i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.10, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.1, i32 0, i32 0), i32 11) - store i32 0, i32* %10, align 4, !tbaa !20 - %11 = bitcast %struct.foo* %s1 to i8* - call void @llvm.lifetime.end.p0i8(i64 40, i8* %11) #4 + %2 = bitcast %struct.numbanks_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define linkonce_odr spir_func void @_Z25templ_field_numbanks_attrILi8EEvv() #3 { +entry: + %s = alloca %struct.templ_numbanks_st, align 4 + %0 = bitcast %struct.templ_numbanks_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_NMB_STE:.*]] = getelementptr inbounds %struct.templ_numbanks_st, %struct.templ_numbanks_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_NMB_STE]]{{.*}}[[STR_NMB_STE]] + %field = getelementptr inbounds %struct.templ_numbanks_st, %struct.templ_numbanks_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([41 x i8], [41 x i8]* @.str.2, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 11) + store i32 0, i32* %1, align 4, !tbaa !13 + %2 = bitcast %struct.templ_numbanks_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func 
void @_Z19field_register_attrv() #3 { +entry: + %s = alloca %struct.register_st, align 4 + %0 = bitcast %struct.register_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_REG_SCT:.*]] = getelementptr inbounds %struct.register_st, %struct.register_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_REG_SCT]]{{.*}}[[STR_REG_SCT]] + %field = getelementptr inbounds %struct.register_st, %struct.register_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str.3, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 18) + store i32 0, i32* %1, align 4, !tbaa !15 + %2 = bitcast %struct.register_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z17field_memory_attrv() #3 { +entry: + %s = alloca %struct.memory_st, align 4 + %0 = bitcast %struct.memory_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_MEM_SCT:.*]] = getelementptr inbounds %struct.memory_st, %struct.memory_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_MEM_SCT]]{{.*}}[[STR_MEM_SCT]] + %field = getelementptr inbounds %struct.memory_st, %struct.memory_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([26 x i8], [26 x i8]* @.str.4, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 25) + store i32 0, i32* %1, align 4, !tbaa !17 + %2 = bitcast %struct.memory_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z20field_bankwidth_attrv() #3 { +entry: + %s = alloca %struct.bankwidth_st, align 4 + %0 = bitcast %struct.bankwidth_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + %field = getelementptr inbounds %struct.bankwidth_st, %struct.bankwidth_st* %s, i32 0, i32 0 + ; CHECK-LLVM: %[[FLD_BWD_SCT:.*]] = getelementptr inbounds %struct.bankwidth_st, %struct.bankwidth_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_BWD_SCT]]{{.*}}[[STR_BWD_SCT]] + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.5, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 32) + store i32 0, i32* %1, align 4, !tbaa !19 + %2 = bitcast %struct.bankwidth_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define linkonce_odr spir_func void @_Z26templ_field_bankwidth_attrILi4EEvv() #3 { +entry: + %s = alloca %struct.templ_bankwidth_st, align 4 + %0 = bitcast %struct.templ_bankwidth_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_BWD_STE:.*]] = getelementptr inbounds %struct.templ_bankwidth_st, %struct.templ_bankwidth_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_BWD_STE]]{{.*}}[[STR_BWD_STE]] + %field = getelementptr inbounds %struct.templ_bankwidth_st, %struct.templ_bankwidth_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([42 x i8], [42 x i8]* @.str.6, i32 0, i32 0), i8* getelementptr 
inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 40) + store i32 0, i32* %1, align 4, !tbaa !21 + %2 = bitcast %struct.templ_bankwidth_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 ret void } -; Function Attrs: nounwind -define spir_func void @_Z8bankbitsv() #3 { +; Function Attrs: norecurse nounwind +define spir_func void @_Z25field_private_copies_attrv() #3 { entry: - %s2 = alloca %struct.s, align 4 - %0 = bitcast %struct.s* %s2 to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #4 - ; CHECK-LLVM: %[[FIELD:.*]] = getelementptr inbounds %struct.s, %struct.s* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 - ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FIELD]]{{.*}}[[STR12]] - %a = getelementptr inbounds %struct.s, %struct.s* %s2, i32 0, i32 0 - %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %a, i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str.12, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.1, i32 0, i32 0), i32 84) - store i32 0, i32* %1, align 4, !tbaa !22 - %2 = bitcast %struct.s* %s2 to i8* - call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #4 + %s = alloca %struct.private_copies_st, align 4 + %0 = bitcast %struct.private_copies_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_PRC_SCT:.*]] = getelementptr inbounds %struct.private_copies_st, %struct.private_copies_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_PRC_SCT]]{{.*}}[[STR_PRC_SCT]] + %field = getelementptr inbounds %struct.private_copies_st, %struct.private_copies_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.7, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 47) + store i32 0, i32* %1, align 4, !tbaa !23 + %2 = bitcast %struct.private_copies_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 ret void } +; Function Attrs: norecurse nounwind +define linkonce_odr spir_func void @_Z31templ_field_private_copies_attrILi2EEvv() #3 { +entry: + %s = alloca %struct.templ_private_copies_st, align 4 + %0 = bitcast %struct.templ_private_copies_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_PRC_STE:.*]] = getelementptr inbounds %struct.templ_private_copies_st, %struct.templ_private_copies_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_PRC_STE]]{{.*}}[[STR_PRC_STE]] + %field = getelementptr inbounds %struct.templ_private_copies_st, %struct.templ_private_copies_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([47 x i8], [47 x i8]* @.str.8, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 55) + store i32 0, i32* %1, align 4, !tbaa !25 + %2 = bitcast %struct.templ_private_copies_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z21field_singlepump_attrv() #3 { +entry: + %s = alloca %struct.singlepump_st, align 4 + %0 = bitcast %struct.singlepump_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_SNP_SCT:.*]] = getelementptr inbounds %struct.singlepump_st, %struct.singlepump_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* 
@llvm.ptr.annotation.p0i32{{.*}}%[[FLD_SNP_SCT]]{{.*}}[[STR_SNP_SCT]] + %field = getelementptr inbounds %struct.singlepump_st, %struct.singlepump_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.9, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 62) + store i32 0, i32* %1, align 4, !tbaa !27 + %2 = bitcast %struct.singlepump_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z21field_doublepump_attrv() #3 { +entry: + %s = alloca %struct.doublepump_st, align 4 + %0 = bitcast %struct.doublepump_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_DBP_SCT:.*]] = getelementptr inbounds %struct.doublepump_st, %struct.doublepump_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_DBP_SCT]]{{.*}}[[STR_DBP_SCT]] + %field = getelementptr inbounds %struct.doublepump_st, %struct.doublepump_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([37 x i8], [37 x i8]* @.str.10, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 69) + store i32 0, i32* %1, align 4, !tbaa !29 + %2 = bitcast %struct.doublepump_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z16field_merge_attrv() #3 { +entry: + %s = alloca %struct.merge_st, align 4 + %0 = bitcast %struct.merge_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_MRG_SCT:.*]] = getelementptr inbounds %struct.merge_st, %struct.merge_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_MRG_SCT]]{{.*}}[[STR_MRG_SCT]] + %field = getelementptr inbounds %struct.merge_st, %struct.merge_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str.11, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 76) + store i32 0, i32* %1, align 4, !tbaa !31 + %2 = bitcast %struct.merge_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z25field_max_replicates_attrv() #3 { +entry: + %s = alloca %struct.max_replicates_st, align 4 + %0 = bitcast %struct.max_replicates_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_MXR_SCT:.*]] = getelementptr inbounds %struct.max_replicates_st, %struct.max_replicates_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_MXR_SCT]]{{.*}}[[STR_MXR_SCT]] + %field = getelementptr inbounds %struct.max_replicates_st, %struct.max_replicates_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.12, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 83) + store i32 0, i32* %1, align 4, !tbaa !33 + %2 = bitcast %struct.max_replicates_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define linkonce_odr spir_func void @_Z31templ_field_max_replicates_attrILi2EEvv() #3 { +entry: + %s = 
alloca %struct.templ_max_replicates_st, align 4 + %0 = bitcast %struct.templ_max_replicates_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_MXR_STE:.*]] = getelementptr inbounds %struct.templ_max_replicates_st, %struct.templ_max_replicates_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_MXR_STE]]{{.*}}[[STR_MXR_STE]] + %field = getelementptr inbounds %struct.templ_max_replicates_st, %struct.templ_max_replicates_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([19 x i8], [19 x i8]* @.str.13, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 91) + store i32 0, i32* %1, align 4, !tbaa !35 + %2 = bitcast %struct.templ_max_replicates_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z27field_simple_dual_port_attrv() #3 { +entry: + %s = alloca %struct.simple_dual_port_st, align 4 + %0 = bitcast %struct.simple_dual_port_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_SDP_SCT:.*]] = getelementptr inbounds %struct.simple_dual_port_st, %struct.simple_dual_port_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_SDP_SCT]]{{.*}}[[STR_SDP_SCT]] + %field = getelementptr inbounds %struct.simple_dual_port_st, %struct.simple_dual_port_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str.14, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 98) + store i32 0, i32* %1, align 4, !tbaa !37 + %2 = bitcast %struct.simple_dual_port_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z20field_bank_bits_attrv() #3 { +entry: + %s = alloca %struct.bank_bits_st, align 4 + %0 = bitcast %struct.bank_bits_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_BBT_SCT:.*]] = getelementptr inbounds %struct.bank_bits_st, %struct.bank_bits_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_BBT_SCT]]{{.*}}[[STR_BBT_SCT]] + %field = getelementptr inbounds %struct.bank_bits_st, %struct.bank_bits_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([61 x i8], [61 x i8]* @.str.15, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 105) + store i32 0, i32* %1, align 4, !tbaa !39 + %2 = bitcast %struct.bank_bits_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define linkonce_odr spir_func void @_Z26templ_field_bank_bits_attrILi2ELi3EEvv() #3 { +entry: + %s = alloca %struct.templ_bank_bits_st, align 4 + %0 = bitcast %struct.templ_bank_bits_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_BBT_STE:.*]] = getelementptr inbounds %struct.templ_bank_bits_st, %struct.templ_bank_bits_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_BBT_STE]]{{.*}}[[STR_BBT_STE]] + %field = getelementptr inbounds %struct.templ_bank_bits_st, %struct.templ_bank_bits_st* %s, i32 0, i32 0 + %1 = call i32* 
@llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([56 x i8], [56 x i8]* @.str.16, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 113) + store i32 0, i32* %1, align 4, !tbaa !41 + %2 = bitcast %struct.templ_bank_bits_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define spir_func void @_Z27field_force_pow2_depth_attrv() #3 { +entry: + %s = alloca %struct.force_pow2_depth_st, align 4 + %0 = bitcast %struct.force_pow2_depth_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_FP2_SCT:.*]] = getelementptr inbounds %struct.force_pow2_depth_st, %struct.force_pow2_depth_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_FP2_SCT]]{{.*}}[[STR_FP2_SCT]] + %field = getelementptr inbounds %struct.force_pow2_depth_st, %struct.force_pow2_depth_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str.17, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 120) + store i32 0, i32* %1, align 4, !tbaa !43 + %2 = bitcast %struct.force_pow2_depth_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define linkonce_odr spir_func void @_Z33templ_field_force_pow2_depth_attrILi1EEvv() #3 { +entry: + %s = alloca %struct.templ_force_pow2_depth_st, align 4 + %0 = bitcast %struct.templ_force_pow2_depth_st* %s to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + ; CHECK-LLVM: %[[FLD_FP2_STE:.*]] = getelementptr inbounds %struct.templ_force_pow2_depth_st, %struct.templ_force_pow2_depth_st* %{{[a-zA-Z0-9]+}}, i32 0, i32 0 + ; CHECK-LLVM: call i32* @llvm.ptr.annotation.p0i32{{.*}}%[[FLD_FP2_STE]]{{.*}}[[STR_FP2_STE]] + %field = getelementptr inbounds %struct.templ_force_pow2_depth_st, %struct.templ_force_pow2_depth_st* %s, i32 0, i32 0 + %1 = call i32* @llvm.ptr.annotation.p0i32(i32* %field, i8* getelementptr inbounds ([49 x i8], [49 x i8]* @.str.18, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 128) + store i32 0, i32* %1, align 4, !tbaa !45 + %2 = bitcast %struct.templ_force_pow2_depth_st* %s to i8* + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 + ret void +} + +; Function Attrs: norecurse nounwind define spir_func void @_Z20field_addrspace_castv() #3 { entry: - %state_var = alloca %struct._ZTSZ20field_addrspace_castvE5state.state, align 4 - %0 = bitcast %struct._ZTSZ20field_addrspace_castvE5state.state* %state_var to i8* - call void @llvm.lifetime.start.p0i8(i64 32, i8* %0) #4 - %1 = addrspacecast %struct._ZTSZ20field_addrspace_castvE5state.state* %state_var to %struct._ZTSZ20field_addrspace_castvE5state.state addrspace(4)* - call spir_func void @_ZZ20field_addrspace_castvEN5stateC2Ev(%struct._ZTSZ20field_addrspace_castvE5state.state addrspace(4)* %1) - %mem = getelementptr inbounds %struct._ZTSZ20field_addrspace_castvE5state.state, %struct._ZTSZ20field_addrspace_castvE5state.state* %state_var, i32 0, i32 0 - ; CHECK-LLVM: %[[GEP:.*]] = getelementptr inbounds %struct._ZTSZ20field_addrspace_castvE5state.state, %struct._ZTSZ20field_addrspace_castvE5state.state* %state_var, i32 0, i32 0 - ; CHECK-LLVM: %[[CAST11:.*]] = bitcast [8 x i32]* %[[GEP:.*]] to i8* - ; CHECK-LLVM: %{{[0-9]+}} = call i8* @llvm.ptr.annotation.p0i8(i8* 
%[[CAST11]]{{.*}}[[STR11]] - %2 = bitcast [8 x i32]* %mem to i8* - %3 = call i8* @llvm.ptr.annotation.p0i8(i8* %2, i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.11, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.1, i32 0, i32 0), i32 24) - %4 = bitcast i8* %3 to [8 x i32]* - %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %4, i64 0, i64 0 - store i32 42, i32* %arrayidx, align 4, !tbaa !9 - %5 = bitcast %struct._ZTSZ20field_addrspace_castvE5state.state* %state_var to i8* - call void @llvm.lifetime.end.p0i8(i64 32, i8* %5) #4 + %state_var = alloca %struct.state, align 4 + %0 = bitcast %struct.state* %state_var to i8* + call void @llvm.lifetime.start.p0i8(i64 32, i8* %0) #5 + call spir_func void @_ZZ20field_addrspace_castvEN5stateC1Ev(%struct.state* %state_var) + ; CHECK-LLVM: %[[GEP:.*]] = getelementptr inbounds %struct.state, %struct.state* %state_var, i32 0, i32 0 + ; CHECK-LLVM: %[[CAST:.*]] = bitcast [8 x i32]* %[[GEP:.*]] to i8* + ; CHECK-LLVM: %{{[0-9]+}} = call i8* @llvm.ptr.annotation.p0i8(i8* %[[CAST]]{{.*}}[[STR_NMB_ASC]] + %mem = getelementptr inbounds %struct.state, %struct.state* %state_var, i32 0, i32 0 + %1 = bitcast [8 x i32]* %mem to i8* + %2 = call i8* @llvm.ptr.annotation.p0i8(i8* %1, i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str.19, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 120) + %3 = bitcast i8* %2 to [8 x i32]* + %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %3, i32 0, i32 0 + store i32 42, i32* %arrayidx, align 4, !tbaa !12 + %4 = bitcast %struct.state* %state_var to i8* + call void @llvm.lifetime.end.p0i8(i64 32, i8* %4) #5 + ret void +} + +; Function Attrs: norecurse nounwind +define internal spir_func void @_ZZ20field_addrspace_castvEN5stateC1Ev(%struct.state* %this) unnamed_addr #3 align 2 { +entry: + %this.addr = alloca %struct.state*, align 4 + store %struct.state* %this, %struct.state** %this.addr, align 4, !tbaa !5 + %this1 = load %struct.state*, %struct.state** %this.addr, align 4 + call spir_func void @_ZZ20field_addrspace_castvEN5stateC2Ev(%struct.state* %this1) ret void } -define internal spir_func void @_ZZ20field_addrspace_castvEN5stateC2Ev(%struct._ZTSZ20field_addrspace_castvE5state.state addrspace(4)* %this) unnamed_addr #3 align 2 { +; Function Attrs: norecurse nounwind +define internal spir_func void @_ZZ20field_addrspace_castvEN5stateC2Ev(%struct.state* %this) unnamed_addr #3 align 2 { entry: - %this.addr = alloca %struct._ZTSZ20field_addrspace_castvE5state.state addrspace(4)*, align 8 + %this.addr = alloca %struct.state*, align 4 %i = alloca i32, align 4 - store %struct._ZTSZ20field_addrspace_castvE5state.state addrspace(4)* %this, %struct._ZTSZ20field_addrspace_castvE5state.state addrspace(4)** %this.addr, align 8, !tbaa !5 - %this1 = load %struct._ZTSZ20field_addrspace_castvE5state.state addrspace(4)*, %struct._ZTSZ20field_addrspace_castvE5state.state addrspace(4)** %this.addr, align 8 + store %struct.state* %this, %struct.state** %this.addr, align 4, !tbaa !5 + %this1 = load %struct.state*, %struct.state** %this.addr, align 4 %0 = bitcast i32* %i to i8* - call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #4 - store i32 0, i32* %i, align 4, !tbaa !9 + call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #5 + store i32 0, i32* %i, align 4, !tbaa !12 br label %for.cond for.cond: ; preds = %for.inc, %entry - %1 = load i32, i32* %i, align 4, !tbaa !9 + %1 = load i32, i32* %i, align 4, !tbaa !12 %cmp = icmp slt i32 %1, 8 br i1 %cmp, label 
%for.body, label %for.cond.cleanup for.cond.cleanup: ; preds = %for.cond %2 = bitcast i32* %i to i8* - call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #4 + call void @llvm.lifetime.end.p0i8(i64 4, i8* %2) #5 br label %for.end for.body: ; preds = %for.cond - %3 = load i32, i32* %i, align 4, !tbaa !9 - %mem = getelementptr inbounds %struct._ZTSZ20field_addrspace_castvE5state.state, %struct._ZTSZ20field_addrspace_castvE5state.state addrspace(4)* %this1, i32 0, i32 0 + %3 = load i32, i32* %i, align 4, !tbaa !12 + %mem = getelementptr inbounds %struct.state, %struct.state* %this1, i32 0, i32 0 ; FIXME: currently llvm.ptr.annotation is not emitted for c'tors, need to fix it and add a check here - %4 = bitcast [8 x i32] addrspace(4)* %mem to i8 addrspace(4)* - %5 = call i8 addrspace(4)* @llvm.ptr.annotation.p4i8(i8 addrspace(4)* %4, i8* getelementptr inbounds ([29 x i8], [29 x i8]* @.str.11, i32 0, i32 0), i8* getelementptr inbounds ([16 x i8], [16 x i8]* @.str.1, i32 0, i32 0), i32 24) - %6 = bitcast i8 addrspace(4)* %5 to [8 x i32] addrspace(4)* - %7 = load i32, i32* %i, align 4, !tbaa !9 - %idxprom = sext i32 %7 to i64 - %arrayidx = getelementptr inbounds [8 x i32], [8 x i32] addrspace(4)* %6, i64 0, i64 %idxprom - store i32 %3, i32 addrspace(4)* %arrayidx, align 4, !tbaa !9 + %4 = bitcast [8 x i32]* %mem to i8* + %5 = call i8* @llvm.ptr.annotation.p0i8(i8* %4, i8* getelementptr inbounds ([43 x i8], [43 x i8]* @.str.19, i32 0, i32 0), i8* getelementptr inbounds ([28 x i8], [28 x i8]* @.str.1, i32 0, i32 0), i32 120) + %6 = bitcast i8* %5 to [8 x i32]* + %7 = load i32, i32* %i, align 4, !tbaa !12 + %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %6, i32 0, i32 %7 + store i32 %3, i32* %arrayidx, align 4, !tbaa !12 br label %for.inc for.inc: ; preds = %for.body - %8 = load i32, i32* %i, align 4, !tbaa !9 + %8 = load i32, i32* %i, align 4, !tbaa !12 %inc = add nsw i32 %8, 1 - store i32 %inc, i32* %i, align 4, !tbaa !9 + store i32 %inc, i32* %i, align 4, !tbaa !12 br label %for.cond for.end: ; preds = %for.cond.cleanup ret void } -; Function Attrs: nounwind +; Function Attrs: nounwind willreturn declare i8* @llvm.ptr.annotation.p0i8(i8*, i8*, i8*, i32) #4 -; Function Attrs: nounwind +; Function Attrs: nounwind willreturn declare i32* @llvm.ptr.annotation.p0i32(i32*, i8*, i8*, i32) #4 -; Function Attrs: nounwind -declare i8 addrspace(4)* @llvm.ptr.annotation.p4i8(i8 addrspace(4)*, i8*, i8*, i32) #4 - -attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #1 = { argmemonly nounwind } -attributes #2 = { inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #3 = { nounwind optnone noinline "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" 
"no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } -attributes #4 = { nounwind } +attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "sycl-module-id"="intel-fpga-local-struct.cpp" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn } +attributes #2 = { inlinehint norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind willreturn } +attributes #5 = { nounwind } !llvm.module.flags = !{!0} !opencl.spir.version = !{!1} @@ -257,24 +711,47 @@ attributes #4 = { nounwind } !0 = !{i32 1, !"wchar_size", i32 4} !1 = !{i32 1, i32 2} !2 = !{i32 4, i32 100000} -!3 = !{!"clang version 9.0.0"} +!3 = !{!"clang version 11.0.0"} !4 = !{} !5 = !{!6, !6, i64 0} !6 = !{!"any pointer", !7, i64 0} !7 = !{!"omnipotent char", !8, i64 0} !8 = !{!"Simple C++ TBAA"} !9 = !{!10, !11, i64 0} -!10 = !{!"_ZTS3foo", !11, i64 0, !11, i64 4, !11, i64 8, !11, i64 12, !11, i64 16, !11, i64 20, !11, i64 24, !11, i64 28, !11, i64 32, !11, i64 36} +!10 = !{!"_ZTSZ19field_numbanks_attrvE11numbanks_st", !11, i64 0} !11 = !{!"int", !7, i64 0} -!12 = !{!10, !11, i64 4} -!13 = !{!10, !11, i64 8} -!14 = !{!10, !11, i64 12} -!15 = !{!10, !11, i64 16} -!16 = !{!10, !11, i64 20} -!17 = !{!10, !11, i64 24} -!18 = !{!10, !11, i64 28} -!19 = !{!10, !11, i64 32} -!20 = !{!10, !11, i64 36} -!21 = !{!10, !11, i64 40} -!22 = !{!23, !11, i64 0} -!23 = !{!"s", !11, i64 0} +!12 = !{!11, !11, i64 0} +!13 = !{!14, !11, i64 0} +!14 = !{!"_ZTSZ25templ_field_numbanks_attrILi8EEvvE17templ_numbanks_st", !11, i64 0} +!15 = !{!16, !11, i64 0} +!16 = !{!"_ZTSZ19field_register_attrvE11register_st", !11, i64 0} +!17 = !{!18, !11, i64 0} +!18 = !{!"_ZTSZ17field_memory_attrvE9memory_st", !11, i64 0} +!19 = !{!20, !11, i64 0} +!20 = !{!"_ZTSZ20field_bankwidth_attrvE12bankwidth_st", !11, i64 0} +!21 = !{!22, !11, i64 0} +!22 = !{!"_ZTSZ26templ_field_bankwidth_attrILi4EEvvE18templ_bankwidth_st", !11, i64 0} +!23 = !{!24, !11, i64 0} +!24 = !{!"_ZTSZ25field_private_copies_attrvE17private_copies_st", !11, i64 0} +!25 = !{!26, !11, i64 0} +!26 = !{!"_ZTSZ31templ_field_private_copies_attrILi2EEvvE23templ_private_copies_st", !11, i64 0} +!27 = !{!28, !11, i64 0} +!28 = 
!{!"_ZTSZ21field_singlepump_attrvE13singlepump_st", !11, i64 0} +!29 = !{!30, !11, i64 0} +!30 = !{!"_ZTSZ21field_doublepump_attrvE13doublepump_st", !11, i64 0} +!31 = !{!32, !11, i64 0} +!32 = !{!"_ZTSZ16field_merge_attrvE8merge_st", !11, i64 0} +!33 = !{!34, !11, i64 0} +!34 = !{!"_ZTSZ25field_max_replicates_attrvE17max_replicates_st", !11, i64 0} +!35 = !{!36, !11, i64 0} +!36 = !{!"_ZTSZ31templ_field_max_replicates_attrILi2EEvvE23templ_max_replicates_st", !11, i64 0} +!37 = !{!38, !11, i64 0} +!38 = !{!"_ZTSZ27field_simple_dual_port_attrvE19simple_dual_port_st", !11, i64 0} +!39 = !{!40, !11, i64 0} +!40 = !{!"_ZTSZ20field_bank_bits_attrvE12bank_bits_st", !11, i64 0} +!41 = !{!42, !11, i64 0} +!42 = !{!"_ZTSZ26templ_field_bank_bits_attrILi2ELi3EEvvE18templ_bank_bits_st", !11, i64 0} +!43 = !{!44, !11, i64 0} +!44 = !{!"_ZTSZ27field_force_pow2_depth_attrvE19force_pow2_depth_st", !11, i64 0} +!45 = !{!46, !11, i64 0} +!46 = !{!"_ZTSZ33templ_field_force_pow2_depth_attrILi1EEvvE25templ_force_pow2_depth_st", !11, i64 0} diff --git a/llvm-spirv/test/llvm.ceil.ll b/llvm-spirv/test/llvm.ceil.ll new file mode 100644 index 0000000000000..6bad58ab28083 --- /dev/null +++ b/llvm-spirv/test/llvm.ceil.ll @@ -0,0 +1,66 @@ +; RUN: llvm-as %s -o %t.bc +; RUN: llvm-spirv %t.bc -spirv-text -o - | FileCheck %s +; RUN: llvm-spirv %t.bc -o %t.spv +; RUN: spirv-val %t.spv + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; CHECK: ExtInstImport [[extinst_id:[0-9]+]] "OpenCL.std" + +; CHECK: 3 TypeFloat [[var1:[0-9]+]] 32 +; CHECK: 3 TypeFloat [[var2:[0-9]+]] 64 +; CHECK: 4 TypeVector [[var3:[0-9]+]] 2 4 + +; CHECK: Function +; CHECK: 6 ExtInst [[var1]] {{[0-9]+}} [[extinst_id]] ceil +; CHECK: FunctionEnd + +; Function Attrs: nounwind readnone +define spir_func float @TestCeil32(float %x) local_unnamed_addr #0 { +entry: + %0 = tail call float @llvm.ceil.f32(float %x) + ret float %0 +} + +; CHECK: Function +; CHECK: 6 ExtInst [[var2]] {{[0-9]+}} [[extinst_id]] ceil +; CHECK: FunctionEnd + +; Function Attrs: nounwind readnone +define spir_func double @TestCeil64(double %x) local_unnamed_addr #0 { +entry: + %0 = tail call double @llvm.ceil.f64(double %x) + ret double %0 +} + +; CHECK: Function +; CHECK: 6 ExtInst [[var3]] {{[0-9]+}} [[extinst_id]] ceil +; CHECK: FunctionEnd + +; Function Attrs: nounwind readnone +define spir_func <4 x float> @TestCeilVec(<4 x float> %x) local_unnamed_addr #0 { +entry: + %0 = tail call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) + ret <4 x float> %0 +} + +; Function Attrs: nounwind readnone +declare float @llvm.ceil.f32(float) #1 + +; Function Attrs: nounwind readnone +declare double @llvm.ceil.f64(double) #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.ceil.v4f32(<4 x float>) #1 + +attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone speculatable willreturn } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 0} +!2 = !{i32 1, i32 2} diff --git 
a/llvm-spirv/test/llvm.fabs.ll b/llvm-spirv/test/llvm.fabs.ll new file mode 100644 index 0000000000000..c850da9652cc1 --- /dev/null +++ b/llvm-spirv/test/llvm.fabs.ll @@ -0,0 +1,66 @@ +; RUN: llvm-as %s -o %t.bc +; RUN: llvm-spirv %t.bc -spirv-text -o - | FileCheck %s +; RUN: llvm-spirv %t.bc -o %t.spv +; RUN: spirv-val %t.spv + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024" +target triple = "spir64-unknown-unknown" + +; CHECK: ExtInstImport [[extinst_id:[0-9]+]] "OpenCL.std" + +; CHECK: 3 TypeFloat [[var1:[0-9]+]] 32 +; CHECK: 3 TypeFloat [[var2:[0-9]+]] 64 +; CHECK: 4 TypeVector [[var3:[0-9]+]] 2 4 + +; CHECK: Function +; CHECK: 6 ExtInst [[var1]] {{[0-9]+}} [[extinst_id]] fabs +; CHECK: FunctionEnd + +; Function Attrs: nounwind readnone +define spir_func float @TestFabs32(float %x) local_unnamed_addr #0 { +entry: + %0 = tail call float @llvm.fabs.f32(float %x) + ret float %0 +} + +; CHECK: Function +; CHECK: 6 ExtInst [[var2]] {{[0-9]+}} [[extinst_id]] fabs +; CHECK: FunctionEnd + +; Function Attrs: nounwind readnone +define spir_func double @TestFabs64(double %x) local_unnamed_addr #0 { +entry: + %0 = tail call double @llvm.fabs.f64(double %x) + ret double %0 +} + +; CHECK: Function +; CHECK: 6 ExtInst [[var3]] {{[0-9]+}} [[extinst_id]] fabs +; CHECK: FunctionEnd + +; Function Attrs: nounwind readnone +define spir_func <4 x float> @TestFabsVec(<4 x float> %x) local_unnamed_addr #0 { +entry: + %0 = tail call <4 x float> @llvm.fabs.v4f32(<4 x float> %x) + ret <4 x float> %0 +} + +; Function Attrs: nounwind readnone +declare float @llvm.fabs.f32(float) #1 + +; Function Attrs: nounwind readnone +declare double @llvm.fabs.f64(double) #1 + +; Function Attrs: nounwind readnone +declare <4 x float> @llvm.fabs.v4f32(<4 x float>) #1 + +attributes #0 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "denorms-are-zero"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone speculatable willreturn } + +!llvm.module.flags = !{!0} +!opencl.ocl.version = !{!1} +!opencl.spir.version = !{!2} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 1, i32 0} +!2 = !{i32 1, i32 2} diff --git a/llvm-spirv/test/transcoding/FPGALoopMergeInst.ll b/llvm-spirv/test/transcoding/FPGALoopMergeInst.ll index e7f7f70fc42f1..3ae05d91a6729 100644 --- a/llvm-spirv/test/transcoding/FPGALoopMergeInst.ll +++ b/llvm-spirv/test/transcoding/FPGALoopMergeInst.ll @@ -44,6 +44,46 @@ ; } ; } +; void loop_pipelining() { +; int a[10]; +; [[intelfpga::disable_loop_pipelining]] +; for (int i = 0; i != 10; ++i) +; a[i] = 0; +; } + +; void loop_coalesce() { +; int i = 0, m = 42; +; [[intelfpga::loop_coalesce(4)]] +; while (i < m) { +; if (i % 2) { +; ++i; +; continue; +; } +; } +; i = 0; +; [[intelfpga::loop_coalesce]] +; while (i < m) { +; if (i % 3) { +; ++i; +; continue; +; } +; } +; } + +; void max_interleaving() { +; int a[10]; +; [[intelfpga::max_interleaving(3)]] +; for (int i = 0; i != 10; ++i) +; a[i] = 0; +; } + +; void speculated_iterations() { +; int a[10]; +; [[intelfpga::speculated_iterations(4)]] +; for (int i = 0; i != 10; ++i) +; a[i] = 0; +; } + ; TODO: This source code will result in different LLVM IR after ; rev 
[a47242e4b2c1c9] of https://github.com/intel/llvm (the ; [[intelfpga::ivdep]] attribute will be represented otherwise). @@ -228,6 +268,175 @@ while.end30: ; preds = %if.then28 ret void } +; Function Attrs: noinline nounwind optnone +define spir_func void @loop_pipelining() #3 { +entry: + %a = alloca [10 x i32], align 4 + %i = alloca i32, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +; Per SPIR-V spec, LoopControlPipelineEnableINTEL = 0x80000 (524288) +; CHECK-SPIRV: 5 LoopMerge {{[0-9]+}} {{[0-9]+}} 524288 1 +; CHECK-SPIRV-NEXT: 4 BranchConditional {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} +; CHECK-SPIRV-NEGATIVE-NOT: 5 LoopMerge {{[0-9]+}} {{[0-9]+}} 524288 1 +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp ne i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %a, i64 0, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %2 = load i32, i32* %i, align 4 + %inc = add nsw i32 %2, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond, !llvm.loop !19 + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline nounwind optnone +define spir_func void @loop_coalesce() #3 { +entry: + %i = alloca i32, align 4 + %m = alloca i32, align 4 + store i32 0, i32* %i, align 4 + store i32 42, i32* %m, align 4 + br label %while.cond + +; Per SPIR-V spec, LoopControlLoopCoalesceINTEL = 0x100000 (1048576) +; CHECK-SPIRV: 5 LoopMerge {{[0-9]+}} {{[0-9]+}} 1048576 4 +; CHECK-SPIRV-NEXT: 4 BranchConditional {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} +; CHECK-SPIRV-NEGATIVE-NOT: 5 LoopMerge {{[0-9]+}} {{[0-9]+}} 1048576 4 +while.cond: ; preds = %if.end, %if.then, %entry + %0 = load i32, i32* %i, align 4 + %1 = load i32, i32* %m, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %while.body, label %while.end + +while.body: ; preds = %while.cond + %2 = load i32, i32* %i, align 4 + %rem = srem i32 %2, 2 + %tobool = icmp ne i32 %rem, 0 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %while.body + %3 = load i32, i32* %i, align 4 + %inc = add nsw i32 %3, 1 + store i32 %inc, i32* %i, align 4 + br label %while.cond, !llvm.loop !21 + +if.end: ; preds = %while.body + br label %while.cond, !llvm.loop !21 + +while.end: ; preds = %while.cond + store i32 0, i32* %i, align 4 + br label %while.cond1 + +; Per SPIR-V spec, LoopControlLoopCoalesceINTEL = 0x100000 (1048576) +; CHECK-SPIRV: 4 LoopMerge {{[0-9]+}} {{[0-9]+}} 1048576 +; CHECK-SPIRV-NEXT: 4 BranchConditional {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} +; CHECK-SPIRV-NEGATIVE-NOT: 4 LoopMerge {{[0-9]+}} {{[0-9]+}} 1048576 +while.cond1: ; preds = %if.end8, %if.then6, %while.end + %4 = load i32, i32* %i, align 4 + %5 = load i32, i32* %m, align 4 + %cmp2 = icmp slt i32 %4, %5 + br i1 %cmp2, label %while.body3, label %while.end9 + +while.body3: ; preds = %while.cond1 + %6 = load i32, i32* %i, align 4 + %rem4 = srem i32 %6, 3 + %tobool5 = icmp ne i32 %rem4, 0 + br i1 %tobool5, label %if.then6, label %if.end8 + +if.then6: ; preds = %while.body3 + %7 = load i32, i32* %i, align 4 + %inc7 = add nsw i32 %7, 1 + store i32 %inc7, i32* %i, align 4 + br label %while.cond1, !llvm.loop !23 + +if.end8: ; preds = %while.body3 + br label %while.cond1, !llvm.loop !23 + +while.end9: ; preds = %while.cond1 + ret void +} + +; Function Attrs: noinline nounwind optnone +define spir_func void 
@max_interleaving() #3 { +entry: + %a = alloca [10 x i32], align 4 + %i = alloca i32, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +; Per SPIR-V spec, LoopControlMaxInterleavingINTEL = 0x200000 (2097152) +; CHECK-SPIRV: 5 LoopMerge {{[0-9]+}} {{[0-9]+}} 2097152 3 +; CHECK-SPIRV-NEXT: 4 BranchConditional {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} +; CHECK-SPIRV-NEGATIVE-NOT: 5 LoopMerge {{[0-9]+}} {{[0-9]+}} 2097152 3 +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp ne i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %a, i64 0, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %2 = load i32, i32* %i, align 4 + %inc = add nsw i32 %2, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond, !llvm.loop !25 + +for.end: ; preds = %for.cond + ret void +} + +; Function Attrs: noinline nounwind optnone +define spir_func void @speculated_iterations() #3 { +entry: + %a = alloca [10 x i32], align 4 + %i = alloca i32, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +; Per SPIR-V spec, LoopControlSpeculatedIterationsINTEL = 0x400000 (4194304) +; CHECK-SPIRV: 5 LoopMerge {{[0-9]+}} {{[0-9]+}} 4194304 4 +; CHECK-SPIRV-NEXT: 4 BranchConditional {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} +; CHECK-SPIRV-NEGATIVE-NOT: 5 LoopMerge {{[0-9]+}} {{[0-9]+}} 4194304 4 +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp ne i32 %0, 10 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %1 = load i32, i32* %i, align 4 + %idxprom = sext i32 %1 to i64 + %arrayidx = getelementptr inbounds [10 x i32], [10 x i32]* %a, i64 0, i64 %idxprom + store i32 0, i32* %arrayidx, align 4 + br label %for.inc + +for.inc: ; preds = %for.body + %2 = load i32, i32* %i, align 4 + %inc = add nsw i32 %2, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond, !llvm.loop !27 + +for.end: ; preds = %for.cond + ret void +} + attributes #0 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "sycl-module-id"="FPGALoopMergeInst.cpp" "uniform-work-group-size"="true" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { argmemonly nounwind willreturn } attributes #2 = { inlinehint nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } @@ -258,11 +467,26 @@ attributes #4 = { nounwind } !16 = !{!"llvm.loop.max_concurrency.count", i32 4} !17 = distinct !{!17, !18} !18 = !{!"llvm.loop.ivdep.safelen", i32 2} +!19 = distinct !{!19, !20} +!20 = !{!"llvm.loop.intel.pipelining.enable", i32 1} +!21 = distinct !{!21, !22} +!22 = !{!"llvm.loop.coalesce.count", i32 4} +!23 = distinct !{!23, !24} +!24 = !{!"llvm.loop.coalesce.enable"} +!25 = distinct !{!25, !26} +!26 = !{!"llvm.loop.max_interleaving.count", i32 
3} +!27 = distinct !{!27, !28} +!28 = !{!"llvm.loop.intel.speculated.iterations.count", i32 4} ; CHECK-LLVM: br label %while.cond, !llvm.loop ![[MD_A:[0-9]+]] ; CHECK-LLVM: br label %while.cond{{[0-9]+}}, !llvm.loop ![[MD_B:[0-9]+]] ; CHECK-LLVM: br label %while.cond{{[0-9]+}}, !llvm.loop ![[MD_C:[0-9]+]] ; CHECK-LLVM: br label %while.cond{{[0-9]+}}, !llvm.loop ![[MD_D:[0-9]+]] +; CHECK-LLVM: br label %for.cond{{[0-9]*}}, !llvm.loop ![[MD_E:[0-9]+]] +; CHECK-LLVM: br label %while.cond{{[0-9]*}}, !llvm.loop ![[MD_F:[0-9]+]] +; CHECK-LLVM: br label %while.cond{{[0-9]+}}, !llvm.loop ![[MD_G:[0-9]+]] +; CHECK-LLVM: br label %for.cond{{[0-9]*}}, !llvm.loop ![[MD_H:[0-9]+]] +; CHECK-LLVM: br label %for.cond{{[0-9]*}}, !llvm.loop ![[MD_I:[0-9]+]] ; CHECK-LLVM: ![[MD_A]] = distinct !{![[MD_A]], ![[MD_ivdep_enable:[0-9]+]]} ; CHECK-LLVM: ![[MD_ivdep_enable]] = !{!"llvm.loop.ivdep.enable"} @@ -272,3 +496,14 @@ attributes #4 = { nounwind } ; CHECK-LLVM: ![[MD_max_conc]] = !{!"llvm.loop.max_concurrency.count", i32 4} ; CHECK-LLVM: ![[MD_D]] = distinct !{![[MD_D]], ![[MD_ivdep:[0-9]+]]} ; CHECK-LLVM: ![[MD_ivdep]] = !{!"llvm.loop.ivdep.safelen", i32 2} +; CHECK-LLVM: ![[MD_E]] = distinct !{![[MD_E]], ![[MD_pipelining:[0-9]+]]} +; CHECK-LLVM: ![[MD_pipelining]] = !{!"llvm.loop.intel.pipelining.enable", i32 1} +; CHECK-LLVM: ![[MD_F]] = distinct !{![[MD_F]], ![[MD_loop_coalesce_count:[0-9]+]]} +; CHECK-LLVM: ![[MD_loop_coalesce_count]] = !{!"llvm.loop.coalesce.count", i32 4} +; CHECK-LLVM: ![[MD_G]] = distinct !{![[MD_G]], ![[MD_loop_coalesce:[0-9]+]]} +; CHECK-LLVM: ![[MD_loop_coalesce]] = !{![[MD_loop_coalesce_enable:[0-9]+]]} +; CHECK-LLVM: ![[MD_loop_coalesce_enable]] = !{!"llvm.loop.coalesce.enable"} +; CHECK-LLVM: ![[MD_H]] = distinct !{![[MD_H]], ![[MD_max_interleaving:[0-9]+]]} +; CHECK-LLVM: ![[MD_max_interleaving]] = !{!"llvm.loop.max_interleaving.count", i32 3} +; CHECK-LLVM: ![[MD_I]] = distinct !{![[MD_I]], ![[MD_spec_iterations:[0-9]+]]} +; CHECK-LLVM: ![[MD_spec_iterations]] = !{!"llvm.loop.intel.speculated.iterations.count", i32 4} diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 3bb037b803dea..a4326377a0c38 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -418,7 +418,7 @@ option(LLVM_ENABLE_EXPENSIVE_CHECKS "Enable expensive checks" OFF) # Enabling this flag makes it easier to find cases where the compiler makes # assumptions on the size being 'fixed size', when building tests for # SVE/SVE2 or other scalable vector architectures. -option(LLVM_ENABLE_STRICT_IMPLICIT_CONVERSION_TYPESIZE +option(LLVM_ENABLE_STRICT_FIXED_SIZE_VECTORS "Enable assertions that type is not scalable in implicit conversion from TypeSize to uint64_t" OFF) set(LLVM_ABI_BREAKING_CHECKS "WITH_ASSERTS" CACHE STRING @@ -1072,6 +1072,7 @@ if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY) add_dependencies(llvm-libraries ${lib}) if (NOT LLVM_ENABLE_IDE) add_dependencies(install-llvm-libraries install-${lib}) + add_dependencies(install-llvm-libraries-stripped install-${lib}-stripped) endif() endforeach() endif() diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 7b95d8be1b606..8cd71eef2332e 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -1029,6 +1029,13 @@ function(export_executable_symbols target) endif() endfunction() +# Export symbols if LLVM plugins are enabled. 
+function(export_executable_symbols_for_plugins target) + if(LLVM_ENABLE_PLUGINS) + export_executable_symbols(${target}) + endif() +endfunction() + if(NOT LLVM_TOOLCHAIN_TOOLS) set (LLVM_TOOLCHAIN_TOOLS llvm-ar diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake index 70ad34a41bde8..0c5f4e08aabaa 100644 --- a/llvm/cmake/modules/HandleLLVMOptions.cmake +++ b/llvm/cmake/modules/HandleLLVMOptions.cmake @@ -95,8 +95,8 @@ if(LLVM_ENABLE_EXPENSIVE_CHECKS) endif() endif() -if (LLVM_ENABLE_STRICT_IMPLICIT_CONVERSION_TYPESIZE) - add_definitions(-DSTRICT_IMPLICIT_CONVERSION_TYPESIZE) +if (LLVM_ENABLE_STRICT_FIXED_SIZE_VECTORS) + add_definitions(-DSTRICT_FIXED_SIZE_VECTORS) endif() string(TOUPPER "${LLVM_ABI_BREAKING_CHECKS}" uppercase_LLVM_ABI_BREAKING_CHECKS) @@ -1019,8 +1019,9 @@ if(macos_signposts_available) endif() endif() +set(LLVM_SOURCE_PREFIX "" CACHE STRING "Use prefix for sources") + option(LLVM_USE_RELATIVE_PATHS_IN_DEBUG_INFO "Use relative paths in debug info" OFF) -set(LLVM_SOURCE_PREFIX "" CACHE STRING "Use prefix for sources in debug info") if(LLVM_USE_RELATIVE_PATHS_IN_DEBUG_INFO) check_c_compiler_flag("-fdebug-prefix-map=foo=bar" SUPPORTS_FDEBUG_PREFIX_MAP) @@ -1034,3 +1035,18 @@ if(LLVM_USE_RELATIVE_PATHS_IN_DEBUG_INFO) append_if(SUPPORTS_FDEBUG_PREFIX_MAP "-fdebug-prefix-map=${source_root}/=${LLVM_SOURCE_PREFIX}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) add_flag_if_supported("-no-canonical-prefixes" NO_CANONICAL_PREFIXES) endif() + +option(LLVM_USE_RELATIVE_PATHS_IN_FILES "Use relative paths in sources and debug info" OFF) + +if(LLVM_USE_RELATIVE_PATHS_IN_FILES) + check_c_compiler_flag("-ffile-prefix-map=foo=bar" SUPPORTS_FFILE_PREFIX_MAP) + if(LLVM_ENABLE_PROJECTS_USED) + get_filename_component(source_root "${LLVM_MAIN_SRC_DIR}/.." ABSOLUTE) + else() + set(source_root "${LLVM_MAIN_SRC_DIR}") + endif() + file(RELATIVE_PATH relative_root "${source_root}" "${CMAKE_BINARY_DIR}") + append_if(SUPPORTS_FFILE_PREFIX_MAP "-ffile-prefix-map=${CMAKE_BINARY_DIR}=${relative_root}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + append_if(SUPPORTS_FFILE_PREFIX_MAP "-ffile-prefix-map=${source_root}/=${LLVM_SOURCE_PREFIX}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + add_flag_if_supported("-no-canonical-prefixes" NO_CANONICAL_PREFIXES) +endif() diff --git a/llvm/cmake/modules/LLVMExternalProjectUtils.cmake b/llvm/cmake/modules/LLVMExternalProjectUtils.cmake index a69a4720b8b0b..29f4fcbf8aba6 100644 --- a/llvm/cmake/modules/LLVMExternalProjectUtils.cmake +++ b/llvm/cmake/modules/LLVMExternalProjectUtils.cmake @@ -239,6 +239,7 @@ function(llvm_ExternalProject_Add name source_dir) -DLLVM_HOST_TRIPLE=${LLVM_HOST_TRIPLE} -DLLVM_HAVE_LINK_VERSION_SCRIPT=${LLVM_HAVE_LINK_VERSION_SCRIPT} -DLLVM_USE_RELATIVE_PATHS_IN_DEBUG_INFO=${LLVM_USE_RELATIVE_PATHS_IN_DEBUG_INFO} + -DLLVM_USE_RELATIVE_PATHS_IN_FILES=${LLVM_USE_RELATIVE_PATHS_IN_FILES} -DLLVM_SOURCE_PREFIX=${LLVM_SOURCE_PREFIX} -DPACKAGE_VERSION=${PACKAGE_VERSION} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 197765a3071b1..9e98dda9f9081 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -353,9 +353,9 @@ supported for the ``amdgcn`` target. (scratch), and group (LDS) memory depending on if the address is within one of the aperture ranges. Flat access to scratch requires hardware aperture setup and setup in the kernel prologue (see - :ref:`amdgpu-amdhsa-flat-scratch`). 
Flat access to LDS requires hardware - aperture setup and M0 (GFX7-GFX8) register setup (see - :ref:`amdgpu-amdhsa-m0`). + :ref:`amdgpu-amdhsa-kernel-prolog-flat-scratch`). Flat access to LDS requires + hardware aperture setup and M0 (GFX7-GFX8) register setup (see + :ref:`amdgpu-amdhsa-kernel-prolog-m0`). To convert between a private or group address space address (termed a segment address) and a flat address the base address of the corresponding aperture @@ -5954,7 +5954,7 @@ SGPR register initial state is defined in must be used to set up FLAT SCRATCH for flat addressing (see - :ref:`amdgpu-amdhsa-flat-scratch`). + :ref:`amdgpu-amdhsa-kernel-prolog-flat-scratch`). ========== ========================== ====== ============================== The order of the VGPR registers is defined, but the compiler can specify which @@ -6020,7 +6020,23 @@ following properties: Kernel Prolog ~~~~~~~~~~~~~ -.. _amdgpu-amdhsa-m0: +The compiler performs initialization in the kernel prologue depending on the +target and information about things like stack usage in the kernel and called +functions. Some of this initialization requires the compiler to request certain +User and System SGPRs be present in the +:ref:`amdgpu-amdhsa-initial-kernel-execution-state` via the +:ref:`amdgpu-amdhsa-kernel-descriptor`. + +.. _amdgpu-amdhsa-kernel-prolog-cfi: + +CFI ++++ + +1. The CFI return address is undefined. +2. The CFI CFA is defined using an expression which evaluates to a memory + location description for the private segment address ``0``. + +.. _amdgpu-amdhsa-kernel-prolog-m0: M0 ++ @@ -6035,15 +6051,35 @@ GFX9-GFX10 The M0 register is not used for range checking LDS accesses and so does not need to be initialized in the prolog. -.. _amdgpu-amdhsa-flat-scratch: +.. _amdgpu-amdhsa-kernel-prolog-stack-pointer: + +Stack Pointer ++++++++++++++ + +If the kernel has function calls it must set up the ABI stack pointer described +in :ref:`amdgpu-amdhsa-function-call-convention-non-kernel-functions` by +setting SGPR32 to the unswizzled scratch offset of the address past the +last local allocation. + +.. _amdgpu-amdhsa-kernel-prolog-frame-pointer: + +Frame Pointer ++++++++++++++ + +If the kernel needs a frame pointer for the reasons defined in +``SIFrameLowering`` then SGPR33 is used and is always set to ``0`` in the +kernel prolog. If a frame pointer is not required then all uses of the frame +pointer are replaced with immediate ``0`` offsets. + +.. _amdgpu-amdhsa-kernel-prolog-flat-scratch: Flat Scratch ++++++++++++ -If the kernel may use flat operations to access scratch memory, the prolog code -must set up FLAT_SCRATCH register pair (FLAT_SCRATCH_LO/FLAT_SCRATCH_HI which -are in SGPRn-4/SGPRn-3). Initialization uses Flat Scratch Init and Scratch -Wavefront Offset SGPR registers (see +If the kernel or any function it calls may use flat operations to access +scratch memory, the prolog code must set up the FLAT_SCRATCH register pair +(FLAT_SCRATCH_LO/FLAT_SCRATCH_HI which are in SGPRn-4/SGPRn-3). Initialization +uses Flat Scratch Init and Scratch Wavefront Offset SGPR registers (see +:ref:`amdgpu-amdhsa-initial-kernel-execution-state`): GFX6 @@ -6074,6 +6110,52 @@ GFX9-GFX10 FLAT_SCRATCH pair for use as the flat scratch base in flat memory instructions. +..
_amdgpu-amdhsa-kernel-prolog-private-segment-buffer: + +Private Segment Buffer +++++++++++++++++++++++ + +A set of four SGPRs beginning at a four-aligned SGPR index is always selected +to serve as the scratch V# for the kernel as follows: + + - If it is known during instruction selection that there is stack usage, + SGPR0-3 is reserved for use as the scratch V#. Stack usage is assumed if + optimisations are disabled (``-O0``), if stack objects already exist (for + locals, etc.), or if there are any function calls. + + - Otherwise, four high numbered SGPRs beginning at a four-aligned SGPR index + are reserved for the tentative scratch V#. These will be used if it is + determined that spilling is needed. + + - If no use is made of the tentative scratch V#, then it is unreserved + and the register count is determined ignoring it. + - If use is made of the tentative scratch V#, then its register numbers + are shifted to the first four-aligned SGPR index after the highest one + allocated by the register allocator, and all uses are updated. The + register count includes them in the shifted location. + - In either case, if the processor has the SGPR allocation bug, the + tentative allocation is not shifted or unreserved in order to ensure + the register count is higher to work around the bug. + + .. note:: + + This approach of using a tentative scratch V# and shifting the register + numbers if used avoids having to perform register allocation a second + time if the tentative V# is eliminated. This is more efficient and + avoids the problem that the second register allocation may perform + spilling which will fail as there is no longer a scratch V#. + +When the kernel prolog code is being emitted it is known whether the scratch V# +described above is actually used. If it is, the prolog code must set it up by +copying the Private Segment Buffer to the scratch V# registers and then adding +the Private Segment Wavefront Offset to the queue base address in the V#. The +result is a V# with a base address pointing to the beginning of the wavefront +scratch backing memory. + +The Private Segment Buffer is always requested, but the Private Segment +Wavefront Offset is only requested if it is used (see +:ref:`amdgpu-amdhsa-initial-kernel-execution-state`). + .. _amdgpu-amdhsa-memory-model: Memory Model @@ -8514,6 +8596,8 @@ Call Convention See :ref:`amdgpu-dwarf-address-space-mapping` for information on swizzled addresses. Unswizzled addresses are normal linear addresses. +.. _amdgpu-amdhsa-function-call-convention-kernel-functions: + Kernel Functions ++++++++++++++++ @@ -8537,44 +8621,10 @@ how the AMDGPU implements function calls: by-value struct? - What is ABI for lambda values? -2. The CFI return address is undefined. -3. If the kernel contains no calls then: - - - If using the ``amdhsa`` OS ABI (see :ref:`amdgpu-os-table`), and know - during ISel that there is stack usage SGPR0-3 is reserved for use as the - scratch SRD and SGPR33 reserved for the wave scratch offset. Stack usage - is assumed if ``-O0``, if already aware of stack objects for locals, etc., - or if there are any function calls. - - Otherwise, five high numbered SGPRs are reserved for the tentative scratch - SRD and wave scratch offset. These will be used if determine need to do - spilling. - - - If no use is made of the tentative scratch SRD or wave scratch offset, - then they are unreserved and the register count is determined ignoring - them.
- - If use is made of the tenatative scratch SRD or wave scratch offset, - then the register numbers used are shifted to be after the highest one - allocated by the register allocator, and all uses updated. The register - count will include them in the shifted location. Since register - allocation may introduce spills, this shifting allows them to be - eliminated without having to perform register allocation again. - - In either case, if the processor has the SGPR allocation bug, the - tentative allocation is not shifted or unreserved inorder to ensure the - register count is higher to workaround the bug. - -4. If the kernel contains function calls: - - - SP is set to the wave scratch offset. - - - Since SP is an unswizzled address relative to the queue scratch base, an - wave scratch offset is an unswizzle offset, this means that if SP is - used to access swizzled scratch memory, it will access the private - segment address 0. - - .. note:: +4. The kernel performs certain setup in its prolog, as described in + :ref:`amdgpu-amdhsa-kernel-prolog`. - This is planned to be changed to be the unswizzled base address of the - wavefront scratch backing memory. +.. _amdgpu-amdhsa-function-call-convention-non-kernel-functions: Non-Kernel Functions ++++++++++++++++++++ @@ -8582,26 +8632,23 @@ Non-Kernel Functions This section describes the call convention ABI for functions other than the outer kernel function. -If a kernel has function calls then scratch is always allocated and used for the -call stack which grows from low address to high address using the swizzled +If a kernel has function calls then scratch is always allocated and used for +the call stack which grows from low address to high address using the swizzled scratch address space. On entry to a function: -1. SGPR0-3 contain a V# with the following properties: - - * Base address of the queue scratch backing memory. - - .. note:: - - This is planned to be changed to be the unswizzled base address of the - wavefront scratch backing memory. +1. SGPR0-3 contain a V# with the following properties (see + :ref:`amdgpu-amdhsa-kernel-prolog-private-segment-buffer`): + * Base address pointing to the beginning of the wavefront scratch backing + memory. * Swizzled with dword element size and stride of wavefront size elements. 2. The FLAT_SCRATCH register pair is setup. See - :ref:`amdgpu-amdhsa-flat-scratch`. -3. GFX6-8: M0 register set to the size of LDS in bytes. + :ref:`amdgpu-amdhsa-kernel-prolog-flat-scratch`. +3. GFX6-8: M0 register set to the size of LDS in bytes. See + :ref:`amdgpu-amdhsa-kernel-prolog-m0`. 4. The EXEC register is set to the lanes active on entry to the function. 5. MODE register: *TBD* 6. VGPR0-31 and SGPR4-29 are used to pass function input arguments as described @@ -8609,14 +8656,21 @@ On entry to a function: 7. SGPR30-31 return address (RA). The code address that the function must return to when it completes. The value is undefined if the function is *no return*. -8. SGPR32 is used for the stack pointer (SP). It is an unswizzled - scratch offset relative to the beginning of the queue scratch backing - memory. +8. SGPR32 is used for the stack pointer (SP). It is an unswizzled scratch + offset relative to the beginning of the wavefront scratch backing memory. The unswizzled SP can be used with buffer instructions as an unswizzled SGPR offset with the scratch V# in SGPR0-3 to access the stack in a swizzled manner. 
+ The unswizzled SP value can be converted into the swizzled SP value by: + + | swizzled SP = unswizzled SP / wavefront size + + This may be used to obtain the private address space address of stack + objects and to convert this address to a flat address by adding the flat + scratch aperture base address. + The swizzled SP value is always 4 bytes aligned for the ``r600`` architecture and 16 byte aligned for the ``amdgcn`` architecture. @@ -8639,41 +8693,14 @@ On entry to a function: arguments after the last local allocation and adjust SGPR32 to the address after the last local allocation. - .. note:: - - The SP value is planned to be changed to be the unswizzled offset relative - to the wavefront scratch backing memory. - -9. SGPR33 wavefront scratch base offset. The unswizzled offset from the queue - scratch backing memory base to the base of the wavefront scratch backing - memory. - - It is used to convert the unswizzled SP value to swizzled address in the - private address space by: - - | private address = (unswizzled SP - wavefront scratch base offset) / - wavefront size - - This may be used to obtain the private address of stack objects and to - convert these address to a flat address by adding the flat scratch aperture - base address. - - .. note:: - - This is planned to be eliminated when SP is changed to be the unswizzled - offset relative to the wavefront scratch backing memory. The the - conversion simplifies to: - - | private address = unswizzled SP / wavefront size - -10. All other registers are unspecified. -11. Any necessary ``waitcnt`` has been performed to ensure memory is available +9. All other registers are unspecified. +10. Any necessary ``waitcnt`` has been performed to ensure memory is available to the function. On exit from a function: 1. VGPR0-31 and SGPR4-29 are used to pass function result arguments as - described below. Any registers used are considered clobbered registers, + described below. Any registers used are considered clobbered registers. 2. The following registers are preserved and have the same value as on entry: * FLAT_SCRATCH @@ -8870,9 +8897,9 @@ registers and some in memory. The following is not part of the AMDGPU function calling convention but describes how the AMDGPU implements function calls: -1. SGPR34 is used as a frame pointer (FP) if necessary. Like the SP it is an +1. SGPR33 is used as a frame pointer (FP) if necessary. Like the SP it is an unswizzled scratch address. It is only needed if runtime sized ``alloca`` - are used, or for the reasons defined in ``SiFrameLowering``. + are used, or for the reasons defined in ``SIFrameLowering``. 2. Runtime stack alignment is not currently supported. .. TODO:: @@ -8886,14 +8913,11 @@ describes how the AMDGPU implements function calls: ..note:: - Before CFI is generated, the call convention will be changed so that SP is - an unswizzled address relative to the wave scratch base. - CFI will be generated that defines the CFA as the unswizzled address relative to the wave scratch base in the unswizzled private address space of the lowest address stack allocated local variable. - ``DW_AT_frame_base`` will be defined as the swizelled address in the + ``DW_AT_frame_base`` will be defined as the swizzled address in the swizzled private address space by dividing the CFA by the wavefront size (since CFA is always at least dword aligned which matches the scratch swizzle element size). @@ -9953,4 +9977,4 @@ Additional Documentation .. [SEMVER] `Semantic Versioning `__ .. 
[OpenCL] `The OpenCL Specification Version 2.0 `__ .. [HRF] `Heterogeneous-race-free Memory Models `__ -.. [CLANG-ATTR] `Attributes in Clang `__ +.. [CLANG-ATTR] `Attributes in Clang `__ diff --git a/llvm/docs/AliasAnalysis.rst b/llvm/docs/AliasAnalysis.rst index 14decfeca6e72..23d374a42ddbd 100644 --- a/llvm/docs/AliasAnalysis.rst +++ b/llvm/docs/AliasAnalysis.rst @@ -19,7 +19,7 @@ indicating that two pointers always point to the same object, might point to the same object, or are known to never point to the same object. The LLVM `AliasAnalysis -`__ class is the +`__ class is the primary interface used by clients and implementations of alias analyses in the LLVM system. This class is the common interface between clients of alias analysis information and the implementations providing it, and is designed to @@ -36,7 +36,7 @@ points about what exactly results mean. ``AliasAnalysis`` Class Overview ================================ -The `AliasAnalysis `__ +The `AliasAnalysis `__ class defines the interface that the various alias analysis implementations should support. This class exports two important enums: ``AliasResult`` and ``ModRefResult`` which represent the result of an alias query or a mod/ref @@ -264,7 +264,7 @@ Interfaces which may be specified --------------------------------- All of the `AliasAnalysis -`__ virtual methods +`__ virtual methods default to providing :ref:`chaining ` to another alias analysis implementation, which ends up returning conservatively correct information (returning "May" Alias and "Mod/Ref" for alias and mod/ref queries @@ -435,7 +435,7 @@ Using the ``AliasSetTracker`` class Many transformations need information about alias **sets** that are active in some scope, rather than information about pairwise aliasing. The -`AliasSetTracker `__ +`AliasSetTracker `__ class is used to efficiently build these Alias Sets from the pairwise alias analysis information provided by the ``AliasAnalysis`` interface. diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 70c81adfee3a1..32d2ebdfc2c2e 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -601,7 +601,7 @@ LLVM-specific variables **LLVM_BUILD_INSTRUMENTED_COVERAGE**:BOOL If enabled, `source-based code coverage - `_ instrumentation + `_ instrumentation is enabled while building llvm. **LLVM_CCACHE_BUILD**:BOOL @@ -631,6 +631,14 @@ LLVM-specific variables If enabled, the Z3 constraint solver is activated for the Clang static analyzer. A recent version of the z3 library needs to be available on the system. +**LLVM_USE_RELATIVE_PATHS_IN_DEBUG_INFO**:BOOL + Rewrite absolute source paths in debug info to relative ones. The source prefix + can be adjusted via the LLVM_SOURCE_PREFIX variable. + +**LLVM_USE_RELATIVE_PATHS_IN_FILES**:BOOL + Rewrite absolute source paths in sources and debug info to relative ones. The + source prefix can be adjusted via the LLVM_SOURCE_PREFIX variable. + CMake Caches ============ diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst index 4aa9c1c7c49dd..2f638daf2ae3f 100644 --- a/llvm/docs/CommandGuide/dsymutil.rst +++ b/llvm/docs/CommandGuide/dsymutil.rst @@ -71,6 +71,12 @@ OPTIONS Specifies a ``path`` to prepend to all debug symbol object file paths. +.. option:: --object-prefix-map= + + Remap object file paths (but no source paths) before processing. Use + this for Clang objects where the module cache location was remapped using + ``-fdebug-prefix-map``; to help dsymutil find the Clang module cache. + .. 
option:: --papertrail When running dsymutil as part of your build system, it can be desirable for diff --git a/llvm/docs/CommandGuide/lit.rst b/llvm/docs/CommandGuide/lit.rst index ebc0bf2c27fdc..63518fb20adcb 100644 --- a/llvm/docs/CommandGuide/lit.rst +++ b/llvm/docs/CommandGuide/lit.rst @@ -251,12 +251,17 @@ convenient and flexible support for out-of-tree builds. TEST STATUS RESULTS ------------------- -Each test ultimately produces one of the following six results: +Each test ultimately produces one of the following eight results: **PASS** The test succeeded. +**FLAKYPASS** + + The test succeeded after being re-run more than once. This only applies to + tests containing an ``ALLOW_RETRIES:`` annotation. + **XFAIL** The test failed, but that is expected. This is used for test formats which allow @@ -283,6 +288,11 @@ Each test ultimately produces one of the following six results: The test is not supported in this environment. This is used by test formats which can report unsupported tests. +**TIMEOUT** + + The test was run, but it timed out before it was able to complete. This is + considered a failure. + Depending on the test format tests may produce additional information about their status (generally only for failures). See the :ref:`output-options` section for more information. diff --git a/llvm/docs/CommandGuide/llvm-lipo.rst b/llvm/docs/CommandGuide/llvm-lipo.rst index 7e661153a6508..20b2984fc9b2c 100644 --- a/llvm/docs/CommandGuide/llvm-lipo.rst +++ b/llvm/docs/CommandGuide/llvm-lipo.rst @@ -70,4 +70,4 @@ COMMANDS BUGS ---- -To report bugs, please visit . +To report bugs, please visit . diff --git a/llvm/docs/CommandGuide/llvm-objcopy.rst b/llvm/docs/CommandGuide/llvm-objcopy.rst index 63d4af342f79b..14e08d7641d54 100644 --- a/llvm/docs/CommandGuide/llvm-objcopy.rst +++ b/llvm/docs/CommandGuide/llvm-objcopy.rst @@ -545,7 +545,7 @@ Otherwise, it exits with code 0. BUGS ---- -To report bugs, please visit . +To report bugs, please visit . There is a known issue with :option:`--input-target` and :option:`--target` causing only ``binary`` and ``ihex`` formats to have any effect. Other values diff --git a/llvm/docs/CommandGuide/llvm-objdump.rst b/llvm/docs/CommandGuide/llvm-objdump.rst index 2730374d58103..04c76491101d7 100644 --- a/llvm/docs/CommandGuide/llvm-objdump.rst +++ b/llvm/docs/CommandGuide/llvm-objdump.rst @@ -324,7 +324,7 @@ MACH-O ONLY OPTIONS AND COMMANDS BUGS ---- -To report bugs, please visit . +To report bugs, please visit . SEE ALSO -------- diff --git a/llvm/docs/CommandGuide/llvm-size.rst b/llvm/docs/CommandGuide/llvm-size.rst index 08426db4a3286..b229bc63d40aa 100644 --- a/llvm/docs/CommandGuide/llvm-size.rst +++ b/llvm/docs/CommandGuide/llvm-size.rst @@ -195,4 +195,4 @@ Otherwise, it exits with code 0. BUGS ---- -To report bugs, please visit . +To report bugs, please visit . diff --git a/llvm/docs/CommandGuide/llvm-strings.rst b/llvm/docs/CommandGuide/llvm-strings.rst index f2d04c4190b85..e2fe4cb88c9ae 100644 --- a/llvm/docs/CommandGuide/llvm-strings.rst +++ b/llvm/docs/CommandGuide/llvm-strings.rst @@ -127,4 +127,4 @@ Otherwise, it exits with code 0. BUGS ---- -To report bugs, please visit . +To report bugs, please visit . diff --git a/llvm/docs/CommandGuide/llvm-strip.rst b/llvm/docs/CommandGuide/llvm-strip.rst index aa75154e90d07..455dc07e9c5cb 100644 --- a/llvm/docs/CommandGuide/llvm-strip.rst +++ b/llvm/docs/CommandGuide/llvm-strip.rst @@ -190,7 +190,7 @@ Otherwise, it exits with code 0. BUGS ---- -To report bugs, please visit . +To report bugs, please visit . 
SEE ALSO -------- diff --git a/llvm/docs/CompileCudaWithLLVM.rst b/llvm/docs/CompileCudaWithLLVM.rst index d85cac77982de..a2d7fd0b7453a 100644 --- a/llvm/docs/CompileCudaWithLLVM.rst +++ b/llvm/docs/CompileCudaWithLLVM.rst @@ -30,7 +30,7 @@ Before you build CUDA code, you'll need to have installed the CUDA SDK. See `NVIDIA's CUDA installation guide `_ for details. Note that clang `maynot support -`_ the CUDA toolkit as installed by +`_ the CUDA toolkit as installed by some Linux package managers. Clang does attempt to deal with specific details of CUDA installation on a handful of common Linux distributions, but in general the most reliable way to make it work is to install CUDA in a single directory from @@ -342,7 +342,7 @@ HD functions cannot be overloaded by H or D functions with the same signature: When resolving an overloaded function, clang considers the host/device attributes of the caller and callee. These are used as a tiebreaker during overload resolution. See `IdentifyCUDAPreference -`_ for the full set of rules, +`_ for the full set of rules, but at a high level they are: * D functions prefer to call other Ds. HDs are given lower priority. @@ -507,12 +507,12 @@ LLVM to make it generate good GPU code. Among these changes are: reduce redundancy within straight-line code. * `Aggressive speculative execution - `_ + `_ -- This is mainly for promoting straight-line scalar optimizations, which are most effective on code along dominator paths. * `Memory space inference - `_ -- + `_ -- In PTX, we can operate on pointers that are in a particular "address space" (global, shared, constant, or local), or we can operate on pointers in the "generic" address space, which can point to anything. Operations in a @@ -521,7 +521,7 @@ LLVM to make it generate good GPU code. Among these changes are: possible. * `Bypassing 64-bit divides - `_ -- + `_ -- This was an existing optimization that we enabled for the PTX backend. 64-bit integer divides are much slower than 32-bit ones on NVIDIA GPUs. @@ -536,7 +536,7 @@ LLVM to make it generate good GPU code. Among these changes are: SROA, which sometimes speed up code by over 10x. (Programmers can force unrolling and inline using clang's `loop unrolling pragmas - `_ + `_ and ``__attribute__((always_inline))``.) Publication @@ -558,4 +558,4 @@ Obtaining Help ============== To obtain help on LLVM in general and its CUDA support, see `the LLVM -community `_. +community `_. diff --git a/llvm/docs/Docker.rst b/llvm/docs/Docker.rst index 5a42cbe698da6..5a61ff988b05a 100644 --- a/llvm/docs/Docker.rst +++ b/llvm/docs/Docker.rst @@ -51,7 +51,7 @@ Overview The ``llvm/utils/docker`` folder contains Dockerfiles and simple bash scripts to serve as a basis for anyone who wants to create their own Docker image with LLVM components, compiled from sources. The sources are checked out from the -upstream svn repository when building the image. +upstream git repository when building the image. The resulting image contains only the requested LLVM components and a few extra packages to make the image minimally useful for C++ development, e.g. libstdc++ @@ -68,7 +68,7 @@ Usage ===== The ``llvm/utils/build_docker_image.sh`` script provides a rather high degree of control on how to run the build. It allows you to specify the projects to -checkout from svn and provide a list of CMake arguments to use during when +checkout from git and provide a list of CMake arguments to use during when building LLVM inside docker container. 
Here's a very simple example of getting a docker image with clang binary, diff --git a/llvm/docs/FAQ.rst b/llvm/docs/FAQ.rst index 6ce6051e1f653..aef15d6dc711d 100644 --- a/llvm/docs/FAQ.rst +++ b/llvm/docs/FAQ.rst @@ -12,8 +12,8 @@ License Can I modify LLVM source code and redistribute the modified source? ------------------------------------------------------------------- Yes. The modified source distribution must retain the copyright notice and -follow the conditions listed in the `LLVM license -`_. +follow the conditions listed in the `Apache License v2.0 with LLVM Exceptions +`_. Can I modify the LLVM source code and redistribute binaries or other tools based on it, without redistributing the source? @@ -72,9 +72,9 @@ What source languages are supported? ------------------------------------ LLVM currently has full support for C and C++ source languages through -`Clang `_. Many other language frontends have +`Clang `_. Many other language frontends have been written using LLVM, and an incomplete list is available at -`projects with LLVM `_. +`projects with LLVM `_. I'd like to write a self-hosting LLVM compiler. How should I interface with the LLVM middle-end optimizers and back-end code generators? diff --git a/llvm/docs/Frontend/PerformanceTips.rst b/llvm/docs/Frontend/PerformanceTips.rst index 3c290964723b3..f9e23fdbf8852 100644 --- a/llvm/docs/Frontend/PerformanceTips.rst +++ b/llvm/docs/Frontend/PerformanceTips.rst @@ -27,7 +27,7 @@ can often be useful to write a quick C program with the semantics you're trying to model and see what decisions Clang's IRGen makes about what IR to emit. Studying Clang's CodeGen directory can also be a good source of ideas. Note that Clang and LLVM are explicitly version locked so you'll need to make sure -you're using a Clang built from the same svn revision or release as the LLVM +you're using a Clang built from the same git revision or release as the LLVM library you're using. As always, it's *strongly* recommended that you track tip of tree development, particularly during bring up of a new project. diff --git a/llvm/docs/GettingStarted.rst b/llvm/docs/GettingStarted.rst index a3014064b80d5..0ede510edb3a2 100644 --- a/llvm/docs/GettingStarted.rst +++ b/llvm/docs/GettingStarted.rst @@ -16,7 +16,7 @@ files needed to process intermediate representations and converts it into object files. Tools include an assembler, disassembler, bitcode analyzer, and bitcode optimizer. It also contains basic regression tests. -C-like languages use the `Clang `_ front end. This +C-like languages use the `Clang `_ front end. This component compiles C, C++, Objective C, and Objective C++ code into LLVM bitcode -- and from there into object files, using LLVM. @@ -28,7 +28,7 @@ Getting the Source Code and Building LLVM ========================================= The LLVM Getting Started documentation may be out of date. The `Clang -Getting Started `_ page might have more +Getting Started `_ page might have more accurate information. 
This is an example workflow and configuration to get and build the LLVM source: @@ -522,7 +522,7 @@ you need to check the code out of SVN rather than git for some reason, you can do it like so: * ``cd where-you-want-llvm-to-live`` -* Read-Only: ``svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm`` +* Read-Only: ``svn co https://llvm.org/svn/llvm-project/llvm/trunk llvm`` * Read-Write: ``svn co https://user@llvm.org/svn/llvm-project/llvm/trunk llvm`` This will create an '``llvm``' directory in the current directory and fully @@ -722,7 +722,7 @@ Note: There are some additional flags that need to be passed when building for iOS due to limitations in the iOS SDK. Check :doc:`HowToCrossCompileLLVM` and `Clang docs on how to cross-compile in general -`_ for more information +`_ for more information about cross-compiling. The Location of LLVM Object Files @@ -789,7 +789,7 @@ Directory Layout One useful source of information about the LLVM source base is the LLVM `doxygen `_ documentation available at -``_. The following is a brief introduction to code +``_. The following is a brief introduction to code layout: ``llvm/examples`` @@ -1105,8 +1105,8 @@ things... there are many more interesting and complicated things that you can do that aren't documented here (but we'll gladly accept a patch if you want to write something up!). For more information about LLVM, check out: -* `LLVM Homepage `_ -* `LLVM Doxygen Tree `_ -* `Starting a Project that Uses LLVM `_ +* `LLVM Homepage `_ +* `LLVM Doxygen Tree `_ +* `Starting a Project that Uses LLVM `_ .. _installing arcanist: https://secure.phabricator.com/book/phabricator/article/arcanist_quick_start/ diff --git a/llvm/docs/GettingStartedVS.rst b/llvm/docs/GettingStartedVS.rst index 7507f97bac830..84d0ecf4d8fa9 100644 --- a/llvm/docs/GettingStartedVS.rst +++ b/llvm/docs/GettingStartedVS.rst @@ -18,7 +18,7 @@ to use LLVM. It contains an assembler, disassembler, bitcode analyzer and bitcode optimizer. It also contains basic regression tests that can be used to test the LLVM tools and the Clang front end. -The second piece is the `Clang `_ front end. This +The second piece is the `Clang `_ front end. This component compiles C, C++, Objective C, and Objective C++ code into LLVM bitcode. Clang typically uses LLVM libraries to optimize the bitcode and emit machine code. LLVM fully supports the COFF object file format, which is @@ -74,15 +74,12 @@ Here's the short story for getting up and running quickly with LLVM: (*or use WinZip*) 3. ``cd llvm`` - * With anonymous Subversion access: + * With git access: - *Note:* some regression tests require Unix-style line ending (``\n``). To - pass all regression tests, please add two lines *enable-auto-props = yes* - and *\* = svn:mime-type=application/octet-stream* to - ``C:\Users\\AppData\Roaming\Subversion\config``. + *Note:* some regression tests require Unix-style line ending (``\n``). 1. ``cd `` - 2. ``svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm`` + 2. ``git clone https://github.com/llvm/llvm-project.git llvm`` 3. ``cd llvm`` 5. Use `CMake `_ to generate up-to-date project files: @@ -103,7 +100,7 @@ Here's the short story for getting up and running quickly with LLVM: * See the :doc:`LLVM CMake guide ` for detailed information about how to configure the LLVM build. * CMake generates project files for all build types. 
To select a specific - build type, use the Configuration manager from the VS IDE or the + build type, use the Configuration manager from the VS IDE or the ``/property:Configuration`` command line option when using MSBuild. * By default, the Visual Studio project files generated by CMake use the 32-bit toolset. If you are developing on a 64-bit version of Windows and @@ -236,6 +233,6 @@ things... there are many more interesting and complicated things that you can do that aren't documented here (but we'll gladly accept a patch if you want to write something up!). For more information about LLVM, check out: -* `LLVM homepage `_ -* `LLVM doxygen tree `_ +* `LLVM homepage `_ +* `LLVM doxygen tree `_ diff --git a/llvm/docs/GlobalISel/GMIR.rst b/llvm/docs/GlobalISel/GMIR.rst index 52f3864147648..ca0d606922241 100644 --- a/llvm/docs/GlobalISel/GMIR.rst +++ b/llvm/docs/GlobalISel/GMIR.rst @@ -76,7 +76,7 @@ generic and non-generic). There are some exceptions to this but in general: way of getting a given operand's type (as there was no 1:1 mapping between instruction types and operands). We considered putting the type in some variant of MCInstrDesc instead: - See `PR26576 `_: [GlobalISel] Generic MachineInstrs + See `PR26576 `_: [GlobalISel] Generic MachineInstrs need a type but this increases the memory footprint of the related objects .. _gmir-regbank: diff --git a/llvm/docs/GlobalISel/GenericOpcode.rst b/llvm/docs/GlobalISel/GenericOpcode.rst index 3c083a523a612..3110e1fa8398c 100644 --- a/llvm/docs/GlobalISel/GenericOpcode.rst +++ b/llvm/docs/GlobalISel/GenericOpcode.rst @@ -243,6 +243,15 @@ These each perform their respective integer arithmetic on a scalar. %2:_(s32) = G_ADD %0:_(s32), %1:_(s32) +G_SADDSAT, G_UADDSAT, G_SSUBSAT, G_USUBSAT +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Signed and unsigned addition and subtraction with saturation. + +.. code-block:: none + + %2:_(s32) = G_SADDSAT %0:_(s32), %1:_(s32) + G_SHL, G_LSHR, G_ASHR ^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/docs/GlobalISel/IRTranslator.rst b/llvm/docs/GlobalISel/IRTranslator.rst index a4d9bdad20156..712fe95a82925 100644 --- a/llvm/docs/GlobalISel/IRTranslator.rst +++ b/llvm/docs/GlobalISel/IRTranslator.rst @@ -73,7 +73,7 @@ This differs from SelectionDAG's multiple vregs via ``GetValueVTs``. As some of the bits are undef (padding), we should consider augmenting the representation with additional metadata (in effect, caching computeKnownBits information on vregs). -See `PR26161 `_: [GlobalISel] Value to vreg during +See `PR26161 `_: [GlobalISel] Value to vreg during IR to MachineInstr translation for aggregate type .. _irtranslator-constants: diff --git a/llvm/docs/GlobalISel/KnownBits.rst b/llvm/docs/GlobalISel/KnownBits.rst index 49989f9c9c69f..7e628722d5323 100644 --- a/llvm/docs/GlobalISel/KnownBits.rst +++ b/llvm/docs/GlobalISel/KnownBits.rst @@ -97,4 +97,4 @@ Then it's just a matter of fetching the analysis and using it: } There are many more API's beyond ``getKnownBits()``. See the `API reference -`_ for more information +`_ for more information diff --git a/llvm/docs/HistoricalNotes/2007-OriginalClangReadme.txt b/llvm/docs/HistoricalNotes/2007-OriginalClangReadme.txt index 611dc9d2c01c1..1759ad1e1f9ed 100644 --- a/llvm/docs/HistoricalNotes/2007-OriginalClangReadme.txt +++ b/llvm/docs/HistoricalNotes/2007-OriginalClangReadme.txt @@ -125,7 +125,7 @@ II. Usage of clang driver: invoking Graphviz. 
For more information on getting Graphviz to work with clang/LLVM, - see: http://llvm.org/docs/ProgrammersManual.html#ViewGraph + see: https://llvm.org/docs/ProgrammersManual.html#ViewGraph III. Current advantages over GCC: diff --git a/llvm/docs/HowToCrossCompileLLVM.rst b/llvm/docs/HowToCrossCompileLLVM.rst index e71c0b07a7a0e..d2dc7bf60e5cb 100644 --- a/llvm/docs/HowToCrossCompileLLVM.rst +++ b/llvm/docs/HowToCrossCompileLLVM.rst @@ -9,7 +9,7 @@ This document contains information about building LLVM and Clang on host machine, targeting another platform. For more information on how to use Clang as a cross-compiler, -please check http://clang.llvm.org/docs/CrossCompilation.html. +please check https://clang.llvm.org/docs/CrossCompilation.html. TODO: Add MIPS and other platforms to this document. @@ -189,7 +189,7 @@ identification), like: If you copy that tarball to your target board, you'll be able to use it for running the test-suite, for example. Follow the guidelines at -http://llvm.org/docs/lnt/quickstart.html, unpack the tarball in the +https://llvm.org/docs/lnt/quickstart.html, unpack the tarball in the test directory, and use options: .. code-block:: bash diff --git a/llvm/docs/HowToSetUpLLVMStyleRTTI.rst b/llvm/docs/HowToSetUpLLVMStyleRTTI.rst index 3892994859091..222ea890322ef 100644 --- a/llvm/docs/HowToSetUpLLVMStyleRTTI.rst +++ b/llvm/docs/HowToSetUpLLVMStyleRTTI.rst @@ -380,8 +380,8 @@ contract, you can tweak and optimize it as much as you want. For example, LLVM-style RTTI can work fine in the presence of multiple-inheritance by defining an appropriate ``classof``. An example of this in practice is -`Decl `_ vs. -`DeclContext `_ +`Decl `_ vs. +`DeclContext `_ inside Clang. The ``Decl`` hierarchy is done very similarly to the example setup demonstrated in this tutorial. @@ -396,7 +396,7 @@ returning true for ones that are known to be ``DeclContext``'s. Touch on some of the more advanced features, like ``isa_impl`` and ``simplify_type``. However, those two need reference documentation in the form of doxygen comments as well. We need the doxygen so that we can - say "for full details, see http://llvm.org/doxygen/..." + say "for full details, see https://llvm.org/doxygen/..." Rules of Thumb ============== diff --git a/llvm/docs/HowToSubmitABug.rst b/llvm/docs/HowToSubmitABug.rst index d276ee8681f38..ac28f290bbdd1 100644 --- a/llvm/docs/HowToSubmitABug.rst +++ b/llvm/docs/HowToSubmitABug.rst @@ -26,7 +26,7 @@ contain the following information: * All information necessary to reproduce the problem. * The reduced test-case that triggers the bug. -* The location where you obtained LLVM (if not from our Subversion +* The location where you obtained LLVM (if not from our Git repository). Thanks for helping us make LLVM better! diff --git a/llvm/docs/LLVMBuild.txt b/llvm/docs/LLVMBuild.txt index 00d82f664c16e..74422db0494fd 100644 --- a/llvm/docs/LLVMBuild.txt +++ b/llvm/docs/LLVMBuild.txt @@ -10,7 +10,7 @@ ; ; For more information on the LLVMBuild system, please see: ; -; http://llvm.org/docs/LLVMBuild.html +; https://llvm.org/docs/LLVMBuild.html ; ;===------------------------------------------------------------------------===; diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 3d1f9d3883199..4f3640443c4ba 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -1692,7 +1692,7 @@ example: functions. ``safestack`` This attribute indicates that - `SafeStack `_ + `SafeStack `_ protection is enabled for this function. 
If a function that has a ``safestack`` attribute is inlined into a @@ -3478,9 +3478,12 @@ the ``nsw`` flag. Poison value behavior is defined in terms of value *dependence*: -- Values other than :ref:`phi ` nodes depend on their operands. +- Values other than :ref:`phi ` nodes and :ref:`select ` + instructions depend on their operands. - :ref:`Phi ` nodes depend on the operand corresponding to their dynamic predecessor basic block. +- Select instructions depend on their condition operand and their + selected operand. - Function arguments depend on the corresponding actual argument values in the dynamic callers of their functions. - :ref:`Call ` instructions depend on the :ref:`ret ` @@ -6687,7 +6690,7 @@ TypeIdInfo ^^^^^^^^^^ The optional ``TypeIdInfo`` field, used for -`Control Flow Integrity `_, +`Control Flow Integrity `_, looks like: .. code-block:: text @@ -6764,7 +6767,7 @@ Type ID Summary Entry Each type id summary entry corresponds to a type identifier resolution which is generated during the LTO link portion of the compile when building -with `Control Flow Integrity `_, +with `Control Flow Integrity `_, so these are only present in a combined summary index. Example: @@ -7724,6 +7727,8 @@ Example: = fadd float 4.0, %var ; yields float:result = 4.0 + %var +.. _i_sub: + '``sub``' Instruction ^^^^^^^^^^^^^^^^^^^^^ @@ -7819,6 +7824,8 @@ Example: = fsub float 4.0, %var ; yields float:result = 4.0 - %var = fsub float -0.0, %val ; yields float:result = -%var +.. _i_mul: + '``mul``' Instruction ^^^^^^^^^^^^^^^^^^^^^ @@ -7913,6 +7920,8 @@ Example: = fmul float 4.0, %var ; yields float:result = 4.0 * %var +.. _i_udiv: + '``udiv``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -7959,6 +7968,8 @@ Example: = udiv i32 4, %var ; yields i32:result = 4 / %var +.. _i_sdiv: + '``sdiv``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -8047,6 +8058,8 @@ Example: = fdiv float 4.0, %var ; yields float:result = 4.0 / %var +.. _i_urem: + '``urem``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -8091,6 +8104,8 @@ Example: = urem i32 4, %var ; yields i32:result = 4 % %var +.. _i_srem: + '``srem``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -8204,6 +8219,8 @@ commonly be strength reduced from other instructions. They require two operands of the same type, execute an operation on them, and produce a single value. The resulting value is the same type as its operands. +.. _i_shl: + '``shl``' Instruction ^^^^^^^^^^^^^^^^^^^^^ @@ -8256,6 +8273,9 @@ Example: = shl i32 1, 32 ; undefined = shl <2 x i32> < i32 1, i32 1>, < i32 1, i32 2> ; yields: result=<2 x i32> < i32 2, i32 4> +.. _i_lshr: + + '``lshr``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -8305,6 +8325,8 @@ Example: = lshr i32 1, 32 ; undefined = lshr <2 x i32> < i32 -2, i32 4>, < i32 1, i32 2> ; yields: result=<2 x i32> < i32 0x7FFFFFFF, i32 1> +.. _i_ashr: + '``ashr``' Instruction ^^^^^^^^^^^^^^^^^^^^^^ @@ -8355,6 +8377,8 @@ Example: = ashr i32 1, 32 ; undefined = ashr <2 x i32> < i32 -2, i32 4>, < i32 1, i32 3> ; yields: result=<2 x i32> < i32 -1, i32 0> +.. _i_and: + '``and``' Instruction ^^^^^^^^^^^^^^^^^^^^^ @@ -8404,6 +8428,8 @@ Example: = and i32 15, 40 ; yields i32:result = 8 = and i32 4, 8 ; yields i32:result = 0 +.. _i_or: + '``or``' Instruction ^^^^^^^^^^^^^^^^^^^^ @@ -8453,6 +8479,8 @@ Example: = or i32 15, 40 ; yields i32:result = 47 = or i32 4, 8 ; yields i32:result = 12 +.. _i_xor: + '``xor``' Instruction ^^^^^^^^^^^^^^^^^^^^^ @@ -15256,6 +15284,678 @@ intrinsic returns the executable address corresponding to ``tramp`` after performing the required machine specific adjustments. 
The pointer returned can then be :ref:`bitcast and executed `. + +.. _int_vp: + +Vector Predication Intrinsics +----------------------------- +VP intrinsics are intended for predicated SIMD/vector code. A typical VP +operation takes a vector mask and an explicit vector length parameter as in: + +:: + + llvm.vp..*( %x, %y, %mask, i32 %evl) + +The vector mask parameter (%mask) always has a vector of `i1` type, for example +`<32 x i1>`. The explicit vector length parameter always has the type `i32` and +is an unsigned integer value. The explicit vector length parameter (%evl) is in +the range: + +:: + + 0 <= %evl <= W, where W is the number of vector elements + +Note that for :ref:`scalable vector types ` ``W`` is the runtime +length of the vector. + +The VP intrinsic has undefined behavior if ``%evl > W``. The explicit vector +length (%evl) creates a mask, %EVLmask, with all elements ``0 <= i < %evl`` set +to True, and all other lanes ``%evl <= i < W`` to False. A new mask %M is +calculated with an element-wise AND from %mask and %EVLmask: + +:: + + M = %mask AND %EVLmask + +A vector operation ```` on vectors ``A`` and ``B`` calculates: + +:: + + A B = { A[i] B[i] M[i] = True, and + { undef otherwise + +Optimization Hint +^^^^^^^^^^^^^^^^^ + +Some targets, such as AVX512, do not support the %evl parameter in hardware. +The use of an effective %evl is discouraged for those targets. The function +``TargetTransformInfo::hasActiveVectorLength()`` returns true when the target +has native support for %evl. + + +.. _int_vp_add: + +'``llvm.vp.add.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.add.v16i32 (<16 x i32> , <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.add.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.add.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated integer addition of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.add``' intrinsic performs integer addition (:ref:`add `) +of the first and second vector operand on each enabled lane. The result on +disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = add <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + +.. _int_vp_sub: + +'``llvm.vp.sub.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.sub.v16i32 (<16 x i32> , <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.sub.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.sub.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated integer subtraction of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. 
+ +Semantics: +"""""""""" + +The '``llvm.vp.sub``' intrinsic performs integer subtraction +(:ref:`sub `) of the first and second vector operand on each enabled +lane. The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = sub <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + + +.. _int_vp_mul: + +'``llvm.vp.mul.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.mul.v16i32 (<16 x i32> , <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.mul.nxv46i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.mul.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated integer multiplication of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" +The '``llvm.vp.mul``' intrinsic performs integer multiplication +(:ref:`mul `) of the first and second vector operand on each enabled +lane. The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = mul <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_sdiv: + +'``llvm.vp.sdiv.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.sdiv.v16i32 (<16 x i32> , <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.sdiv.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.sdiv.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated, signed division of two vectors of integers. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.sdiv``' intrinsic performs signed division (:ref:`sdiv `) +of the first and second vector operand on each enabled lane. The result on +disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = sdiv <4 x i32> %a, %b + %also.r = select <4 x ii> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_udiv: + +'``llvm.vp.udiv.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.udiv.v16i32 (<16 x i32> , <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.udiv.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.udiv.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated, unsigned division of two vectors of integers. 
+ + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The third operand is the vector mask and has the same number of elements as the result vector type. The fourth operand is the explicit vector length of the operation. + +Semantics: +"""""""""" + +The '``llvm.vp.udiv``' intrinsic performs unsigned division +(:ref:`udiv `) of the first and second vector operand on each enabled +lane. The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = udiv <4 x i32> %a, %b + %also.r = select <4 x ii> %mask, <4 x i32> %t, <4 x i32> undef + + + +.. _int_vp_srem: + +'``llvm.vp.srem.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.srem.v16i32 (<16 x i32> , <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.srem.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.srem.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated computations of the signed remainder of two integer vectors. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.srem``' intrinsic computes the remainder of the signed division +(:ref:`srem `) of the first and second vector operand on each enabled +lane. The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = srem <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + + +.. _int_vp_urem: + +'``llvm.vp.urem.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.urem.v16i32 (<16 x i32> , <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.urem.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.urem.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Predicated computation of the unsigned remainder of two integer vectors. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.urem``' intrinsic computes the remainder of the unsigned division +(:ref:`urem `) of the first and second vector operand on each enabled +lane. The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = urem <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_ashr: + +'``llvm.vp.ashr.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. 
+ +:: + + declare <16 x i32> @llvm.vp.ashr.v16i32 (<16 x i32> , <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.ashr.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.ashr.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Vector-predicated arithmetic right-shift. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.ashr``' intrinsic computes the arithmetic right shift +(:ref:`ashr `) of the first operand by the second operand on each +enabled lane. The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = ashr <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_lshr: + + +'``llvm.vp.lshr.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.lshr.v16i32 (<16 x i32> , <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.lshr.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.lshr.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Vector-predicated logical right-shift. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.lshr``' intrinsic computes the logical right shift +(:ref:`lshr `) of the first operand by the second operand on each +enabled lane. The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = lshr <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_shl: + +'``llvm.vp.shl.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.shl.v16i32 (<16 x i32> , <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.shl.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.shl.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Vector-predicated left shift. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.shl``' intrinsic computes the left shift (:ref:`shl `) of +the first operand by the second operand on each enabled lane. The result on +disabled lanes is undefined. + +Examples: +""""""""" + +.. 
code-block:: llvm + + %r = call <4 x i32> @llvm.vp.shl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = shl <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_or: + +'``llvm.vp.or.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.or.v16i32 (<16 x i32> , <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.or.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.or.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Vector-predicated or. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.or``' intrinsic performs a bitwise or (:ref:`or `) of the +first two operands on each enabled lane. The result on disabled lanes is +undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = or <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_and: + +'``llvm.vp.and.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.and.v16i32 (<16 x i32> , <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.and.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.and.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Vector-predicated and. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.and``' intrinsic performs a bitwise and (:ref:`and `) of +the first two operands on each enabled lane. The result on disabled lanes is +undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = and <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + +.. _int_vp_xor: + +'``llvm.vp.xor.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <16 x i32> @llvm.vp.xor.v16i32 (<16 x i32> , <16 x i32> , <16 x i1> , i32 ) + declare @llvm.vp.xor.nxv4i32 ( , , , i32 ) + declare <256 x i64> @llvm.vp.xor.v256i64 (<256 x i64> , <256 x i64> , <256 x i1> , i32 ) + +Overview: +""""""""" + +Vector-predicated, bitwise xor. + + +Arguments: +"""""""""" + +The first two operands and the result have the same vector of integer type. The +third operand is the vector mask and has the same number of elements as the +result vector type. The fourth operand is the explicit vector length of the +operation. + +Semantics: +"""""""""" + +The '``llvm.vp.xor``' intrinsic performs a bitwise xor (:ref:`xor `) of +the first two operands on each enabled lane. 
+The result on disabled lanes is undefined. + +Examples: +""""""""" + +.. code-block:: llvm + + %r = call <4 x i32> @llvm.vp.xor.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl) + ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r + + %t = xor <4 x i32> %a, %b + %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef + + .. _int_mload_mstore: Masked Vector Load and Store Intrinsics diff --git a/llvm/docs/Lexicon.rst b/llvm/docs/Lexicon.rst index b0a6e4655fe8e..8df078457589f 100644 --- a/llvm/docs/Lexicon.rst +++ b/llvm/docs/Lexicon.rst @@ -112,7 +112,7 @@ G **GEP** ``GetElementPtr``. An LLVM IR instruction that is used to get the address of a subelement of an aggregate data structure. It is documented in detail - `here `_. + `here `_. **GVN** Global Value Numbering. GVN is a pass that partitions values computed by a diff --git a/llvm/docs/LibFuzzer.rst b/llvm/docs/LibFuzzer.rst index 0bf6f6bc6ffdd..4e83955a05460 100644 --- a/llvm/docs/LibFuzzer.rst +++ b/llvm/docs/LibFuzzer.rst @@ -581,7 +581,7 @@ you will want to know whether the function or the corpus can be improved further One easy to use metric is, of course, code coverage. We recommend to use -`Clang Coverage `_, +`Clang Coverage `_, to visualize and study your code coverage (`example `_). @@ -768,7 +768,7 @@ Trophies * WOFF2: `[1] `__ -* LLVM: `Clang `_, `Clang-format `_, `libc++ `_, `llvm-as `_, `Demangler `_, Disassembler: http://reviews.llvm.org/rL247405, http://reviews.llvm.org/rL247414, http://reviews.llvm.org/rL247416, http://reviews.llvm.org/rL247417, http://reviews.llvm.org/rL247420, http://reviews.llvm.org/rL247422. +* LLVM: `Clang `_, `Clang-format `_, `libc++ `_, `llvm-as `_, `Demangler `_, Disassembler: http://reviews.llvm.org/rL247405, http://reviews.llvm.org/rL247414, http://reviews.llvm.org/rL247416, http://reviews.llvm.org/rL247417, http://reviews.llvm.org/rL247420, http://reviews.llvm.org/rL247422. * Tensorflow: `[1] `__ @@ -781,18 +781,18 @@ Trophies .. _pcre2: http://www.pcre.org/ .. _AFL: http://lcamtuf.coredump.cx/afl/ .. _Radamsa: https://github.com/aoh/radamsa -.. _SanitizerCoverage: http://clang.llvm.org/docs/SanitizerCoverage.html -.. _SanitizerCoverageTraceDataFlow: http://clang.llvm.org/docs/SanitizerCoverage.html#tracing-data-flow -.. _AddressSanitizer: http://clang.llvm.org/docs/AddressSanitizer.html -.. _LeakSanitizer: http://clang.llvm.org/docs/LeakSanitizer.html +.. _SanitizerCoverage: https://clang.llvm.org/docs/SanitizerCoverage.html +.. _SanitizerCoverageTraceDataFlow: https://clang.llvm.org/docs/SanitizerCoverage.html#tracing-data-flow +.. _AddressSanitizer: https://clang.llvm.org/docs/AddressSanitizer.html +.. _LeakSanitizer: https://clang.llvm.org/docs/LeakSanitizer.html .. _Heartbleed: http://en.wikipedia.org/wiki/Heartbleed .. _FuzzerInterface.h: https://github.com/llvm/llvm-project/blob/master/compiler-rt/lib/fuzzer/FuzzerInterface.h -.. _3.7.0: http://llvm.org/releases/3.7.0/docs/LibFuzzer.html -.. _building Clang from trunk: http://clang.llvm.org/get_started.html -.. _MemorySanitizer: http://clang.llvm.org/docs/MemorySanitizer.html -.. _UndefinedBehaviorSanitizer: http://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html -.. _`coverage counters`: http://clang.llvm.org/docs/SanitizerCoverage.html#coverage-counters +.. _3.7.0: https://llvm.org/releases/3.7.0/docs/LibFuzzer.html +.. _building Clang from trunk: https://clang.llvm.org/get_started.html +.. _MemorySanitizer: https://clang.llvm.org/docs/MemorySanitizer.html +.. 
_UndefinedBehaviorSanitizer: https://clang.llvm.org/docs/UndefinedBehaviorSanitizer.html +.. _`coverage counters`: https://clang.llvm.org/docs/SanitizerCoverage.html#coverage-counters .. _`value profile`: #value-profile -.. _`caller-callee pairs`: http://clang.llvm.org/docs/SanitizerCoverage.html#caller-callee-coverage +.. _`caller-callee pairs`: https://clang.llvm.org/docs/SanitizerCoverage.html#caller-callee-coverage .. _BoringSSL: https://boringssl.googlesource.com/boringssl/ diff --git a/llvm/docs/LoopTerminology.rst b/llvm/docs/LoopTerminology.rst index ef0593419a461..bb2b40a098eff 100644 --- a/llvm/docs/LoopTerminology.rst +++ b/llvm/docs/LoopTerminology.rst @@ -152,7 +152,7 @@ It is ensured by the LoopSimplify (:ref:`-loop-simplify `) pass and is automatically added by the pass managers when scheduling a LoopPass. This pass is implemented in -`LoopSimplify.h `_. +`LoopSimplify.h `_. When it is successful, the loop has: * A preheader. @@ -178,7 +178,7 @@ Rotated Loops Loops are rotated by the LoopRotate (:ref:`loop-rotate `) pass, which converts loops into do/while style loops and is implemented in -`LoopRotation.h `_. Example: +`LoopRotation.h `_. Example: .. code-block:: C diff --git a/llvm/docs/MarkdownQuickstartTemplate.md b/llvm/docs/MarkdownQuickstartTemplate.md index 734152188e578..1ed9f2f80f902 100644 --- a/llvm/docs/MarkdownQuickstartTemplate.md +++ b/llvm/docs/MarkdownQuickstartTemplate.md @@ -64,7 +64,7 @@ structure. ### Example Subsection -Make a link [like this](http://llvm.org/). There is also a more +Make a link [like this](https://llvm.org/). There is also a more sophisticated syntax which [can be more readable] for longer links since it disrupts the flow less. You can put the `[link name]: ` block pretty much anywhere later in the document. diff --git a/llvm/docs/MergeFunctions.rst b/llvm/docs/MergeFunctions.rst index 7c51adac681a0..13294129538ce 100644 --- a/llvm/docs/MergeFunctions.rst +++ b/llvm/docs/MergeFunctions.rst @@ -39,16 +39,16 @@ LLVM code fundamentals. In this article, we assume the reader is familiar with `Single Static Assignment `_ concept and has an understanding of -`IR structure `_. +`IR structure `_. We will use terms such as -"`module `_", -"`function `_", +"`module `_", +"`function `_", "`basic block `_", -"`user `_", -"`value `_", +"`user `_", +"`value `_", "`instruction -`_". +`_". As a good starting point, the Kaleidoscope tutorial can be used: @@ -99,8 +99,8 @@ and a ``void*`` as equal. This is just an example; more possible details are described a bit below. As another example, the reader may imagine two more functions. The first -function performs a multiplication on 2, while the second one performs an -arithmetic right shift on 1. +function performs a multiplication by 2, while the second one performs an +logical left shift by 1. Possible solutions ^^^^^^^^^^^^^^^^^^ diff --git a/llvm/docs/Packaging.rst b/llvm/docs/Packaging.rst index 7c2dc956128ea..176e5b391229b 100644 --- a/llvm/docs/Packaging.rst +++ b/llvm/docs/Packaging.rst @@ -38,7 +38,7 @@ versions of LLVM in parallel. The following configure flags are relevant: should turn it back on to let users debug their programs. ``--enable-optimized`` - (For svn checkouts) Builds LLVM with ``-O2`` and, by default, turns off + (For git checkouts) Builds LLVM with ``-O2`` and, by default, turns off debug symbols. Also available by setting ``ENABLE_OPTIMIZED=0|1`` in ``make``'s environment. This defaults to enabled when not in a checkout. 
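As a concrete illustration of the MergeFunctions example mentioned above (the multiplication-by-2 versus shift-by-1 pair), here is a minimal C++ sketch; the function names are hypothetical and the snippet is only meant to show why the two bodies can end up identical after optimization:

.. code-block:: c++

   // Hypothetical sketch: two source functions that an optimizer will
   // typically lower to the same IR (a single left shift), which is what
   // makes them candidates for merging.
   unsigned timesTwo(unsigned X) { return X * 2; }    // multiplication by 2
   unsigned shiftByOne(unsigned X) { return X << 1; } // left shift by 1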
diff --git a/llvm/docs/ProgrammersManual.rst b/llvm/docs/ProgrammersManual.rst index 99caac6e29b3f..4d0c29cfcd4e4 100644 --- a/llvm/docs/ProgrammersManual.rst +++ b/llvm/docs/ProgrammersManual.rst @@ -24,7 +24,7 @@ continuously growing source code that makes up the LLVM infrastructure. Note that this manual is not intended to serve as a replacement for reading the source code, so if you think there should be a method in one of these classes to do something, but it's not listed, check the source. Links to the `doxygen -`__ sources are provided to make this as easy as +`__ sources are provided to make this as easy as possible. The first section of this document describes general information that is useful @@ -32,7 +32,7 @@ to know when working in the LLVM infrastructure, and the second describes the Core LLVM classes. In the future this manual will be extended with information describing how to use extension libraries, such as dominator information, CFG traversal routines, and useful utilities like the ``InstVisitor`` (`doxygen -`__) template. +`__) template. .. _general: @@ -108,7 +108,7 @@ they don't have some drawbacks (primarily stemming from the fact that ``dynamic_cast<>`` only works on classes that have a v-table). Because they are used so often, you must know what they do and how they work. All of these templates are defined in the ``llvm/Support/Casting.h`` (`doxygen -`__) file (note that you very +`__) file (note that you very rarely have to include this file directly). ``isa<>``: @@ -231,7 +231,7 @@ and clients can call it using any one of: Similarly, APIs which need to return a string may return a ``StringRef`` instance, which can be used directly or converted to an ``std::string`` using the ``str`` member function. See ``llvm/ADT/StringRef.h`` (`doxygen -`__) for more +`__) for more information. You should rarely use the ``StringRef`` class directly, because it contains @@ -243,7 +243,7 @@ passed by value. The ``Twine`` class ^^^^^^^^^^^^^^^^^^^ -The ``Twine`` (`doxygen `__) +The ``Twine`` (`doxygen `__) class is an efficient way for APIs to accept concatenated strings. For example, a common LLVM paradigm is to name one instruction based on the name of another instruction with a suffix, for example: @@ -261,7 +261,7 @@ of strings until it is actually required, at which point it can be efficiently rendered directly into a character array. This avoids unnecessary heap allocation involved in constructing the temporary results of string concatenation. See ``llvm/ADT/Twine.h`` (`doxygen -`__) and :ref:`here ` +`__) and :ref:`here ` for more information. As with a ``StringRef``, ``Twine`` objects point to external memory and should @@ -1056,7 +1056,7 @@ The ``function_ref`` class template ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``function_ref`` -(`doxygen `__) class +(`doxygen `__) class template represents a reference to a callable object, templated over the type of the callable. This is a good choice for passing a callback to a function, if you don't need to hold onto the callback after the function returns. In this @@ -1106,7 +1106,7 @@ you don't want them to always be noisy. A standard compromise is to comment them out, allowing you to enable them if you need them in the future. The ``llvm/Support/Debug.h`` (`doxygen -`__) file provides a macro named +`__) file provides a macro named ``LLVM_DEBUG()`` that is a much nicer solution to this problem. 
Basically, you can put arbitrary code into the argument of the ``LLVM_DEBUG`` macro, and it is only executed if '``opt``' (or any other tool) is run with the '``-debug``' command @@ -1203,7 +1203,7 @@ The ``Statistic`` class & ``-stats`` option ------------------------------------------- The ``llvm/ADT/Statistic.h`` (`doxygen -`__) file provides a class +`__) file provides a class named ``Statistic`` that is used as a unified way to keep track of what the LLVM compiler is doing and how effective various optimizations are. It is useful to see what optimizations are contributing to making a particular program run @@ -1298,7 +1298,7 @@ They provide a framework for making parts of your code only execute a certain number of times. The ``llvm/Support/DebugCounter.h`` (`doxygen -`__) file +`__) file provides a class named ``DebugCounter`` that can be used to create command line counter options that control execution of parts of your code. @@ -2513,7 +2513,7 @@ If you're finding that you commonly iterate over a ``Function``'s ``BasicBlock``\ s and then that ``BasicBlock``'s ``Instruction``\ s, ``InstIterator`` should be used instead. You'll need to include ``llvm/IR/InstIterator.h`` (`doxygen -`__) and then instantiate +`__) and then instantiate ``InstIterator``\ s explicitly in your code. Here's a small example that shows how to dump all instructions in a function to the standard error stream: @@ -2664,7 +2664,7 @@ and in other situations, you may find that you want to treat ``CallInst``\ s and ``InvokeInst``\ s the same way, even though their most-specific common base class is ``Instruction``, which includes lots of less closely-related things. For these cases, LLVM provides a handy wrapper class called ``CallSite`` -(`doxygen `__) It is +(`doxygen `__) It is essentially a wrapper around an ``Instruction`` pointer, with some methods that provide functionality common to ``CallInst``\ s and ``InvokeInst``\ s. @@ -2680,7 +2680,7 @@ Iterating over def-use & use-def chains ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Frequently, we might have an instance of the ``Value`` class (`doxygen -`__) and we want to determine +`__) and we want to determine which ``User``\ s use the ``Value``. The list of all ``User``\ s of a particular ``Value`` is called a *def-use* chain. For example, let's say we have a ``Function*`` named ``F`` to a particular function ``foo``. Finding all of the @@ -2698,7 +2698,7 @@ chain of ``F``: } Alternatively, it's common to have an instance of the ``User`` Class (`doxygen -`__) and need to know what +`__) and need to know what ``Value``\ s are used by it. The list of all ``Value``\ s used by a ``User`` is known as a *use-def* chain. Instances of class ``Instruction`` are common ``User`` s, so we might want to iterate over all of the values that a particular @@ -2770,7 +2770,7 @@ will create an ``AllocaInst`` instance that represents the allocation of one integer in the current stack frame, at run time. Each ``Instruction`` subclass is likely to have varying default parameters which change the semantics of the instruction, so refer to the `doxygen documentation for the subclass of -Instruction `_ that +Instruction `_ that you're interested in instantiating. *Naming values* @@ -2928,7 +2928,7 @@ Replacing individual instructions """"""""""""""""""""""""""""""""" Including "`llvm/Transforms/Utils/BasicBlockUtils.h -`_" permits use of two +`_" permits use of two very useful replace functions: ``ReplaceInstWithValue`` and ``ReplaceInstWithInst``. 
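To make the def-use discussion above concrete, here is a minimal sketch (assuming a ``Function *F`` is already in hand; the helper name ``printUsersOf`` is hypothetical) that visits every instruction using ``F``:

.. code-block:: c++

   #include "llvm/IR/Function.h"
   #include "llvm/IR/Instruction.h"
   #include "llvm/Support/Casting.h"
   #include "llvm/Support/raw_ostream.h"
   using namespace llvm;

   // Walk the def-use chain of F: every User of the Function is visited,
   // and Users that are instructions (e.g. calls to F) are printed.
   static void printUsersOf(Function *F) {
     for (User *U : F->users())
       if (auto *I = dyn_cast<Instruction>(U))
         errs() << "F is used in instruction:\n" << *I << "\n";
   }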
@@ -2974,8 +2974,8 @@ Replacing multiple uses of Users and Values You can use ``Value::replaceAllUsesWith`` and ``User::replaceUsesOfWith`` to change more than one use at a time. See the doxygen documentation for the -`Value Class `_ and `User Class -`_, respectively, for more +`Value Class `_ and `User Class +`_, respectively, for more information. .. _schanges_deletingGV: @@ -3103,7 +3103,7 @@ The ``ValueSymbolTable`` class ------------------------------ The ``ValueSymbolTable`` (`doxygen -`__) class provides +`__) class provides a symbol table that the :ref:`Function ` and Module_ classes use for naming value definitions. The symbol table can provide a name for any Value_. @@ -3124,10 +3124,10 @@ autoinsert it into the appropriate symbol table. The ``User`` and owned ``Use`` classes' memory layout ----------------------------------------------------- -The ``User`` (`doxygen `__) +The ``User`` (`doxygen `__) class provides a basis for expressing the ownership of ``User`` towards other -`Value instance `_\ s. The -``Use`` (`doxygen `__) helper +`Value instance `_\ s. The +``Use`` (`doxygen `__) helper class is employed to do the bookkeeping and to facilitate *O(1)* addition and removal. @@ -3414,9 +3414,9 @@ The Core LLVM Class Hierarchy Reference ``#include "llvm/IR/Type.h"`` -header source: `Type.h `_ +header source: `Type.h `_ -doxygen info: `Type Classes `_ +doxygen info: `Type Classes `_ The Core LLVM classes are the primary means of representing the program being inspected or transformed. The core LLVM classes are defined in header files in @@ -3518,9 +3518,9 @@ The ``Module`` class ``#include "llvm/IR/Module.h"`` -header source: `Module.h `_ +header source: `Module.h `_ -doxygen info: `Module Class `_ +doxygen info: `Module Class `_ The ``Module`` class represents the top level structure present in LLVM programs. An LLVM module is effectively either a translation unit of the @@ -3611,9 +3611,9 @@ The ``Value`` class ``#include "llvm/IR/Value.h"`` -header source: `Value.h `_ +header source: `Value.h `_ -doxygen info: `Value Class `_ +doxygen info: `Value Class `_ The ``Value`` class is the most important class in the LLVM Source base. It represents a typed value that may be used (among other things) as an operand to @@ -3702,9 +3702,9 @@ The ``User`` class ``#include "llvm/IR/User.h"`` -header source: `User.h `_ +header source: `User.h `_ -doxygen info: `User Class `_ +doxygen info: `User Class `_ Superclass: Value_ @@ -3749,10 +3749,10 @@ The ``Instruction`` class ``#include "llvm/IR/Instruction.h"`` header source: `Instruction.h -`_ +`_ doxygen info: `Instruction Class -`_ +`_ Superclasses: User_, Value_ @@ -3773,7 +3773,7 @@ instructions in LLVM. It describes the enum values that are used as opcodes concrete sub-classes of ``Instruction`` that implement the instruction (for example BinaryOperator_ and CmpInst_). Unfortunately, the use of macros in this file confuses doxygen, so these enum values don't show up correctly in the -`doxygen output `_. +`doxygen output `_. .. 
_s_Instruction: @@ -3890,10 +3890,10 @@ The ``GlobalValue`` class ``#include "llvm/IR/GlobalValue.h"`` header source: `GlobalValue.h -`_ +`_ doxygen info: `GlobalValue Class -`_ +`_ Superclasses: Constant_, User_, Value_ @@ -3948,10 +3948,10 @@ The ``Function`` class ``#include "llvm/IR/Function.h"`` -header source: `Function.h `_ +header source: `Function.h `_ doxygen info: `Function Class -`_ +`_ Superclasses: GlobalValue_, Constant_, User_, Value_ @@ -4057,10 +4057,10 @@ The ``GlobalVariable`` class ``#include "llvm/IR/GlobalVariable.h"`` header source: `GlobalVariable.h -`_ +`_ doxygen info: `GlobalVariable Class -`_ +`_ Superclasses: GlobalValue_, Constant_, User_, Value_ @@ -4115,10 +4115,10 @@ The ``BasicBlock`` class ``#include "llvm/IR/BasicBlock.h"`` header source: `BasicBlock.h -`_ +`_ doxygen info: `BasicBlock Class -`_ +`_ Superclass: Value_ diff --git a/llvm/docs/Proposals/GitHubMove.rst b/llvm/docs/Proposals/GitHubMove.rst index ae799b57af795..977f0849aa708 100644 --- a/llvm/docs/Proposals/GitHubMove.rst +++ b/llvm/docs/Proposals/GitHubMove.rst @@ -319,7 +319,7 @@ Currently # direct SVN checkout svn co https://user@llvm.org/svn/llvm-project/llvm/trunk llvm # or using the read-only Git view, with git-svn - git clone http://llvm.org/git/llvm.git + git clone https://llvm.org/git/llvm.git cd llvm git svn init https://llvm.org/svn/llvm-project/llvm/trunk --username= git config svn-remote.svn.fetch :refs/remotes/origin/master @@ -381,29 +381,29 @@ Currently :: - svn co http://llvm.org/svn/llvm-project/llvm/trunk llvm -r $REVISION + svn co https://llvm.org/svn/llvm-project/llvm/trunk llvm -r $REVISION cd llvm/tools - svn co http://llvm.org/svn/llvm-project/clang/trunk clang -r $REVISION + svn co https://llvm.org/svn/llvm-project/clang/trunk clang -r $REVISION cd ../projects - svn co http://llvm.org/svn/llvm-project/libcxx/trunk libcxx -r $REVISION + svn co https://llvm.org/svn/llvm-project/libcxx/trunk libcxx -r $REVISION Or using git-svn:: - git clone http://llvm.org/git/llvm.git + git clone https://llvm.org/git/llvm.git cd llvm/ git svn init https://llvm.org/svn/llvm-project/llvm/trunk --username= git config svn-remote.svn.fetch :refs/remotes/origin/master git svn rebase -l git checkout `git svn find-rev -B r258109` cd tools - git clone http://llvm.org/git/clang.git + git clone https://llvm.org/git/clang.git cd clang/ git svn init https://llvm.org/svn/llvm-project/clang/trunk --username= git config svn-remote.svn.fetch :refs/remotes/origin/master git svn rebase -l git checkout `git svn find-rev -B r258109` cd ../../projects/ - git clone http://llvm.org/git/libcxx.git + git clone https://llvm.org/git/libcxx.git cd libcxx git svn init https://llvm.org/svn/llvm-project/libcxx/trunk --username= git config svn-remote.svn.fetch :refs/remotes/origin/master diff --git a/llvm/docs/README.txt b/llvm/docs/README.txt index 92b146f0c1c29..2a9b2e3830298 100644 --- a/llvm/docs/README.txt +++ b/llvm/docs/README.txt @@ -5,7 +5,7 @@ LLVM's documentation is written in reStructuredText, a lightweight plaintext markup language (file extension `.rst`). While the reStructuredText documentation should be quite readable in source form, it is mostly meant to be processed by the Sphinx documentation generation -system to create HTML pages which are hosted on and +system to create HTML pages which are hosted on and updated after every commit. Manpage output is also supported, see below. 
If you instead would like to generate and view the HTML locally, install @@ -17,7 +17,7 @@ Sphinx and then do: $BROWSER /docs//html/index.html The mapping between reStructuredText files and generated documentation is -`docs/Foo.rst` <-> `/docs//html/Foo.html` <-> `http://llvm.org/docs/Foo.html`. +`docs/Foo.rst` <-> `/docs//html/Foo.html` <-> `https://llvm.org/docs/Foo.html`. If you are interested in writing new documentation, you will want to read `SphinxQuickstartTemplate.rst` which will get you writing documentation @@ -41,7 +41,7 @@ The correspondence between .rst files and man pages is `docs/CommandGuide/Foo.rst` <-> `/docs//man/Foo.1`. These .rst files are also included during HTML generation so they are also viewable online (as noted above) at e.g. -`http://llvm.org/docs/CommandGuide/Foo.html`. +`https://llvm.org/docs/CommandGuide/Foo.html`. Checking links ============== diff --git a/llvm/docs/Reference.rst b/llvm/docs/Reference.rst index 9d747b8498faf..d116edafb9bfb 100644 --- a/llvm/docs/Reference.rst +++ b/llvm/docs/Reference.rst @@ -53,8 +53,8 @@ LLVM and API reference documentation. API Reference ------------- -`Doxygen generated documentation `_ - (`classes `_) +`Doxygen generated documentation `_ + (`classes `_) :doc:`HowToUseAttributes` Answers some questions about the new Attributes infrastructure. diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index bbfcc6076c018..4f6e759bbeb39 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -72,6 +72,9 @@ Changes to the ARM Backend During this release ... +* Implemented C-language intrinsics for the full Arm v8.1-M MVE instruction + set. ```` now supports the complete API defined in the Arm C + Language Extensions. Changes to the MIPS Target -------------------------- diff --git a/llvm/docs/ReleaseProcess.rst b/llvm/docs/ReleaseProcess.rst index 6a14e28d189a8..69a209cbc66d8 100644 --- a/llvm/docs/ReleaseProcess.rst +++ b/llvm/docs/ReleaseProcess.rst @@ -115,7 +115,7 @@ Test Suite :local: Follow the `LNT Quick Start Guide -`__ link on how to set-up the +`__ link on how to set-up the test-suite The binary location you'll have to use for testing is inside the @@ -160,7 +160,7 @@ candidates, on the previous release. You should: * Download the previous release sources from - http://llvm.org/releases/download.html. + https://llvm.org/releases/download.html. * Run the test-release.sh script on ``final`` mode (change ``-rc 1`` to ``-final``). @@ -190,7 +190,7 @@ to them), and run the release test as above. You should: * Download the current candidate sources from where the release manager points - you (ex. http://llvm.org/pre-releases/3.3/rc1/). + you (ex. https://llvm.org/pre-releases/3.3/rc1/). * Repeat the steps above with ``-rc 1``, ``-rc 2`` etc modes and run the test-suite the same way. diff --git a/llvm/docs/SphinxQuickstartTemplate.rst b/llvm/docs/SphinxQuickstartTemplate.rst index cd23b61d40813..5ebb92affb87a 100644 --- a/llvm/docs/SphinxQuickstartTemplate.rst +++ b/llvm/docs/SphinxQuickstartTemplate.rst @@ -84,7 +84,7 @@ To create a new paragraph, simply insert a blank line. Links ===== -You can format a link `like this `_. A more `sophisticated syntax`_ allows you to place the ``.. _`link text`: `` block +You can format a link `like this `_. A more `sophisticated syntax`_ allows you to place the ``.. _`link text`: `` block pretty much anywhere else in the document. This is useful when linking to especially long URLs. .. 
_`sophisticated syntax`: http://en.wikipedia.org/wiki/LLVM diff --git a/llvm/docs/TableGen/index.rst b/llvm/docs/TableGen/index.rst index 0697bd0298e8c..6100c13bea765 100644 --- a/llvm/docs/TableGen/index.rst +++ b/llvm/docs/TableGen/index.rst @@ -28,7 +28,7 @@ hands the result off to a domain-specific `backend`_ for processing. The current major users of TableGen are :doc:`../CodeGenerator` and the -`Clang diagnostics and attributes `_. +`Clang diagnostics and attributes `_. Note that if you work on TableGen much, and use emacs or vim, that you can find an emacs "TableGen mode" and a vim language file in the ``llvm/utils/emacs`` and diff --git a/llvm/docs/TestSuiteGuide.md b/llvm/docs/TestSuiteGuide.md index b41d7ec59343e..6128636ce5eed 100644 --- a/llvm/docs/TestSuiteGuide.md +++ b/llvm/docs/TestSuiteGuide.md @@ -19,7 +19,7 @@ Quickstart % mkdir venv % virtualenv venv % . venv/bin/activate - % pip install svn+http://llvm.org/svn/llvm-project/llvm/trunk/utils/lit + % pip install svn+https://llvm.org/svn/llvm-project/llvm/trunk/utils/lit % lit --version lit 0.8.0dev ``` @@ -279,7 +279,7 @@ Example usage: LNT is a set of client and server tools for continuously monitoring performance. You can find more information at -[http://llvm.org/docs/lnt](http://llvm.org/docs/lnt). The official LNT instance +[https://llvm.org/docs/lnt](https://llvm.org/docs/lnt). The official LNT instance of the LLVM project is hosted at [http://lnt.llvm.org](http://lnt.llvm.org). @@ -348,7 +348,7 @@ Cross Compilation and External Devices CMake allows to cross compile to a different target via toolchain files. More information can be found here: -- [http://llvm.org/docs/lnt/tests.html#cross-compiling](http://llvm.org/docs/lnt/tests.html#cross-compiling) +- [https://llvm.org/docs/lnt/tests.html#cross-compiling](https://llvm.org/docs/lnt/tests.html#cross-compiling) - [https://cmake.org/cmake/help/latest/manual/cmake-toolchains.7.html](https://cmake.org/cmake/help/latest/manual/cmake-toolchains.7.html) @@ -389,7 +389,7 @@ Running the test-suite via LNT The LNT tool can run the test-suite. Use this when submitting test results to an LNT instance. See -[http://llvm.org/docs/lnt/tests.html#llvm-cmake-test-suite](http://llvm.org/docs/lnt/tests.html#llvm-cmake-test-suite) +[https://llvm.org/docs/lnt/tests.html#llvm-cmake-test-suite](https://llvm.org/docs/lnt/tests.html#llvm-cmake-test-suite) for details. Running the test-suite via Makefiles (deprecated) diff --git a/llvm/docs/TestingGuide.rst b/llvm/docs/TestingGuide.rst index 1659e8777fe00..c8ee65f132f99 100644 --- a/llvm/docs/TestingGuide.rst +++ b/llvm/docs/TestingGuide.rst @@ -129,7 +129,7 @@ in release mode, i.e. % cmake -DCMAKE_BUILD_TYPE="Release" -DLLVM_ENABLE_ASSERTIONS=On -If you have `Clang `_ checked out and built, you +If you have `Clang `_ checked out and built, you can run the LLVM and Clang tests simultaneously using: .. code-block:: bash @@ -556,7 +556,7 @@ RUN lines: output affects test results. It's usually easy to tell: just look for redirection or piping of the ``FileCheck`` call's stdout or stderr. -To add more substituations, look at ``test/lit.cfg`` or ``lit.local.cfg``. +To add more substitutions, look at ``test/lit.cfg`` or ``lit.local.cfg``. Options @@ -593,7 +593,7 @@ To make the output more useful, :program:`lit` will scan the lines of the test case for ones that contain a pattern that matches ``PR[0-9]+``. This is the syntax for specifying a PR (Problem Report) number that is related to the test case. 
The number after "PR" specifies the -LLVM bugzilla number. When a PR number is specified, it will be used in +LLVM Bugzilla number. When a PR number is specified, it will be used in the pass/fail reporting. This is useful to quickly get some context when a test fails. diff --git a/llvm/docs/TypeMetadata.rst b/llvm/docs/TypeMetadata.rst index 7d0745b927963..74d439411497e 100644 --- a/llvm/docs/TypeMetadata.rst +++ b/llvm/docs/TypeMetadata.rst @@ -29,7 +29,7 @@ or functions. An intrinsic, :ref:`llvm.type.test `, is used to test whether a given pointer is associated with a type identifier. -.. _control flow integrity: http://clang.llvm.org/docs/ControlFlowIntegrity.html +.. _control flow integrity: https://clang.llvm.org/docs/ControlFlowIntegrity.html Representing Type Information using Type Metadata ================================================= @@ -160,7 +160,7 @@ as the former will be the jump table entry if a jump table is necessary. The `GlobalLayoutBuilder`_ class is responsible for laying out the globals efficiently to minimize the sizes of the underlying bitsets. -.. _control flow integrity design document: http://clang.llvm.org/docs/ControlFlowIntegrityDesign.html +.. _control flow integrity design document: https://clang.llvm.org/docs/ControlFlowIntegrityDesign.html :Example: diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst index 1643778360306..5673ae65cce96 100644 --- a/llvm/docs/UserGuides.rst +++ b/llvm/docs/UserGuides.rst @@ -2,7 +2,7 @@ User Guides =========== NOTE: If you are a user who is only interested in using an LLVM-based compiler, -you should look into `Clang `_ instead. The +you should look into `Clang `_ instead. The documentation here is intended for users who have a need to work with the intermediate LLVM representation. @@ -71,7 +71,7 @@ Clang `How to build the C, C++, ObjC, and ObjC++ front end`__ Instructions for building the clang front-end from source. - .. __: http://clang.llvm.org/get_started.html + .. __: https://clang.llvm.org/get_started.html :doc:`CoverageMappingFormat` This describes the format and encoding used for LLVM’s code coverage mapping. diff --git a/llvm/docs/Vectorizers.rst b/llvm/docs/Vectorizers.rst index 83eb5fb256ae7..c322797025fb6 100644 --- a/llvm/docs/Vectorizers.rst +++ b/llvm/docs/Vectorizers.rst @@ -80,7 +80,7 @@ specifying a vector width and interleaving count: See the Clang `language extensions -`_ +`_ for details. Diagnostics @@ -133,7 +133,7 @@ switch statement cannot be vectorized. To ensure line and column numbers are produced include the command line options ``-gline-tables-only`` and ``-gcolumn-info``. See the Clang `user manual -`_ +`_ for details Features diff --git a/llvm/docs/WritingAnLLVMPass.rst b/llvm/docs/WritingAnLLVMPass.rst index ecd1db1344d89..b9ba13f302f12 100644 --- a/llvm/docs/WritingAnLLVMPass.rst +++ b/llvm/docs/WritingAnLLVMPass.rst @@ -17,7 +17,7 @@ build the analysis results that are used by these transformations, and they are, above all, a structuring technique for compiler code. All LLVM passes are subclasses of the `Pass -`_ class, which implement +`_ class, which implement functionality by overriding virtual methods inherited from ``Pass``. 
Depending on how your pass works, you should inherit from the :ref:`ModulePass ` , :ref:`CallGraphSCCPass @@ -98,8 +98,8 @@ Start out with: #include "llvm/Support/raw_ostream.h" Which are needed because we are writing a `Pass -`_, we are operating on -`Function `_\ s, and we will +`_, we are operating on +`Function `_\ s, and we will be doing some printing. Next we have: @@ -336,7 +336,7 @@ The ``ImmutablePass`` class --------------------------- The most plain and boring type of pass is the "`ImmutablePass -`_" class. This pass +`_" class. This pass type is used for passes that do not have to be run, do not change state, and never need to be updated. This is not a normal type of transformation or analysis, but can provide information about the current compiler configuration. @@ -353,7 +353,7 @@ invalidated, and are never "run". The ``ModulePass`` class ------------------------ -The `ModulePass `_ class +The `ModulePass `_ class is the most general of all superclasses that you can use. Deriving from ``ModulePass`` indicates that your pass uses the entire program as a unit, referring to function bodies in no predictable order, or adding and removing @@ -388,7 +388,7 @@ The ``CallGraphSCCPass`` class ------------------------------ The `CallGraphSCCPass -`_ is used by +`_ is used by passes that need to traverse the program bottom-up on the call graph (callees before callers). Deriving from ``CallGraphSCCPass`` provides some mechanics for building and traversing the ``CallGraph``, but also allows the system to @@ -460,7 +460,7 @@ The ``FunctionPass`` class -------------------------- In contrast to ``ModulePass`` subclasses, `FunctionPass -`_ subclasses do have a +`_ subclasses do have a predictable, local behavior that can be expected by the system. All ``FunctionPass`` execute on each function in the program independent of all of the other functions in the program. ``FunctionPass``\ es do not require that @@ -498,7 +498,7 @@ being processed. The ``doInitialization`` method call is not scheduled to overlap with any other pass executions (thus it should be very fast). A good example of how this method should be used is the `LowerAllocations -`_ pass. This pass +`_ pass. This pass converts ``malloc`` and ``free`` instructions into platform dependent ``malloc()`` and ``free()`` function calls. It uses the ``doInitialization`` method to get a reference to the ``malloc`` and ``free`` functions that it @@ -761,7 +761,7 @@ The ``getAnalysisUsage`` method By implementing the ``getAnalysisUsage`` method, the required and invalidated sets may be specified for your transformation. The implementation should fill in the `AnalysisUsage -`_ object with +`_ object with information about which passes are required and not invalidated. To do this, a pass may call any of the following methods on the ``AnalysisUsage`` object: @@ -914,14 +914,14 @@ be registered with :ref:`RegisterAnalysisGroup `. As a concrete example of an Analysis Group in action, consider the -`AliasAnalysis `_ +`AliasAnalysis `_ analysis group. The default implementation of the alias analysis interface -(the `basicaa `_ pass) +(the `basicaa `_ pass) just does a few simple checks that don't require significant analysis to compute (such as: two different globals can never alias each other, etc). Passes that use the `AliasAnalysis -`_ interface (for -example the `gvn `_ pass), do not +`_ interface (for +example the `gvn `_ pass), do not care which implementation of alias analysis is actually provided, they just use the designated interface. 
@@ -963,7 +963,7 @@ implementations of the interface by using the following code: This just shows a class ``FancyAA`` that uses the ``INITIALIZE_AG_PASS`` macro both to register and to "join" the `AliasAnalysis -`_ analysis group. +`_ analysis group. Every implementation of an analysis group should join using this macro. .. code-block:: c++ @@ -982,13 +982,13 @@ argument to the ``INITIALIZE_AG_PASS`` template). There must be exactly one default implementation available at all times for an Analysis Group to be used. Only default implementation can derive from ``ImmutablePass``. Here we declare that the `BasicAliasAnalysis -`_ pass is the default +`_ pass is the default implementation for the interface. Pass Statistics =============== -The `Statistic `_ class is +The `Statistic `_ class is designed to be an easy way to expose various success metrics from passes. These statistics are printed at the end of a run, when the :option:`-stats` command line option is enabled on the command line. See the :ref:`Statistics @@ -999,8 +999,8 @@ section ` in the Programmer's Manual for details. What PassManager does --------------------- -The `PassManager `_ `class -`_ takes a list of +The `PassManager `_ `class +`_ takes a list of passes, ensures their :ref:`prerequisites ` are set up correctly, and then schedules passes to run efficiently. All of the LLVM tools that run passes use the PassManager for execution of these passes. @@ -1030,7 +1030,7 @@ series of passes: touching the LLVM program representation for a single function at a time, instead of traversing the entire program. It reduces the memory consumption of compiler, because, for example, only one `DominatorSet - `_ needs to be + `_ needs to be calculated at a time. This also makes it possible to implement some :ref:`interesting enhancements ` in the future. diff --git a/llvm/docs/index.rst b/llvm/docs/index.rst index d4cf6d24e6c57..7315d7278f8b3 100644 --- a/llvm/docs/index.rst +++ b/llvm/docs/index.rst @@ -4,7 +4,7 @@ About .. warning:: If you are using a released version of LLVM, see `the download page - `_ to find your documentation. + `_ to find your documentation. The LLVM compiler infrastructure supports a wide range of projects, from industrial strength compilers to specialized JIT applications to small @@ -27,7 +27,7 @@ Several introductory papers and presentations. `Introduction to the LLVM Compiler`__ Presentation providing a users introduction to LLVM. - .. __: http://llvm.org/pubs/2008-10-04-ACAT-LLVM-Intro.html + .. __: https://llvm.org/pubs/2008-10-04-ACAT-LLVM-Intro.html `Intro to LLVM`__ A chapter from the book "The Architecture of Open Source Applications" that @@ -39,12 +39,12 @@ Several introductory papers and presentations. `LLVM: A Compilation Framework for Lifelong Program Analysis & Transformation`__ Design overview. - .. __: http://llvm.org/pubs/2004-01-30-CGO-LLVM.html + .. __: https://llvm.org/pubs/2004-01-30-CGO-LLVM.html `LLVM: An Infrastructure for Multi-Stage Optimization`__ More details (quite old now). - .. __: http://llvm.org/pubs/2002-12-LattnerMSThesis.html + .. __: https://llvm.org/pubs/2002-12-LattnerMSThesis.html Documentation ============= diff --git a/llvm/docs/tutorial/BuildingAJIT1.rst b/llvm/docs/tutorial/BuildingAJIT1.rst index 5c711fcba141f..33d36896cbfc1 100644 --- a/llvm/docs/tutorial/BuildingAJIT1.rst +++ b/llvm/docs/tutorial/BuildingAJIT1.rst @@ -320,4 +320,4 @@ Here is the code: +-----------------------------+-----------------------------------------------+ .. 
[3] See the ErrorHandling section in the LLVM Programmer's Manual - (http://llvm.org/docs/ProgrammersManual.html#error-handling) + (https://llvm.org/docs/ProgrammersManual.html#error-handling) diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.rst index ab9656b2fb4b7..9091043e6a03c 100644 --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl02.rst @@ -718,7 +718,7 @@ Full Code Listing Here is the complete code listing for our running example. Because this uses the LLVM libraries, we need to link them in. To do this, we use the -`llvm-config `_ tool to inform +`llvm-config `_ tool to inform our makefile/command line about which options to use: .. code-block:: bash diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst index 50e8c44bfc1c1..32472e3a48202 100644 --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl03.rst @@ -20,7 +20,7 @@ later. LLVM 3.6 and before will not work with it. Also note that you need to use a version of this tutorial that matches your LLVM release: If you are using an official LLVM release, use the version of the documentation included with your release or on the `llvm.org releases -page `_. +page `_. Code Generation Setup ===================== @@ -90,7 +90,7 @@ detail, we just need a single instance to pass into APIs that require it. The ``Builder`` object is a helper object that makes it easy to generate LLVM instructions. Instances of the -`IRBuilder `_ +`IRBuilder `_ class template keep track of the current place to insert instructions and has methods to create new instructions. @@ -549,7 +549,7 @@ Full Code Listing Here is the complete code listing for our running example, enhanced with the LLVM code generator. Because this uses the LLVM libraries, we need to link them in. To do this, we use the -`llvm-config `_ tool to inform +`llvm-config `_ tool to inform our makefile/command line about which options to use: .. code-block:: bash diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst index 24c2b0f1755f6..85c233c9d3d59 100644 --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl04.rst @@ -98,7 +98,7 @@ LLVM Optimization Passes Due to the transition to the new PassManager infrastructure this tutorial is based on ``llvm::legacy::FunctionPassManager`` which can be found in - `LegacyPassManager.h `_. + `LegacyPassManager.h `_. For the purpose of the this tutorial the above should be used until the pass manager transition is complete. diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.rst index 11ae79de3019c..a55cfe277de5a 100644 --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl05.rst @@ -213,7 +213,7 @@ Kaleidoscope looks like this: } To visualize the control flow graph, you can use a nifty feature of the -LLVM '`opt `_' tool. If you put this LLVM +LLVM '`opt `_' tool. 
If you put this LLVM IR into "t.ll" and run "``llvm-as < t.ll | opt -analyze -view-cfg``", `a window will pop up <../../ProgrammersManual.html#viewing-graphs-while-debugging-code>`_ and you'll see this graph: diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl08.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl08.rst index 82776006a80ea..16b45323154a9 100644 --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl08.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl08.rst @@ -23,7 +23,7 @@ machine. To specify the architecture that you want to target, we use a string called a "target triple". This takes the form ``---`` (see the `cross compilation docs -`_). +`_). As an example, we can see what clang thinks is our current target triple: diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst index 4cdecc3ff1318..0304c8ec813fa 100644 --- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst +++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst @@ -165,13 +165,13 @@ DWARF Emission Setup ==================== Similar to the ``IRBuilder`` class we have a -`DIBuilder `_ class +`DIBuilder `_ class that helps in constructing debug metadata for an LLVM IR file. It corresponds 1:1 similarly to ``IRBuilder`` and LLVM IR, but with nicer names. Using it does require that you be more familiar with DWARF terminology than you needed to be with ``IRBuilder`` and ``Instruction`` names, but if you read through the general documentation on the -`Metadata Format `_ it +`Metadata Format `_ it should be a little more clear. We'll be using this class to construct all of our IR level descriptions. Construction for it takes a module so we need to construct it shortly after we construct our module. We've left it diff --git a/llvm/docs/tutorial/OCamlLangImpl3.rst b/llvm/docs/tutorial/OCamlLangImpl3.rst index fb0648928caa4..0b37ecd5ffd9a 100644 --- a/llvm/docs/tutorial/OCamlLangImpl3.rst +++ b/llvm/docs/tutorial/OCamlLangImpl3.rst @@ -65,7 +65,7 @@ the top-level structure that the LLVM IR uses to contain code. The ``Codegen.builder`` object is a helper object that makes it easy to generate LLVM instructions. Instances of the -`IRBuilder `_ +`IRBuilder `_ class keep track of the current place to insert instructions and has methods to create new instructions. @@ -522,7 +522,7 @@ Full Code Listing Here is the complete code listing for our running example, enhanced with the LLVM code generator. Because this uses the LLVM libraries, we need to link them in. To do this, we use the -`llvm-config `_ tool to inform +`llvm-config `_ tool to inform our makefile/command line about which options to use: .. code-block:: bash diff --git a/llvm/docs/tutorial/OCamlLangImpl5.rst b/llvm/docs/tutorial/OCamlLangImpl5.rst index 34d2dbb4c4de7..871eb05df37c3 100644 --- a/llvm/docs/tutorial/OCamlLangImpl5.rst +++ b/llvm/docs/tutorial/OCamlLangImpl5.rst @@ -161,7 +161,7 @@ Kaleidoscope looks like this: } To visualize the control flow graph, you can use a nifty feature of the -LLVM '`opt `_' tool. If you put this LLVM +LLVM '`opt `_' tool. 
If you put this LLVM IR into "t.ll" and run "``llvm-as < t.ll | opt -analyze -view-cfg``", `a window will pop up <../ProgrammersManual.html#viewing-graphs-while-debugging-code>`_ and you'll see this graph: diff --git a/llvm/docs/tutorial/index.rst b/llvm/docs/tutorial/index.rst index 8aa45184902db..e3c50f0424560 100644 --- a/llvm/docs/tutorial/index.rst +++ b/llvm/docs/tutorial/index.rst @@ -51,5 +51,5 @@ External Tutorials Advanced Topics =============== -#. `Writing an Optimization for LLVM `_ +#. `Writing an Optimization for LLVM `_ diff --git a/llvm/examples/OrcV2Examples/CMakeLists.txt b/llvm/examples/OrcV2Examples/CMakeLists.txt index f1c51b79690a0..b6d8d705f2162 100644 --- a/llvm/examples/OrcV2Examples/CMakeLists.txt +++ b/llvm/examples/OrcV2Examples/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(BasicOrcV2CBindings) add_subdirectory(LLJITDumpObjects) -add_subdirectory(LLJITWithObjectCache) add_subdirectory(LLJITWithCustomObjectLinkingLayer) +add_subdirectory(LLJITWithGDBRegistrationListener) add_subdirectory(LLJITWithLazyReexports) +add_subdirectory(LLJITWithObjectCache) add_subdirectory(LLJITWithObjectLinkingLayerPlugin) diff --git a/llvm/examples/OrcV2Examples/LLJITWithGDBRegistrationListener/CMakeLists.txt b/llvm/examples/OrcV2Examples/LLJITWithGDBRegistrationListener/CMakeLists.txt new file mode 100644 index 0000000000000..0b1cdffb38053 --- /dev/null +++ b/llvm/examples/OrcV2Examples/LLJITWithGDBRegistrationListener/CMakeLists.txt @@ -0,0 +1,17 @@ +set(LLVM_LINK_COMPONENTS + Core + ExecutionEngine + IRReader + JITLink + OrcJIT + Support + nativecodegen + ) + +add_llvm_example(LLJITWithGDBRegistrationListener + LLJITWithGDBRegistrationListener.cpp + ) + +# We want JIT'd code to be able to link against process symbols like printf +# for this example, so make sure they're exported. +export_executable_symbols(LLJITWithGDBRegistrationListener) diff --git a/llvm/examples/OrcV2Examples/LLJITWithGDBRegistrationListener/LLJITWithGDBRegistrationListener.cpp b/llvm/examples/OrcV2Examples/LLJITWithGDBRegistrationListener/LLJITWithGDBRegistrationListener.cpp new file mode 100644 index 0000000000000..5cf7cd00ffc53 --- /dev/null +++ b/llvm/examples/OrcV2Examples/LLJITWithGDBRegistrationListener/LLJITWithGDBRegistrationListener.cpp @@ -0,0 +1,109 @@ +//===--------------- LLJITWithCustomObjectLinkingLayer.cpp ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file shows how to switch LLJIT to use a custom object linking layer (we +// use ObjectLinkingLayer, which is backed by JITLink, as an example). 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/StringMap.h" +#include "llvm/ExecutionEngine/JITEventListener.h" +#include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h" +#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" +#include "llvm/ExecutionEngine/Orc/LLJIT.h" +#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" +#include "llvm/ExecutionEngine/SectionMemoryManager.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" + +#include "../ExampleModules.h" + +using namespace llvm; +using namespace llvm::orc; + +ExitOnError ExitOnErr; + +static cl::opt + EntryPointName("entry", cl::desc("Symbol to call as main entry point"), + cl::init("main")); + +static cl::list InputFiles(cl::Positional, cl::OneOrMore, + cl::desc("input files")); + +static cl::list InputArgv("args", cl::Positional, + cl::desc("..."), + cl::ZeroOrMore, cl::PositionalEatsArgs); + +int main(int argc, char *argv[]) { + // Initialize LLVM. + InitLLVM X(argc, argv); + + InitializeNativeTarget(); + InitializeNativeTargetAsmPrinter(); + + cl::ParseCommandLineOptions(argc, argv, "LLJITWithCustomObjectLinkingLayer"); + ExitOnErr.setBanner(std::string(argv[0]) + ": "); + + // Detect the host and set code model to small. + auto JTMB = ExitOnErr(JITTargetMachineBuilder::detectHost()); + if (!JTMB.getTargetTriple().isOSLinux()) + errs() + << "Warning: This demo may not work for platforms other than Linux.\n"; + + // Create an LLJIT instance and use a custom object linking layer creator to + // register the GDBRegistrationListener with our RTDyldObjectLinkingLayer. + auto J = + ExitOnErr(LLJITBuilder() + .setJITTargetMachineBuilder(std::move(JTMB)) + .setObjectLinkingLayerCreator([&](ExecutionSession &ES, + const Triple &TT) { + auto GetMemMgr = []() { + return std::make_unique(); + }; + auto ObjLinkingLayer = + std::make_unique( + ES, std::move(GetMemMgr)); + ObjLinkingLayer->registerJITEventListener( + *JITEventListener::createGDBRegistrationListener()); + return ObjLinkingLayer; + }) + .create()); + + // Make sure that our process symbols are visible to JIT'd code. + { + MangleAndInterner Mangle(J->getExecutionSession(), J->getDataLayout()); + J->getMainJITDylib().addGenerator( + ExitOnErr(orc::DynamicLibrarySearchGenerator::GetForCurrentProcess( + J->getDataLayout().getGlobalPrefix(), + [MainName = Mangle("main")](const orc::SymbolStringPtr &Name) { + return Name != MainName; + }))); + } + + // Load the input modules. + for (auto &InputFile : InputFiles) { + auto Ctx = std::make_unique(); + SMDiagnostic Err; + std::unique_ptr M = parseIRFile(InputFile, Err, *Ctx); + if (!M) { + Err.print(argv[0], errs()); + return 1; + } + + ExitOnErr(J->addIRModule(ThreadSafeModule(std::move(M), std::move(Ctx)))); + } + + // Look up the entry point, cast it to a C main function pointer, then use + // runAsMain to call it. 
+ auto EntrySym = ExitOnErr(J->lookup(EntryPointName)); + auto EntryFn = + jitTargetAddressToFunction(EntrySym.getAddress()); + + return runAsMain(EntryFn, InputArgv, StringRef(InputFiles.front())); +} diff --git a/llvm/include/llvm/ADT/CoalescingBitVector.h b/llvm/include/llvm/ADT/CoalescingBitVector.h index 6fc81b3126450..8ef6f4fac0a1b 100644 --- a/llvm/include/llvm/ADT/CoalescingBitVector.h +++ b/llvm/include/llvm/ADT/CoalescingBitVector.h @@ -21,7 +21,6 @@ #include #include -#include namespace llvm { @@ -34,8 +33,7 @@ namespace llvm { /// performance for non-sequential find() operations. /// /// \tparam IndexT - The type of the index into the bitvector. -/// \tparam N - The first N coalesced intervals of set bits are stored in-place -/// (in the initial heap allocation). +/// \tparam N - The first N coalesced intervals of set bits are stored in-place. template class CoalescingBitVector { static_assert(std::is_unsigned::value, "Index must be an unsigned integer."); @@ -55,13 +53,13 @@ template class CoalescingBitVector { /// Construct by passing in a CoalescingBitVector::Allocator /// reference. CoalescingBitVector(Allocator &Alloc) - : Alloc(&Alloc), Intervals(std::make_unique(Alloc)) {} + : Alloc(&Alloc), Intervals(Alloc) {} /// \name Copy/move constructors and assignment operators. /// @{ CoalescingBitVector(const ThisT &Other) - : Alloc(Other.Alloc), Intervals(std::make_unique(*Other.Alloc)) { + : Alloc(Other.Alloc), Intervals(*Other.Alloc) { set(Other); } @@ -71,27 +69,21 @@ template class CoalescingBitVector { return *this; } - CoalescingBitVector(ThisT &&Other) - : Alloc(Other.Alloc), Intervals(std::move(Other.Intervals)) {} - - ThisT &operator=(ThisT &&Other) { - Alloc = Other.Alloc; - Intervals = std::move(Other.Intervals); - return *this; - } + CoalescingBitVector(ThisT &&Other) = delete; + ThisT &operator=(ThisT &&Other) = delete; /// @} /// Clear all the bits. - void clear() { Intervals->clear(); } + void clear() { Intervals.clear(); } /// Check whether no bits are set. - bool empty() const { return Intervals->empty(); } + bool empty() const { return Intervals.empty(); } /// Count the number of set bits. unsigned count() const { unsigned Bits = 0; - for (auto It = Intervals->begin(), End = Intervals->end(); It != End; ++It) + for (auto It = Intervals.begin(), End = Intervals.end(); It != End; ++It) Bits += 1 + It.stop() - It.start(); return Bits; } @@ -112,7 +104,7 @@ template class CoalescingBitVector { /// This method does /not/ support setting already-set bits, see \ref set /// for the rationale. For a safe set union operation, use \ref operator|=. void set(const ThisT &Other) { - for (auto It = Other.Intervals->begin(), End = Other.Intervals->end(); + for (auto It = Other.Intervals.begin(), End = Other.Intervals.end(); It != End; ++It) insert(It.start(), It.stop()); } @@ -125,8 +117,8 @@ template class CoalescingBitVector { /// Check whether the bit at \p Index is set. bool test(IndexT Index) const { - const auto It = Intervals->find(Index); - if (It == Intervals->end()) + const auto It = Intervals.find(Index); + if (It == Intervals.end()) return false; assert(It.stop() >= Index && "Interval must end after Index"); return It.start() <= Index; @@ -140,8 +132,8 @@ template class CoalescingBitVector { /// Reset the bit at \p Index. Supports resetting an already-unset bit. 
void reset(IndexT Index) { - auto It = Intervals->find(Index); - if (It == Intervals->end()) + auto It = Intervals.find(Index); + if (It == Intervals.end()) return; // Split the interval containing Index into up to two parts: one from @@ -169,7 +161,7 @@ template class CoalescingBitVector { getOverlaps(RHS, Overlaps); // Insert the non-overlapping parts of all the intervals from RHS. - for (auto It = RHS.Intervals->begin(), End = RHS.Intervals->end(); + for (auto It = RHS.Intervals.begin(), End = RHS.Intervals.end(); It != End; ++It) { IndexT Start = It.start(); IndexT Stop = It.stop(); @@ -205,7 +197,7 @@ template class CoalescingBitVector { IndexT OlapStart, OlapStop; std::tie(OlapStart, OlapStop) = Overlap; - auto It = Intervals->find(OlapStart); + auto It = Intervals.find(OlapStart); IndexT CurrStart = It.start(); IndexT CurrStop = It.stop(); assert(CurrStart <= OlapStart && OlapStop <= CurrStop && @@ -227,14 +219,14 @@ template class CoalescingBitVector { // We cannot just use std::equal because it checks the dereferenced values // of an iterator pair for equality, not the iterators themselves. In our // case that results in comparison of the (unused) IntervalMap values. - auto ItL = Intervals->begin(); - auto ItR = RHS.Intervals->begin(); - while (ItL != Intervals->end() && ItR != RHS.Intervals->end() && + auto ItL = Intervals.begin(); + auto ItR = RHS.Intervals.begin(); + while (ItL != Intervals.end() && ItR != RHS.Intervals.end() && ItL.start() == ItR.start() && ItL.stop() == ItR.stop()) { ++ItL; ++ItR; } - return ItL == Intervals->end() && ItR == RHS.Intervals->end(); + return ItL == Intervals.end() && ItR == RHS.Intervals.end(); } bool operator!=(const ThisT &RHS) const { return !operator==(RHS); } @@ -274,9 +266,9 @@ template class CoalescingBitVector { } /// Advance the iterator to \p Index, if it is contained within the current - /// interval. + /// interval. The public-facing method which supports advancing past the + /// current interval is \ref advanceToLowerBound. void advanceTo(IndexT Index) { - assert(OffsetIntoMapIterator == 0 && "Not implemented"); assert(Index <= CachedStop && "Cannot advance to OOB index"); if (Index < CachedStart) // We're already past this index. @@ -322,17 +314,38 @@ template class CoalescingBitVector { operator++(); return tmp; } + + /// Advance the iterator to the first set bit AT, OR AFTER, \p Index. If + /// no such set bit exists, advance to end(). This is like std::lower_bound. + /// This is useful if \p Index is close to the current iterator position. + /// However, unlike \ref find(), this has worst-case O(n) performance. + void advanceToLowerBound(IndexT Index) { + if (OffsetIntoMapIterator == kIteratorAtTheEndOffset) + return; + + // Advance to the first interval containing (or past) Index, or to end(). + while (Index > CachedStop) { + ++MapIterator; + resetCache(); + if (OffsetIntoMapIterator == kIteratorAtTheEndOffset) + return; + } + + advanceTo(Index); + } }; - const_iterator begin() const { return const_iterator(Intervals->begin()); } + const_iterator begin() const { return const_iterator(Intervals.begin()); } const_iterator end() const { return const_iterator(); } /// Return an iterator pointing to the first set bit AT, OR AFTER, \p Index. /// If no such set bit exists, return end(). This is like std::lower_bound. + /// This has worst-case logarithmic performance (roughly O(log(gaps between + /// contiguous ranges))). 
const_iterator find(IndexT Index) const { - auto UnderlyingIt = Intervals->find(Index); - if (UnderlyingIt == Intervals->end()) + auto UnderlyingIt = Intervals.find(Index); + if (UnderlyingIt == Intervals.end()) return end(); auto It = const_iterator(UnderlyingIt); It.advanceTo(Index); @@ -341,7 +354,7 @@ template class CoalescingBitVector { void print(raw_ostream &OS) const { OS << "{"; - for (auto It = Intervals->begin(), End = Intervals->end(); It != End; + for (auto It = Intervals.begin(), End = Intervals.end(); It != End; ++It) { OS << "[" << It.start(); if (It.start() != It.stop()) @@ -362,13 +375,13 @@ template class CoalescingBitVector { #endif private: - void insert(IndexT Start, IndexT End) { Intervals->insert(Start, End, 0); } + void insert(IndexT Start, IndexT End) { Intervals.insert(Start, End, 0); } /// Record the overlaps between \p this and \p Other in \p Overlaps. Return /// true if there is any overlap. bool getOverlaps(const ThisT &Other, SmallVectorImpl &Overlaps) const { - for (IntervalMapOverlaps I(*Intervals, *Other.Intervals); + for (IntervalMapOverlaps I(Intervals, Other.Intervals); I.valid(); ++I) Overlaps.emplace_back(I.start(), I.stop()); assert(std::is_sorted(Overlaps.begin(), Overlaps.end(), @@ -409,7 +422,7 @@ template class CoalescingBitVector { } Allocator *Alloc; - std::unique_ptr Intervals; + MapT Intervals; }; } // namespace llvm diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h index 520ca60010fdf..7c9e9ff8c72a7 100644 --- a/llvm/include/llvm/ADT/Triple.h +++ b/llvm/include/llvm/ADT/Triple.h @@ -705,6 +705,10 @@ class Triple { return getArch() == Triple::nvptx || getArch() == Triple::nvptx64; } + bool isAMDGPU() const { + return getArch() == Triple::r600 || getArch() == Triple::amdgcn; + } + /// Tests whether the target is Thumb (little and big endian). bool isThumb() const { return getArch() == Triple::thumb || getArch() == Triple::thumbeb; diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index ca6892b14ef3b..ce04592bf53ea 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1167,6 +1167,15 @@ class TargetTransformInfo { /// to a stack reload. unsigned getGISelRematGlobalCost() const; + /// \name Vector Predication Information + /// @{ + /// Whether the target supports the %evl parameter of VP intrinsic efficiently in hardware. + /// (see LLVM Language Reference - "Vector Predication Intrinsics") + /// Use of %evl is discouraged when that is not the case. 
+ bool hasActiveVectorLength() const; + + /// @} + /// @} private: @@ -1420,6 +1429,7 @@ class TargetTransformInfo::Concept { ReductionFlags) const = 0; virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; virtual unsigned getGISelRematGlobalCost() const = 0; + virtual bool hasActiveVectorLength() const = 0; virtual int getInstructionLatency(const Instruction *I) = 0; }; @@ -1913,6 +1923,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { return Impl.getGISelRematGlobalCost(); } + bool hasActiveVectorLength() const override { + return Impl.hasActiveVectorLength(); + } + int getInstructionLatency(const Instruction *I) override { return Impl.getInstructionLatency(I); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 2acb88a6a83d1..765d35a05a46c 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -628,6 +628,10 @@ class TargetTransformInfoImplBase { return 1; } + bool hasActiveVectorLength() const { + return false; + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index dc28d2375a78c..577c03bb1dd2a 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -59,6 +59,22 @@ class Value; OptimizationRemarkEmitter *ORE = nullptr, bool UseInstrInfo = true); + /// Determine which bits of V are known to be either zero or one and return + /// them in the KnownZero/KnownOne bit sets. + /// + /// This function is defined on values with integer type, values with pointer + /// type, and vectors of integers. In the case + /// where V is a vector, the known zero and known one values are the + /// same width as the vector element, and the bit is set only if it is true + /// for all of the demanded elements in the vector. + void computeKnownBits(const Value *V, const APInt &DemandedElts, + KnownBits &Known, const DataLayout &DL, + unsigned Depth = 0, AssumptionCache *AC = nullptr, + const Instruction *CxtI = nullptr, + const DominatorTree *DT = nullptr, + OptimizationRemarkEmitter *ORE = nullptr, + bool UseInstrInfo = true); + /// Returns the known bits rather than passing by reference. KnownBits computeKnownBits(const Value *V, const DataLayout &DL, unsigned Depth = 0, AssumptionCache *AC = nullptr, @@ -67,6 +83,15 @@ class Value; OptimizationRemarkEmitter *ORE = nullptr, bool UseInstrInfo = true); + /// Returns the known bits rather than passing by reference. + KnownBits computeKnownBits(const Value *V, const APInt &DemandedElts, + const DataLayout &DL, unsigned Depth = 0, + AssumptionCache *AC = nullptr, + const Instruction *CxtI = nullptr, + const DominatorTree *DT = nullptr, + OptimizationRemarkEmitter *ORE = nullptr, + bool UseInstrInfo = true); + /// Compute known bits from the range metadata. 
/// \p KnownZero the set of bits that are known to be zero /// \p KnownOne the set of bits that are known to be one diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index 383f33c63ede5..6797ed2369d84 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -328,6 +328,34 @@ const Value *getSplatValue(const Value *V); /// not limited by finding a scalar source value to a splatted vector. bool isSplatValue(const Value *V, int Index = -1, unsigned Depth = 0); +/// Scale a shuffle or target shuffle mask, replacing each mask index with the +/// scaled sequential indices for an equivalent mask of narrowed elements. +/// Mask elements that are less than 0 (sentinel values) are repeated in the +/// output mask. +/// +/// Example with Scale = 4: +/// <4 x i32> <3, 2, 0, -1> --> +/// <16 x i8> <12, 13, 14, 15, 8, 9, 10, 11, 0, 1, 2, 3, -1, -1, -1, -1> +/// +/// This is the reverse process of "canWidenShuffleElements", but can always +/// succeed. +template +void scaleShuffleMask(size_t Scale, ArrayRef Mask, + SmallVectorImpl &ScaledMask) { + assert(Scale > 0 && "Unexpected scaling factor"); + + // Fast-path: if no scaling, then it is just a copy. + if (Scale == 1) { + ScaledMask.assign(Mask.begin(), Mask.end()); + return; + } + + ScaledMask.clear(); + for (int MaskElt : Mask) + for (int ScaleElt = 0; ScaleElt != (int)Scale; ++ScaleElt) + ScaledMask.push_back(MaskElt < 0 ? MaskElt : Scale * MaskElt + ScaleElt); +} + /// Compute a map of integer instructions to their minimum legal type /// size. /// @@ -530,7 +558,10 @@ template class InterleaveGroup { bool isReverse() const { return Reverse; } uint32_t getFactor() const { return Factor; } - uint32_t getAlignment() const { return Alignment.value(); } + LLVM_ATTRIBUTE_DEPRECATED(uint32_t getAlignment() const, + "Use getAlign instead.") { + return Alignment.value(); + } Align getAlign() const { return Alignment; } uint32_t getNumMembers() const { return Members.size(); } diff --git a/llvm/include/llvm/CodeGen/CallingConvLower.h b/llvm/include/llvm/CodeGen/CallingConvLower.h index a30ca638ee6da..c4ecd843bd0be 100644 --- a/llvm/include/llvm/CodeGen/CallingConvLower.h +++ b/llvm/include/llvm/CodeGen/CallingConvLower.h @@ -435,7 +435,7 @@ class CCState { void ensureMaxAlignment(Align Alignment) { if (!AnalyzingMustTailForwardedRegs) - MF.getFrameInfo().ensureMaxAlignment(Alignment.value()); + MF.getFrameInfo().ensureMaxAlignment(Alignment); } /// Version of AllocateStack with extra register to be shadowed. diff --git a/llvm/include/llvm/CodeGen/FastISel.h b/llvm/include/llvm/CodeGen/FastISel.h index d9c680392e50a..326a7eb2e9387 100644 --- a/llvm/include/llvm/CodeGen/FastISel.h +++ b/llvm/include/llvm/CodeGen/FastISel.h @@ -534,6 +534,7 @@ class FastISel { bool selectCall(const User *I); bool selectIntrinsicCall(const IntrinsicInst *II); bool selectBitCast(const User *I); + bool selectFreeze(const User *I); bool selectCast(const User *I, unsigned Opcode); bool selectExtractValue(const User *U); bool selectInsertValue(const User *I); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index d47dddf88f516..f97c22ac420ad 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -194,6 +194,13 @@ class CombinerHelper { /// G_IMPLICIT_DEF. 
bool matchAnyExplicitUseIsUndef(MachineInstr &MI); + /// Return true if all register explicit use operands on \p MI are defined by + /// a G_IMPLICIT_DEF. + bool matchAllExplicitUsesAreUndef(MachineInstr &MI); + + /// Return true if a G_SHUFFLE_VECTOR instruction \p MI has an undef mask. + bool matchUndefShuffleVectorMask(MachineInstr &MI); + /// Replace an instruction with a G_FCONSTANT with value \p C. bool replaceInstWithFConstant(MachineInstr &MI, double C); @@ -203,6 +210,16 @@ class CombinerHelper { /// Replace an instruction with a G_IMPLICIT_DEF. bool replaceInstWithUndef(MachineInstr &MI); + /// Delete \p MI and replace all of its uses with its \p OpIdx-th operand. + bool replaceSingleDefInstWithOperand(MachineInstr &MI, unsigned OpIdx); + + /// Return true if \p MOP1 and \p MOP2 are register operands are defined by + /// equivalent instructions. + bool matchEqualDefs(const MachineOperand &MOP1, const MachineOperand &MOP2); + + /// Optimize (cond ? x : x) -> x + bool matchSelectSameVal(MachineInstr &MI); + /// Try to transform \p MI by using all of the above /// combine functions. Returns true if changed. bool tryCombine(MachineInstr &MI); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h b/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h index 906e8a9ac3121..976d42d588462 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GISelKnownBits.h @@ -50,6 +50,9 @@ class GISelKnownBits : public GISelChangeObserver { // KnownBitsAPI KnownBits getKnownBits(Register R); + KnownBits getKnownBits(Register R, const APInt &DemandedElts, + unsigned Depth = 0); + // Calls getKnownBits for first operand def of MI. KnownBits getKnownBits(MachineInstr &MI); APInt getKnownZeroes(Register R); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h index 08d12a57adff0..60c0a71762e51 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h @@ -74,6 +74,9 @@ class LegalizerHelper { /// precision, ignoring the unused bits). LegalizeResult widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy); + /// Legalize an instruction by replacing the value type + LegalizeResult bitcast(MachineInstr &MI, unsigned TypeIdx, LLT Ty); + /// Legalize an instruction by splitting it into simpler parts, hopefully /// understood by the target. LegalizeResult lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty); @@ -128,6 +131,14 @@ class LegalizerHelper { /// original vector type, and replacing the vreg of the operand in place. 
void moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy, unsigned OpIdx); + /// Legalize a single operand \p OpIdx of the machine instruction \p MI as a + /// use by inserting a G_BITCAST to \p CastTy + void bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx); + + /// Legalize a single operand \p OpIdx of the machine instruction \p MI as a + /// def by inserting a G_BITCAST from \p CastTy + void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx); + private: LegalizeResult widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, LLT WideTy); diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index e2ed44e283d88..624fa70f1aa69 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -68,6 +68,9 @@ enum LegalizeAction : std::uint8_t { /// the first two results. MoreElements, + /// Perform the operation on a different, but equivalently sized type. + Bitcast, + /// The operation itself must be expressed in terms of simpler actions on /// this target. E.g. a SREM replaced by an SDIV and subtraction. Lower, diff --git a/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h b/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h index 8a8d3ce200409..f528d1a460128 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/RegisterBank.h @@ -29,23 +29,18 @@ class RegisterBank { private: unsigned ID; const char *Name; - const unsigned *Sizes; + unsigned Size; BitVector ContainedRegClasses; - /// HwMode of the target. Not initialized by the constructor, initialized - /// within generated RegisterBankInfo class constructor. - unsigned HwMode; - - /// Sentinel values used to recognize register bank not properly + /// Sentinel value used to recognize register bank not properly /// initialized yet. static const unsigned InvalidID; - static const unsigned InvalidHwMode; /// Only the RegisterBankInfo can initialize RegisterBank properly. friend RegisterBankInfo; public: - RegisterBank(unsigned ID, const char *Name, const unsigned *Sizes, + RegisterBank(unsigned ID, const char *Name, unsigned Size, const uint32_t *CoveredClasses, unsigned NumRegClasses); /// Get the identifier of this register bank. @@ -56,7 +51,7 @@ class RegisterBank { const char *getName() const { return Name; } /// Get the maximal size in bits that fits in this register bank. - unsigned getSize() const { return Sizes[HwMode]; } + unsigned getSize() const { return Size; } /// Check whether this instance is ready to be used. bool isValid() const; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h index b86d2d10322f0..8725d96efd821 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/RegisterBankInfo.h @@ -415,8 +415,7 @@ class RegisterBankInfo { /// Create a RegisterBankInfo that can accommodate up to \p NumRegBanks /// RegisterBank instances. - RegisterBankInfo(RegisterBank **RegBanks, unsigned NumRegBanks, - unsigned HwMode); + RegisterBankInfo(RegisterBank **RegBanks, unsigned NumRegBanks); /// This constructor is meaningless. 
/// It just provides a default constructor that can be used at link time diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 4bc8d7bd42f0e..17e8f53c23008 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -178,6 +178,11 @@ namespace ISD { /// UNDEF - An undefined node. UNDEF, + // FREEZE - FREEZE(VAL) returns an arbitrary value if VAL is UNDEF (or + // is evaluated to UNDEF), or returns VAL otherwise. Note that each + // read of UNDEF can yield different value, but FREEZE(UNDEF) cannot. + FREEZE, + /// EXTRACT_ELEMENT - This is used to get the lower or upper (determined by /// a Constant, which is required to be operand #1) half of the integer or /// float value specified as operand #0. This is only for use before diff --git a/llvm/include/llvm/CodeGen/MachineFrameInfo.h b/llvm/include/llvm/CodeGen/MachineFrameInfo.h index 384b206fc2202..03811bc5145c6 100644 --- a/llvm/include/llvm/CodeGen/MachineFrameInfo.h +++ b/llvm/include/llvm/CodeGen/MachineFrameInfo.h @@ -484,7 +484,7 @@ class MachineFrameInfo { // Only ensure max alignment for the default stack. if (getStackID(ObjectIdx) == 0) - ensureMaxAlignment(Align); + ensureMaxAlignment(assumeAligned(Align)); } /// setObjectAlignment - Change the alignment of the specified stack object. @@ -594,7 +594,8 @@ class MachineFrameInfo { /// Make sure the function is at least Align bytes aligned. void ensureMaxAlignment(Align Alignment); /// FIXME: Remove this once transition to Align is over. - inline void ensureMaxAlignment(unsigned Align) { + LLVM_ATTRIBUTE_DEPRECATED(inline void ensureMaxAlignment(unsigned Align), + "Use the version that uses Align instead") { ensureMaxAlignment(assumeAligned(Align)); } diff --git a/llvm/include/llvm/CodeGen/MachineLoopInfo.h b/llvm/include/llvm/CodeGen/MachineLoopInfo.h index 7d2273827b3f5..8a93f91ae54de 100644 --- a/llvm/include/llvm/CodeGen/MachineLoopInfo.h +++ b/llvm/include/llvm/CodeGen/MachineLoopInfo.h @@ -67,10 +67,6 @@ class MachineLoop : public LoopBase { /// it returns an unknown location. DebugLoc getStartLoc() const; - /// Returns true if a machine loop has blocks that have static profiling - /// information---e.g. from '__builtin_expect()'. - bool hasStaticProfInfo() const; - void dump() const; private: diff --git a/llvm/include/llvm/CodeGen/SelectionDAGISel.h b/llvm/include/llvm/CodeGen/SelectionDAGISel.h index 52c3e0c6148af..084badcbe0291 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGISel.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGISel.h @@ -324,6 +324,8 @@ class SelectionDAGISel : public MachineFunctionPass { void Select_UNDEF(SDNode *N); void CannotYetSelect(SDNode *N); + void Select_FREEZE(SDNode *N); + private: void DoInstructionSelection(); SDNode *MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList, diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 0552420c3c336..fefa8daa60a16 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3332,6 +3332,16 @@ class TargetLowering : public TargetLoweringBase { const SelectionDAG &DAG, unsigned Depth = 0) const; + /// This method can be implemented by targets that want to expose additional + /// information about sign bits to GlobalISel combiners. The DemandedElts + /// argument allows us to only collect the minimum sign bits that are shared + /// by the requested vector elements. 
+ virtual unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis, + Register R, + const APInt &DemandedElts, + const MachineRegisterInfo &MRI, + unsigned Depth = 0) const; + /// Attempt to simplify any target nodes based on the demanded vector /// elements, returning true on success. Otherwise, analyze the expression and /// return a mask of KnownUndef and KnownZero elements for the expression diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 658e3263559ac..07aaddd47f44f 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -285,6 +285,12 @@ class TargetRegisterInfo : public MCRegisterInfo { return getRegClassInfo(RC).SpillAlignment / 8; } + /// Return the minimum required alignment in bytes for a spill slot for + /// a register of this class. + Align getSpillAlign(const TargetRegisterClass &RC) const { + return Align(getRegClassInfo(RC).SpillAlignment / 8); + } + /// Return true if the given TargetRegisterClass has the ValueType T. bool isTypeLegalForClass(const TargetRegisterClass &RC, MVT T) const { for (auto I = legalclasstypes_begin(RC); *I != MVT::Other; ++I) diff --git a/llvm/include/llvm/CodeGen/ValueTypes.h b/llvm/include/llvm/CodeGen/ValueTypes.h index bcf4177629200..15a4bfe1e5553 100644 --- a/llvm/include/llvm/CodeGen/ValueTypes.h +++ b/llvm/include/llvm/CodeGen/ValueTypes.h @@ -19,6 +19,7 @@ #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/TypeSize.h" +#include "llvm/Support/WithColor.h" #include #include #include @@ -75,9 +76,7 @@ namespace llvm { MVT M = MVT::getVectorVT(VT.V, NumElements, IsScalable); if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE) return M; - - assert(!IsScalable && "We don't support extended scalable types yet"); - return getExtendedVectorVT(Context, VT, NumElements); + return getExtendedVectorVT(Context, VT, NumElements, IsScalable); } /// Returns the EVT that represents a vector EC.Min elements in length, @@ -86,19 +85,15 @@ namespace llvm { MVT M = MVT::getVectorVT(VT.V, EC); if (M.SimpleTy != MVT::INVALID_SIMPLE_VALUE_TYPE) return M; - assert (!EC.Scalable && "We don't support extended scalable types yet"); - return getExtendedVectorVT(Context, VT, EC.Min); + return getExtendedVectorVT(Context, VT, EC); } /// Return a vector with the same number of elements as this vector, but /// with the element type converted to an integer type with the same /// bitwidth. EVT changeVectorElementTypeToInteger() const { - if (!isSimple()) { - assert (!isScalableVector() && - "We don't support extended scalable types yet"); + if (!isSimple()) return changeExtendedVectorElementTypeToInteger(); - } MVT EltTy = getSimpleVT().getVectorElementType(); unsigned BitWidth = EltTy.getSizeInBits(); MVT IntTy = MVT::getIntegerVT(BitWidth); @@ -155,12 +150,12 @@ namespace llvm { /// Return true if this is a vector type where the runtime /// length is machine dependent bool isScalableVector() const { - // FIXME: We don't support extended scalable types yet, because the - // matching IR type doesn't exist. Once it has been added, this can - // be changed to call isExtendedScalableVector. - if (!isSimple()) - return false; - return V.isScalableVector(); + return isSimple() ? V.isScalableVector() : isExtendedScalableVector(); + } + + bool isFixedLengthVector() const { + return isSimple() ? 
V.isFixedLengthVector() + : isExtendedFixedLengthVector(); } /// Return true if this is a 16-bit vector type. @@ -273,7 +268,16 @@ namespace llvm { /// Given a vector type, return the number of elements it contains. unsigned getVectorNumElements() const { +#ifdef STRICT_FIXED_SIZE_VECTORS + assert(isFixedLengthVector() && "Invalid vector type!"); +#else assert(isVector() && "Invalid vector type!"); + if (isScalableVector()) + WithColor::warning() + << "Possible incorrect use of EVT::getVectorNumElements() for " + "scalable vector. Scalable flag may be dropped, use" + "EVT::getVectorElementCount() instead\n"; +#endif if (isSimple()) return V.getVectorNumElements(); return getExtendedVectorNumElements(); @@ -285,9 +289,7 @@ namespace llvm { if (isSimple()) return V.getVectorElementCount(); - assert(!isScalableVector() && - "We don't support extended scalable types yet"); - return {getExtendedVectorNumElements(), false}; + return {getExtendedVectorNumElements(), isExtendedScalableVector()}; } /// Return the size of the specified value type in bits. @@ -428,8 +430,10 @@ namespace llvm { EVT changeExtendedTypeToInteger() const; EVT changeExtendedVectorElementTypeToInteger() const; static EVT getExtendedIntegerVT(LLVMContext &C, unsigned BitWidth); - static EVT getExtendedVectorVT(LLVMContext &C, EVT VT, - unsigned NumElements); + static EVT getExtendedVectorVT(LLVMContext &C, EVT VT, unsigned NumElements, + bool IsScalable); + static EVT getExtendedVectorVT(LLVMContext &Context, EVT VT, + ElementCount EC); bool isExtendedFloatingPoint() const LLVM_READONLY; bool isExtendedInteger() const LLVM_READONLY; bool isExtendedScalarInteger() const LLVM_READONLY; @@ -442,8 +446,11 @@ namespace llvm { bool isExtended512BitVector() const LLVM_READONLY; bool isExtended1024BitVector() const LLVM_READONLY; bool isExtended2048BitVector() const LLVM_READONLY; + bool isExtendedFixedLengthVector() const LLVM_READONLY; + bool isExtendedScalableVector() const LLVM_READONLY; EVT getExtendedVectorElementType() const; unsigned getExtendedVectorNumElements() const LLVM_READONLY; + ElementCount getExtendedVectorElementCount() const LLVM_READONLY; TypeSize getExtendedSizeInBits() const LLVM_READONLY; }; diff --git a/llvm/include/llvm/DWARFLinker/DWARFLinker.h b/llvm/include/llvm/DWARFLinker/DWARFLinker.h index 5b0108606ac9f..d75ed8b98949b 100644 --- a/llvm/include/llvm/DWARFLinker/DWARFLinker.h +++ b/llvm/include/llvm/DWARFLinker/DWARFLinker.h @@ -226,6 +226,7 @@ typedef std::function(StringRef ContainerName, StringRef Path)> objFileLoader; typedef std::map swiftInterfacesMap; +typedef std::map objectPrefixMap; /// The core of the Dwarf linking logic. /// @@ -311,6 +312,11 @@ class DWARFLinker { Options.ParseableSwiftInterfaces = Map; } + /// Set prefix map for objects. + void setObjectPrefixMap(objectPrefixMap *Map) { + Options.ObjectPrefixMap = Map; + } + private: /// Flags passed to DwarfLinker::lookForDIEsToKeep enum TraversalFlags { @@ -576,6 +582,9 @@ class DWARFLinker { /// Value of DW_AT_call_return_pc in the input DIE uint64_t OrigCallReturnPc = 0; + /// Value of DW_AT_call_pc in the input DIE + uint64_t OrigCallPc = 0; + /// Offset to apply to PC addresses inside a function. int64_t PCOffset = 0; @@ -780,6 +789,9 @@ class DWARFLinker { /// per compile unit, which is why this is a std::map. /// this is dsymutil specific fag. swiftInterfacesMap *ParseableSwiftInterfaces = nullptr; + + /// A list of remappings to apply to file paths. 
+ objectPrefixMap *ObjectPrefixMap = nullptr; } Options; }; diff --git a/llvm/include/llvm/DebugInfo/DIContext.h b/llvm/include/llvm/DebugInfo/DIContext.h index 0bad415f6bfd2..29e62f24b49a2 100644 --- a/llvm/include/llvm/DebugInfo/DIContext.h +++ b/llvm/include/llvm/DebugInfo/DIContext.h @@ -136,7 +136,11 @@ enum class DINameKind { None, ShortName, LinkageName }; struct DILineInfoSpecifier { enum class FileLineInfoKind { None, - Default, + // RawValue is whatever the compiler stored in the filename table. Could be + // a full path, could be something else. + RawValue, + BaseNameOnly, + // Relative to the compilation directory. RelativeFilePath, AbsoluteFilePath }; @@ -145,7 +149,7 @@ struct DILineInfoSpecifier { FileLineInfoKind FLIKind; FunctionNameKind FNKind; - DILineInfoSpecifier(FileLineInfoKind FLIKind = FileLineInfoKind::Default, + DILineInfoSpecifier(FileLineInfoKind FLIKind = FileLineInfoKind::RawValue, FunctionNameKind FNKind = FunctionNameKind::None) : FLIKind(FLIKind), FNKind(FNKind) {} }; diff --git a/llvm/include/llvm/DebugInfo/Symbolize/DIPrinter.h b/llvm/include/llvm/DebugInfo/Symbolize/DIPrinter.h index db7a61a8f1605..8d4c64915ffd5 100644 --- a/llvm/include/llvm/DebugInfo/Symbolize/DIPrinter.h +++ b/llvm/include/llvm/DebugInfo/Symbolize/DIPrinter.h @@ -14,6 +14,7 @@ #ifndef LLVM_DEBUGINFO_SYMBOLIZE_DIPRINTER_H #define LLVM_DEBUGINFO_SYMBOLIZE_DIPRINTER_H +#include "llvm/DebugInfo/DIContext.h" #include "llvm/Support/raw_ostream.h" namespace llvm { @@ -34,7 +35,6 @@ class DIPrinter { bool PrintPretty; int PrintSourceContext; bool Verbose; - bool Basenames; OutputStyle Style; void print(const DILineInfo &Info, bool Inlined); @@ -43,11 +43,10 @@ class DIPrinter { public: DIPrinter(raw_ostream &OS, bool PrintFunctionNames = true, bool PrintPretty = false, int PrintSourceContext = 0, - bool Verbose = false, bool Basenames = false, - OutputStyle Style = OutputStyle::LLVM) + bool Verbose = false, OutputStyle Style = OutputStyle::LLVM) : OS(OS), PrintFunctionNames(PrintFunctionNames), PrintPretty(PrintPretty), PrintSourceContext(PrintSourceContext), - Verbose(Verbose), Basenames(Basenames), Style(Style) {} + Verbose(Verbose), Style(Style) {} DIPrinter &operator<<(const DILineInfo &Info); DIPrinter &operator<<(const DIInliningInfo &Info); @@ -58,4 +57,3 @@ class DIPrinter { } #endif - diff --git a/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableModule.h b/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableModule.h index 506ecc424b4c9..51e92b83eadba 100644 --- a/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableModule.h +++ b/llvm/include/llvm/DebugInfo/Symbolize/SymbolizableModule.h @@ -25,11 +25,12 @@ class SymbolizableModule { virtual ~SymbolizableModule() = default; virtual DILineInfo symbolizeCode(object::SectionedAddress ModuleOffset, - FunctionNameKind FNKind, + DILineInfoSpecifier LineInfoSpecifier, bool UseSymbolTable) const = 0; virtual DIInliningInfo symbolizeInlinedCode(object::SectionedAddress ModuleOffset, - FunctionNameKind FNKind, bool UseSymbolTable) const = 0; + DILineInfoSpecifier LineInfoSpecifier, + bool UseSymbolTable) const = 0; virtual DIGlobal symbolizeData(object::SectionedAddress ModuleOffset) const = 0; virtual std::vector diff --git a/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h b/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h index 632540c79b0df..b0972ed115819 100644 --- a/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h +++ b/llvm/include/llvm/DebugInfo/Symbolize/Symbolize.h @@ -32,11 +32,13 @@ namespace symbolize { using namespace object; 
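Aside: DIPrinter no longer carries a Basenames flag, so callers express the path style through DILineInfoSpecifier and the new FileLineInfoKind values instead. A small sketch; the helper name is illustrative:

#include "llvm/DebugInfo/DIContext.h"

using namespace llvm;

// Mirror the old "basenames only" switch with the new path-style enum.
static DILineInfoSpecifier makeSpecifier(bool BaseNamesOnly) {
  using FLIK = DILineInfoSpecifier::FileLineInfoKind;
  return DILineInfoSpecifier(BaseNamesOnly ? FLIK::BaseNameOnly
                                           : FLIK::AbsoluteFilePath,
                             DINameKind::LinkageName);
}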
using FunctionNameKind = DILineInfoSpecifier::FunctionNameKind; +using FileLineInfoKind = DILineInfoSpecifier::FileLineInfoKind; class LLVMSymbolizer { public: struct Options { FunctionNameKind PrintFunctions = FunctionNameKind::LinkageName; + FileLineInfoKind PathStyle = FileLineInfoKind::AbsoluteFilePath; bool UseSymbolTable = true; bool Demangle = true; bool RelativeAddresses = false; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Core.h b/llvm/include/llvm/ExecutionEngine/Orc/Core.h index 14621768db809..f56e819114652 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Core.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Core.h @@ -24,8 +24,6 @@ #include #include -#define DEBUG_TYPE "orc" - namespace llvm { namespace orc { @@ -309,66 +307,6 @@ struct SymbolAliasMapEntry { /// A map of Symbols to (Symbol, Flags) pairs. using SymbolAliasMap = DenseMap; -/// Render a SymbolStringPtr. -raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym); - -/// Render a SymbolNameSet. -raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols); - -/// Render a SymbolNameVector. -raw_ostream &operator<<(raw_ostream &OS, const SymbolNameVector &Symbols); - -/// Render a SymbolFlagsMap entry. -raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap::value_type &KV); - -/// Render a SymbolMap entry. -raw_ostream &operator<<(raw_ostream &OS, const SymbolMap::value_type &KV); - -/// Render a SymbolFlagsMap. -raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap &SymbolFlags); - -/// Render a SymbolMap. -raw_ostream &operator<<(raw_ostream &OS, const SymbolMap &Symbols); - -/// Render a SymbolDependenceMap entry. -raw_ostream &operator<<(raw_ostream &OS, - const SymbolDependenceMap::value_type &KV); - -/// Render a SymbolDependendeMap. -raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps); - -/// Render a MaterializationUnit. -raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU); - -//// Render a JITDylibLookupFlags instance. -raw_ostream &operator<<(raw_ostream &OS, - const JITDylibLookupFlags &JDLookupFlags); - -/// Rendar a SymbolLookupFlags instance. -raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LookupFlags); - -/// Render a JITDylibLookupFlags instance. -raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K); - -/// Render a SymbolLookupSet entry. -raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet::value_type &KV); - -/// Render a SymbolLookupSet. -raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet &LookupSet); - -/// Render a JITDylibSearchOrder. -raw_ostream &operator<<(raw_ostream &OS, - const JITDylibSearchOrder &SearchOrder); - -/// Render a SymbolAliasMap. -raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases); - -/// Render a SymbolState. -raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S); - -/// Render a LookupKind. -raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K); - /// Callback to notify client that symbols have been resolved. using SymbolsResolvedCallback = unique_function)>; @@ -1301,11 +1239,8 @@ class ExecutionSession { /// Materialize the given unit. 
void dispatchMaterialization(JITDylib &JD, std::unique_ptr MU) {
- LLVM_DEBUG({
- runSessionLocked([&]() {
- dbgs() << "Dispatching " << *MU << " for " << JD.getName() << "\n";
- });
- });
+ assert(MU && "MU must be non-null");
+ DEBUG_WITH_TYPE("orc", dumpDispatchInfo(JD, *MU));
DispatchMaterialization(JD, std::move(MU));
}
@@ -1325,6 +1260,10 @@ class ExecutionSession {
void runOutstandingMUs();
+#ifndef NDEBUG
+ void dumpDispatchInfo(JITDylib &JD, MaterializationUnit &MU);
+#endif // NDEBUG
+
mutable std::recursive_mutex SessionMutex;
std::shared_ptr SSP;
std::unique_ptr P;
@@ -1425,6 +1364,4 @@ class ReexportsGenerator : public JITDylib::DefinitionGenerator {
} // End namespace orc
} // End namespace llvm
-#undef DEBUG_TYPE // "orc"
-
#endif // LLVM_EXECUTIONENGINE_ORC_CORE_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h
index b2ef29d65ffee..ac6cc30c948a0 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/DebugUtils.h
@@ -13,7 +13,11 @@
#ifndef LLVM_EXECUTIONENGINE_ORC_DEBUGUTILS_H
#define LLVM_EXECUTIONENGINE_ORC_DEBUGUTILS_H
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/ExecutionEngine/Orc/SymbolStringPool.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
#include
#include
@@ -23,6 +27,71 @@ class MemoryBuffer;
namespace orc {
+// --raw_ostream operators for ORC types--
+
+/// Render a SymbolStringPtr.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym);
+
+/// Render a SymbolNameSet.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols);
+
+/// Render a SymbolNameVector.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolNameVector &Symbols);
+
+/// Render JITSymbolFlags.
+raw_ostream &operator<<(raw_ostream &OS, const JITSymbolFlags &Flags);
+
+/// Render a SymbolFlagsMap entry.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap::value_type &KV);
+
+/// Render a SymbolMap entry.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolMap::value_type &KV);
+
+/// Render a SymbolFlagsMap.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap &SymbolFlags);
+
+/// Render a SymbolMap.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolMap &Symbols);
+
+/// Render a SymbolDependenceMap entry.
+raw_ostream &operator<<(raw_ostream &OS,
+ const SymbolDependenceMap::value_type &KV);
+
+/// Render a SymbolDependenceMap.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps);
+
+/// Render a MaterializationUnit.
+raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU);
+
+/// Render a JITDylibLookupFlags instance.
+raw_ostream &operator<<(raw_ostream &OS,
+ const JITDylibLookupFlags &JDLookupFlags);
+
+/// Render a SymbolLookupFlags instance.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LookupFlags);
+
+/// Render a LookupKind instance.
+raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K);
+
+/// Render a SymbolLookupSet entry.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet::value_type &KV);
+
+/// Render a SymbolLookupSet.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet &LookupSet);
+
+/// Render a JITDylibSearchOrder.
+raw_ostream &operator<<(raw_ostream &OS,
+ const JITDylibSearchOrder &SearchOrder);
+
+/// Render a SymbolAliasMap.
+raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases); + +/// Render a SymbolState. +raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S); + +/// Render a LookupKind. +raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K); + /// A function object that can be used as an ObjectTransformLayer transform /// to dump object files to disk at a specified path. class DumpObjects { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h index f869ebdfbe4e7..15fe079eccafd 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h @@ -143,12 +143,15 @@ class MachOPlatform : public Platform { MachOJITDylibInitializers::SectionExtent ObjCSelRefs, MachOJITDylibInitializers::SectionExtent ObjCClassList); - std::mutex PlatformMutex; ExecutionSession &ES; ObjectLinkingLayer &ObjLinkingLayer; std::unique_ptr StandardSymbolsObject; DenseMap RegisteredInitSymbols; + + // InitSeqs gets its own mutex to avoid locking the whole session when + // aggregating data from the jitlink. + std::mutex InitSeqsMutex; DenseMap InitSeqs; }; diff --git a/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h index 435c882d506dd..3c4ba1a87959d 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h @@ -16,6 +16,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/Orc/Core.h" #include "llvm/ExecutionEngine/Orc/Layer.h" @@ -115,15 +116,23 @@ class RTDyldObjectLinkingLayer : public ObjectLayer { return *this; } + /// Register a JITEventListener. + void registerJITEventListener(JITEventListener &L); + + /// Unregister a JITEventListener. 
+ void unregisterJITEventListener(JITEventListener &L); + private: Error onObjLoad(VModuleKey K, MaterializationResponsibility &R, - object::ObjectFile &Obj, + const object::ObjectFile &Obj, + RuntimeDyld::MemoryManager *MemMgr, std::unique_ptr LoadedObjInfo, std::map Resolved, std::set &InternalSymbols); - void onObjEmit(VModuleKey K, std::unique_ptr ObjBuffer, - MaterializationResponsibility &R, Error Err); + void onObjEmit(VModuleKey K, MaterializationResponsibility &R, + object::OwningBinary O, + RuntimeDyld::MemoryManager *MemMgr, Error Err); mutable std::mutex RTDyldLayerMutex; GetMemoryManagerFunction GetMemoryManager; @@ -133,6 +142,10 @@ class RTDyldObjectLinkingLayer : public ObjectLayer { bool OverrideObjectFlags = false; bool AutoClaimObjectSymbols = false; std::vector> MemMgrs; + std::vector EventListeners; + DenseMap> + LoadedObjInfos; }; class LegacyRTDyldObjectLinkingLayerBase { diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h index 8c89fb4fe3423..c6bfd6db7353c 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/Speculation.h @@ -17,6 +17,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" #include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" #include "llvm/IR/PassManager.h" #include "llvm/Passes/PassBuilder.h" diff --git a/llvm/include/llvm/ExecutionEngine/RuntimeDyld.h b/llvm/include/llvm/ExecutionEngine/RuntimeDyld.h index ce7024a7f19b7..1b3ce1127e4a2 100644 --- a/llvm/include/llvm/ExecutionEngine/RuntimeDyld.h +++ b/llvm/include/llvm/ExecutionEngine/RuntimeDyld.h @@ -267,15 +267,16 @@ class RuntimeDyld { void finalizeWithMemoryManagerLocking(); private: - friend void - jitLinkForORC(object::ObjectFile &Obj, - std::unique_ptr UnderlyingBuffer, - RuntimeDyld::MemoryManager &MemMgr, JITSymbolResolver &Resolver, - bool ProcessAllSections, - unique_function, - std::map)> - OnLoaded, - unique_function OnEmitted); + friend void jitLinkForORC( + object::OwningBinary O, + RuntimeDyld::MemoryManager &MemMgr, JITSymbolResolver &Resolver, + bool ProcessAllSections, + unique_function, + std::map)> + OnLoaded, + unique_function O, Error)> + OnEmitted); // RuntimeDyldImpl is the actual class. RuntimeDyld is just the public // interface. @@ -293,13 +294,15 @@ class RuntimeDyld { // instance and uses continuation passing to perform the fix-up and finalize // steps asynchronously. void jitLinkForORC( - object::ObjectFile &Obj, std::unique_ptr UnderlyingBuffer, + object::OwningBinary O, RuntimeDyld::MemoryManager &MemMgr, JITSymbolResolver &Resolver, bool ProcessAllSections, - unique_function, + unique_function, std::map)> OnLoaded, - unique_function OnEmitted); + unique_function, Error)> + OnEmitted); } // end namespace llvm diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPContext.h b/llvm/include/llvm/Frontend/OpenMP/OMPContext.h index 960c557f55d40..0a9d9c277d992 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPContext.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPContext.h @@ -16,6 +16,7 @@ #define LLVM_OPENMP_CONTEXT_H #include "llvm/ADT/APSInt.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Triple.h" @@ -43,6 +44,7 @@ enum class TraitSelector { /// IDs for all OpenMP context trait properties (host/gpu/bsc/llvm/...) enum class TraitProperty { #define OMP_TRAIT_PROPERTY(Enum, ...) 
Enum, +#define OMP_LAST_TRAIT_PROPERTY(Enum) Last = Enum #include "llvm/Frontend/OpenMP/OMPKinds.def" }; @@ -122,12 +124,12 @@ struct VariantMatchInfo { void addTrait(TraitSet Set, TraitProperty Property, APInt *Score = nullptr) { if (Score) ScoreMap[Property] = *Score; - RequiredTraits.insert(Property); + RequiredTraits.set(unsigned(Property)); if (Set == TraitSet::construct) ConstructTraits.push_back(Property); } - SmallSet RequiredTraits; + BitVector RequiredTraits = BitVector(unsigned(TraitProperty::Last) + 1); SmallVector ConstructTraits; SmallDenseMap ScoreMap; }; @@ -142,12 +144,12 @@ struct OMPContext { addTrait(getOpenMPContextTraitSetForProperty(Property), Property); } void addTrait(TraitSet Set, TraitProperty Property) { - ActiveTraits.insert(Property); + ActiveTraits.set(unsigned(Property)); if (Set == TraitSet::construct) ConstructTraits.push_back(Property); } - SmallSet ActiveTraits; + BitVector ActiveTraits = BitVector(unsigned(TraitProperty::Last) + 1); SmallVector ConstructTraits; }; diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def index 20e5b95a827ae..5d26f07a7f5ac 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def +++ b/llvm/include/llvm/Frontend/OpenMP/OMPKinds.def @@ -92,6 +92,7 @@ __OMP_DIRECTIVE_EXT(master_taskloop_simd, "master taskloop simd") __OMP_DIRECTIVE_EXT(parallel_master_taskloop_simd, "parallel master taskloop simd") __OMP_DIRECTIVE(depobj) +__OMP_DIRECTIVE(scan) // Has to be the last because Clang implicitly expects it to be. __OMP_DIRECTIVE(unknown) @@ -208,8 +209,8 @@ __OMP_RTL(omp_get_thread_limit, false, Int32, ) __OMP_RTL(omp_get_supported_active_levels, false, Int32, ) __OMP_RTL(omp_get_max_active_levels, false, Int32, ) __OMP_RTL(omp_get_level, false, Int32, ) -__OMP_RTL(omp_get_ancestor_thread_num, false, Int32, ) -__OMP_RTL(omp_get_team_size, false, Int32, ) +__OMP_RTL(omp_get_ancestor_thread_num, false, Int32, Int32) +__OMP_RTL(omp_get_team_size, false, Int32, Int32) __OMP_RTL(omp_get_active_level, false, Int32, ) __OMP_RTL(omp_in_final, false, Int32, ) __OMP_RTL(omp_get_proc_bind, false, Int32, ) @@ -218,7 +219,7 @@ __OMP_RTL(omp_get_num_procs, false, Int32, ) __OMP_RTL(omp_get_place_proc_ids, false, Void, Int32, Int32Ptr) __OMP_RTL(omp_get_place_num, false, Int32, ) __OMP_RTL(omp_get_partition_num_places, false, Int32, ) -__OMP_RTL(omp_get_partition_place_nums, false, Int32, ) +__OMP_RTL(omp_get_partition_place_nums, false, Void, Int32Ptr) __OMP_RTL(omp_set_num_threads, false, Void, Int32) __OMP_RTL(omp_set_dynamic, false, Void, Int32) @@ -426,6 +427,9 @@ __OMP_PROC_BIND_KIND(unknown, 7) #ifndef OMP_TRAIT_PROPERTY #define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) #endif +#ifndef OMP_LAST_TRAIT_PROPERTY +#define OMP_LAST_TRAIT_PROPERTY(Enum) +#endif #define __OMP_TRAIT_SET(Name) OMP_TRAIT_SET(Name, #Name) #define __OMP_TRAIT_SELECTOR(TraitSet, Name, RequiresProperty) \ @@ -533,10 +537,14 @@ __OMP_REQUIRES_TRAIT(reverse_offload) __OMP_REQUIRES_TRAIT(dynamic_allocators) __OMP_REQUIRES_TRAIT(atomic_default_mem_order) +OMP_LAST_TRAIT_PROPERTY( + implementation_atomic_default_mem_order_atomic_default_mem_order) + #undef __OMP_TRAIT_SELECTOR_AND_PROPERTY #undef OMP_TRAIT_SELECTOR #undef __OMP_TRAIT_SELECTOR #undef OMP_TRAIT_PROPERTY +#undef OMP_LAST_TRAIT_PROPERTY #undef __OMP_TRAIT_PROPERTY #undef __OMP_REQUIRES_TRAIT #undef OMP_REQUIRES_TRAIT diff --git a/llvm/include/llvm/IR/CFGDiff.h b/llvm/include/llvm/IR/CFGDiff.h index f40434cc5d4eb..cc38addd058e8 100644 
--- a/llvm/include/llvm/IR/CFGDiff.h +++ b/llvm/include/llvm/IR/CFGDiff.h @@ -158,58 +158,44 @@ template > struct CFGViewChildren { using DataRef = const GraphDiff *; - using RawNodeRef = typename GT::NodeRef; - using NodeRef = std::pair; - - using ExistingChildIterator = - WrappedPairNodeDataIterator; - struct DeletedEdgesFilter { - RawNodeRef BB; - DeletedEdgesFilter(RawNodeRef BB) : BB(BB){}; - bool operator()(NodeRef N) const { - return !N.first->ignoreChild(BB, N.second, InverseEdge); - } - }; - using FilterExistingChildrenIterator = - filter_iterator; - - using vec_iterator = typename SmallVectorImpl::const_iterator; - using AddNewChildrenIterator = - WrappedPairNodeDataIterator; - using ChildIteratorType = - concat_iterator; - - static ChildIteratorType child_begin(NodeRef N) { - auto InsertVec = N.first->getAddedChildren(N.second, InverseEdge); - // filter iterator init: - auto firstit = make_filter_range( - make_range({GT::child_begin(N.second), N.first}, - {GT::child_end(N.second), N.first}), - DeletedEdgesFilter(N.second)); - // new inserts iterator init: - auto secondit = make_range( - {InsertVec.begin(), N.first}, {InsertVec.end(), N.first}); + using NodeRef = std::pair; - return concat_iterator(firstit, secondit); + template + static auto makeChildRange(Range &&R, DataRef DR) { + using Iter = WrappedPairNodeDataIterator(R).begin()), NodeRef, DataRef>; + return make_range(Iter(R.begin(), DR), Iter(R.end(), DR)); } - static ChildIteratorType child_end(NodeRef N) { - auto InsertVec = N.first->getAddedChildren(N.second, InverseEdge); + static auto children(NodeRef N) { + // filter iterator init: - auto firstit = make_filter_range( - make_range({GT::child_end(N.second), N.first}, - {GT::child_end(N.second), N.first}), - DeletedEdgesFilter(N.second)); + auto R = make_range(GT::child_begin(N.second), GT::child_end(N.second)); + // This lambda is copied into the iterators and persists to callers, ensure + // captures are by value or otherwise have sufficient lifetime. + auto First = make_filter_range(makeChildRange(R, N.first), [N](NodeRef C) { + return !C.first->ignoreChild(N.second, C.second, InverseEdge); + }); + // new inserts iterator init: - auto secondit = make_range( - {InsertVec.end(), N.first}, {InsertVec.end(), N.first}); + auto InsertVec = N.first->getAddedChildren(N.second, InverseEdge); + auto Second = makeChildRange(InsertVec, N.first); + + auto CR = concat(First, Second); + // concat_range contains references to other ranges, returning it would + // leave those references dangling - the iterators contain + // other iterators by value so they're safe to return. + return make_range(CR.begin(), CR.end()); + } - return concat_iterator(firstit, secondit); + static auto child_begin(NodeRef N) { + return children(N).begin(); } + + static auto child_end(NodeRef N) { + return children(N).end(); + } + + using ChildIteratorType = decltype(child_end(std::declval())); }; template diff --git a/llvm/include/llvm/IR/CallSite.h b/llvm/include/llvm/IR/CallSite.h index 0e957c4797e8b..6a82e73537cf5 100644 --- a/llvm/include/llvm/IR/CallSite.h +++ b/llvm/include/llvm/IR/CallSite.h @@ -146,6 +146,13 @@ class CallSiteBase { return static_cast(0); } + /// Return if this call is to an intrinsic. + bool isIntrinsic() const { + if (auto *F = getCalledFunction()) + return F->isIntrinsic(); + return false; + } + /// Determine whether the passed iterator points to the callee operand's Use. 
bool isCallee(Value::const_user_iterator UI) const { return isCallee(&UI.getUse()); diff --git a/llvm/include/llvm/IR/ConstantRange.h b/llvm/include/llvm/IR/ConstantRange.h index e6bac8a5f9339..8ecb9aa0ce020 100644 --- a/llvm/include/llvm/IR/ConstantRange.h +++ b/llvm/include/llvm/IR/ConstantRange.h @@ -409,6 +409,10 @@ class LLVM_NODISCARD ConstantRange { /// from a binary-or of a value in this range by a value in \p Other. ConstantRange binaryOr(const ConstantRange &Other) const; + /// Return a new range representing the possible values resulting + /// from a binary-xor of a value in this range by a value in \p Other. + ConstantRange binaryXor(const ConstantRange &Other) const; + /// Return a new range representing the possible values resulting /// from a left shift of a value in this range by a value in \p Other. /// TODO: This isn't fully implemented yet. diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index d9cbcc63fa629..2e3ea1400d9ad 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -349,6 +349,13 @@ class Function : public GlobalObject, public ilist_node { return 0; } + /// Return the stack alignment for the function. + MaybeAlign getFnStackAlign() const { + if (!hasFnAttribute(Attribute::StackAlignment)) + return None; + return AttributeSets.getStackAlignment(AttributeList::FunctionIndex); + } + /// hasGC/getGC/setGC/clearGC - The name of the garbage collection algorithm /// to use during code generation. bool hasGC() const { diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index cebe07e42afc7..0ca1688a7c91f 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -206,6 +206,48 @@ namespace llvm { /// @} }; + /// This is the common base class for vector predication intrinsics. + class VPIntrinsic : public IntrinsicInst { + public: + static Optional GetMaskParamPos(Intrinsic::ID IntrinsicID); + static Optional GetVectorLengthParamPos(Intrinsic::ID IntrinsicID); + + /// The llvm.vp.* intrinsics for this instruction Opcode + static Intrinsic::ID GetForOpcode(unsigned OC); + + // Whether \p ID is a VP intrinsic ID. + static bool IsVPIntrinsic(Intrinsic::ID); + + /// \return the mask parameter or nullptr. + Value *getMaskParam() const; + + /// \return the vector length parameter or nullptr. + Value *getVectorLengthParam() const; + + /// \return whether the vector length param can be ignored. + bool canIgnoreVectorLengthParam() const; + + /// \return the static element count (vector number of elements) the vector + /// length parameter applies to. + ElementCount getStaticVectorLength() const; + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static bool classof(const IntrinsicInst *I) { + return IsVPIntrinsic(I->getIntrinsicID()); + } + static bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + + // Equivalent non-predicated opcode + unsigned getFunctionalOpcode() const { + return GetFunctionalOpcodeForVP(getIntrinsicID()); + } + + // Equivalent non-predicated opcode + static unsigned GetFunctionalOpcodeForVP(Intrinsic::ID ID); + }; + /// This is the common base class for constrained floating point intrinsics. 
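Aside: a sketch of how the VPIntrinsic interface above might be consumed. The helper and its use of 0 as a "no opcode" result are illustrative, not part of the patch:

#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

// If CI is an llvm.vp.* call whose vector-length operand can be ignored,
// return the equivalent plain IR opcode (e.g. Instruction::Add for
// llvm.vp.add); otherwise return 0.
static unsigned getFunctionalOpcodeIfInert(const CallInst &CI) {
  if (const auto *VPI = dyn_cast<VPIntrinsic>(&CI))
    if (VPI->canIgnoreVectorLengthParam())
      return VPI->getFunctionalOpcode();
  return 0;
}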
class ConstrainedFPIntrinsic : public IntrinsicInst { public: diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 797d7b1765c3d..0812d707e4fed 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -27,6 +27,10 @@ class IntrinsicProperty; // effects. It may be CSE'd deleted if dead, etc. def IntrNoMem : IntrinsicProperty; +// IntrNoSync - Threads executing the intrinsic will not synchronize using +// memory or other means. +def IntrNoSync : IntrinsicProperty; + // IntrReadMem - This intrinsic only reads from memory. It does not write to // memory and has no other side effects. Therefore, it cannot be moved across // potentially aliasing stores. However, it can be reordered otherwise and can @@ -1153,6 +1157,79 @@ def int_is_constant : Intrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem, IntrWil def int_ptrmask: Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty, llvm_anyint_ty], [IntrNoMem, IntrSpeculatable, IntrWillReturn]>; +//===---------------- Vector Predication Intrinsics --------------===// + +// Binary operators +let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in { + def int_vp_add : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_sub : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_mul : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_sdiv : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_udiv : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_srem : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_urem : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_ashr : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_lshr : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_shl : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_or : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_and : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + def int_vp_xor : Intrinsic<[ llvm_anyvector_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty]>; + +} + + //===-------------------------- Masked Intrinsics -------------------------===// // def int_masked_store : Intrinsic<[], [llvm_anyvector_ty, diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index a6de3f949199e..cad4f7cb4cd13 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td 
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1282,6 +1282,8 @@ class SVE_gather_prf_vector_base_scalar_offset // Loads // +def int_aarch64_sve_ld1 : AdvSIMD_1Vec_PredLoad_Intrinsic; + def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic; def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredLoad_Intrinsic; def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredLoad_Intrinsic; @@ -1290,6 +1292,8 @@ def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredLoad_Intrinsic; // Stores // +def int_aarch64_sve_st1 : AdvSIMD_1Vec_PredStore_Intrinsic; + def int_aarch64_sve_stnt1 : AdvSIMD_1Vec_PredStore_Intrinsic; // @@ -1302,29 +1306,29 @@ def int_aarch64_sve_prf // Scalar + 32-bit scaled offset vector, zero extend, packed and // unpacked. -def int_aarch64_sve_gather_prfb_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfh_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfw_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfd_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfb_gather_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfh_gather_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfw_gather_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfd_gather_scaled_uxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; // Scalar + 32-bit scaled offset vector, sign extend, packed and // unpacked. -def int_aarch64_sve_gather_prfb_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfw_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfh_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfd_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfb_gather_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfw_gather_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfh_gather_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfd_gather_scaled_sxtw : SVE_gather_prf_scalar_base_vector_offset_scaled; // Scalar + 64-bit scaled offset vector. -def int_aarch64_sve_gather_prfb_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfh_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfw_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; -def int_aarch64_sve_gather_prfd_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfb_gather_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfh_gather_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfw_gather_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; +def int_aarch64_sve_prfd_gather_scaled : SVE_gather_prf_scalar_base_vector_offset_scaled; // Vector + scalar. 
-def int_aarch64_sve_gather_prfb : SVE_gather_prf_vector_base_scalar_offset; -def int_aarch64_sve_gather_prfh : SVE_gather_prf_vector_base_scalar_offset; -def int_aarch64_sve_gather_prfw : SVE_gather_prf_vector_base_scalar_offset; -def int_aarch64_sve_gather_prfd : SVE_gather_prf_vector_base_scalar_offset; +def int_aarch64_sve_prfb_gather : SVE_gather_prf_vector_base_scalar_offset; +def int_aarch64_sve_prfh_gather : SVE_gather_prf_vector_base_scalar_offset; +def int_aarch64_sve_prfw_gather : SVE_gather_prf_vector_base_scalar_offset; +def int_aarch64_sve_prfd_gather : SVE_gather_prf_vector_base_scalar_offset; // // Scalar to vector operations diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 3f962cc667c52..c01db52b1622e 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1371,8 +1371,8 @@ def int_amdgcn_writelane : [IntrNoMem, IntrConvergent] >; -def int_amdgcn_alignbit : - GCCBuiltin<"__builtin_amdgcn_alignbit">, Intrinsic<[llvm_i32_ty], +// FIXME: Deprecated. This is equivalent to llvm.fshr +def int_amdgcn_alignbit : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrSpeculatable] >; diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td index 80ed0792a209c..b41831ed1f61e 100644 --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -798,14 +798,6 @@ def int_arm_mve_pred_v2i : Intrinsic< def int_arm_mve_vreinterpretq : Intrinsic< [llvm_anyvector_ty], [llvm_anyvector_ty], [IntrNoMem]>; -multiclass IntrinsicSignSuffix rets, list params = [], - list props = [], - string name = "", - list sdprops = []> { - def _s: Intrinsic; - def _u: Intrinsic; -} - def int_arm_mve_min_predicated: Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_i32_ty /* unsigned */, llvm_anyvector_ty, LLVMMatchType<0>], @@ -891,11 +883,6 @@ def int_arm_mve_vmaxnma_predicated: Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, llvm_anyvector_ty], [IntrNoMem]>; -defm int_arm_mve_minv: IntrinsicSignSuffix<[llvm_i32_ty], - [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>; -defm int_arm_mve_maxv: IntrinsicSignSuffix<[llvm_i32_ty], - [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>; - multiclass MVEPredicated rets, list params, LLVMType pred = llvm_anyvector_ty, list props = [IntrNoMem]> { @@ -911,6 +898,24 @@ multiclass MVEPredicatedM rets, list params, LLVMMatchType<0>, rets[0])], props>; } +multiclass MVE_minmaxv { + defm v: MVEPredicated<[llvm_i32_ty], + [llvm_i32_ty, llvm_anyvector_ty, llvm_i32_ty /* unsigned */]>; + defm av: MVEPredicated<[llvm_i32_ty], + [llvm_i32_ty, llvm_anyvector_ty]>; + defm nmv: MVEPredicated<[llvm_anyfloat_ty], + [LLVMMatchType<0>, llvm_anyvector_ty]>; + defm nmav: MVEPredicated<[llvm_anyfloat_ty], + [LLVMMatchType<0>, llvm_anyvector_ty]>; +} +defm int_arm_mve_min: MVE_minmaxv; +defm int_arm_mve_max: MVE_minmaxv; + +defm int_arm_mve_addv: MVEPredicated<[llvm_i32_ty], + [llvm_anyvector_ty, llvm_i32_ty /* unsigned */]>; +defm int_arm_mve_addlv: MVEPredicated<[llvm_i64_ty], + [llvm_anyvector_ty, llvm_i32_ty /* unsigned */]>; + // Intrinsic with a predicated and a non-predicated case. The predicated case // has two additional parameters: inactive (the value for inactive lanes, can // be undef) and predicate. 
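Aside: the FIXME on int_amdgcn_alignbit above points out that it matches the generic funnel shift, so a frontend can emit llvm.fshr directly instead. A hedged sketch; the helper name is illustrative:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"

using namespace llvm;

// alignbit(hi, lo, shift) extracts 32 bits at `shift` from the concatenation
// {hi, lo}, which is exactly llvm.fshr.i32(hi, lo, shift).
static Value *emitAlignBitAsFshr(IRBuilder<> &B, Value *Hi, Value *Lo,
                                 Value *Shift) {
  return B.CreateIntrinsic(Intrinsic::fshr, {B.getInt32Ty()},
                           {Hi, Lo, Shift});
}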
@@ -1275,9 +1280,62 @@ defm int_arm_mve_vqdmlad: MVEPredicated<[llvm_anyvector_ty], // CDE (Custom Datapath Extension) -def int_arm_cde_cx1: Intrinsic< - [llvm_i32_ty], - [llvm_i32_ty /* coproc */, llvm_i32_ty /* imm */], - [IntrNoMem, ImmArg<0>, ImmArg<1>]>; +multiclass CDEGPRIntrinsics args> { + def "" : Intrinsic< + [llvm_i32_ty], + !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; + def a : Intrinsic< + [llvm_i32_ty], + !listconcat([llvm_i32_ty /* coproc */, llvm_i32_ty /* acc */], args, + [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; + + def d: Intrinsic< + [llvm_i32_ty /* lo */, llvm_i32_ty /* hi */], + !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; + def da: Intrinsic< + [llvm_i32_ty /* lo */, llvm_i32_ty /* hi */], + !listconcat([llvm_i32_ty /* coproc */, llvm_i32_ty /* acc_lo */, + llvm_i32_ty /* acc_hi */], args, [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; +} + +defm int_arm_cde_cx1: CDEGPRIntrinsics<[]>; +defm int_arm_cde_cx2: CDEGPRIntrinsics<[llvm_i32_ty]>; +defm int_arm_cde_cx3: CDEGPRIntrinsics<[llvm_i32_ty, llvm_i32_ty]>; + +multiclass CDEVCXIntrinsics args> { + def "" : Intrinsic< + [llvm_anyfloat_ty], + !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; + def a : Intrinsic< + [llvm_anyfloat_ty], + !listconcat([llvm_i32_ty /* coproc */, LLVMMatchType<0> /* acc */], + args, [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; +} + +defm int_arm_cde_vcx1 : CDEVCXIntrinsics<[]>; +defm int_arm_cde_vcx2 : CDEVCXIntrinsics<[LLVMMatchType<0>]>; +defm int_arm_cde_vcx3 : CDEVCXIntrinsics<[LLVMMatchType<0>, LLVMMatchType<0>]>; + +multiclass CDEVCXVecIntrinsics args> { + def "" : Intrinsic< + [llvm_v16i8_ty], + !listconcat([llvm_i32_ty /* coproc */], args, [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; + def a : Intrinsic< + [llvm_v16i8_ty], + !listconcat([llvm_i32_ty /* coproc */, llvm_v16i8_ty /* acc */], + args, [llvm_i32_ty /* imm */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; +} + +defm int_arm_cde_vcx1q : CDEVCXVecIntrinsics<[]>; +defm int_arm_cde_vcx2q : CDEVCXVecIntrinsics<[llvm_v16i8_ty]>; +defm int_arm_cde_vcx3q : CDEVCXVecIntrinsics<[llvm_v16i8_ty, llvm_v16i8_ty]>; } // end TargetPrefix diff --git a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td index e97700ad724ae..90b5a25d16c06 100644 --- a/llvm/include/llvm/IR/IntrinsicsWebAssembly.td +++ b/llvm/include/llvm/IR/IntrinsicsWebAssembly.td @@ -129,6 +129,10 @@ def int_wasm_alltrue : Intrinsic<[llvm_i32_ty], [llvm_anyvector_ty], [IntrNoMem, IntrSpeculatable]>; +def int_wasm_bitmask : + Intrinsic<[llvm_i32_ty], + [llvm_anyvector_ty], + [IntrNoMem, IntrSpeculatable]>; def int_wasm_qfma : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], diff --git a/llvm/include/llvm/IR/KnowledgeRetention.h b/llvm/include/llvm/IR/KnowledgeRetention.h index f5c2d554c2305..a7de7cd16b660 100644 --- a/llvm/include/llvm/IR/KnowledgeRetention.h +++ b/llvm/include/llvm/IR/KnowledgeRetention.h @@ -22,6 +22,7 @@ #include "llvm/ADT/DenseMap.h" namespace llvm { +class IntrinsicInst; /// Build a call to llvm.assume to preserve informations that can be derived /// from the given instruction. 
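Aside: a sketch of emitting the new int_wasm_bitmask from C++. It assumes the generated per-target intrinsic header is available; the helper name is illustrative:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsWebAssembly.h"

using namespace llvm;

// The intrinsic is overloaded on its vector operand type and returns an i32
// holding one bit per input lane.
static Value *emitBitmask(IRBuilder<> &B, Value *Vec) {
  return B.CreateIntrinsic(Intrinsic::wasm_bitmask, {Vec->getType()}, {Vec});
}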
@@ -84,7 +85,14 @@ struct MinMax { unsigned Max; }; -using RetainedKnowledgeMap = DenseMap; +/// A mapping from intrinsics (=`llvm.assume` calls) to a value range +/// (=knowledge) that is encoded in them. How the value range is interpreted +/// depends on the RetainedKnowledgeKey that was used to get this out of the +/// RetainedKnowledgeMap. +using Assume2KnowledgeMap = DenseMap; + +using RetainedKnowledgeMap = + DenseMap; /// Insert into the map all the informations contained in the operand bundles of /// the llvm.assume. This should be used instead of hasAttributeInAssume when diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def new file mode 100644 index 0000000000000..d3e1fc854373d --- /dev/null +++ b/llvm/include/llvm/IR/VPIntrinsics.def @@ -0,0 +1,84 @@ +//===-- IR/VPIntrinsics.def - Describes llvm.vp.* Intrinsics -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains descriptions of the various Vector Predication intrinsics. +// This is used as a central place for enumerating the different instructions +// and should eventually be the place to put comments about the instructions. +// +//===----------------------------------------------------------------------===// + +// NOTE: NO INCLUDE GUARD DESIRED! + +// Provide definitions of macros so that users of this file do not have to +// define everything to use it... +// +#ifndef REGISTER_VP_INTRINSIC +#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) +#endif + +// Map this VP intrinsic to its functional Opcode +#ifndef HANDLE_VP_TO_OC +#define HANDLE_VP_TO_OC(VPID, OC) +#endif + +///// Integer Arithmetic ///// + +// llvm.vp.add(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_add, 2, 3) +HANDLE_VP_TO_OC(vp_add, Add) + +// llvm.vp.and(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_and, 2, 3) +HANDLE_VP_TO_OC(vp_and, And) + +// llvm.vp.ashr(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_ashr, 2, 3) +HANDLE_VP_TO_OC(vp_ashr, AShr) + +// llvm.vp.lshr(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_lshr, 2, 3) +HANDLE_VP_TO_OC(vp_lshr, LShr) + +// llvm.vp.mul(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_mul, 2, 3) +HANDLE_VP_TO_OC(vp_mul, Mul) + +// llvm.vp.or(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_or, 2, 3) +HANDLE_VP_TO_OC(vp_or, Or) + +// llvm.vp.sdiv(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_sdiv, 2, 3) +HANDLE_VP_TO_OC(vp_sdiv, SDiv) + +// llvm.vp.shl(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_shl, 2, 3) +HANDLE_VP_TO_OC(vp_shl, Shl) + +// llvm.vp.srem(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_srem, 2, 3) +HANDLE_VP_TO_OC(vp_srem, SRem) + +// llvm.vp.sub(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_sub, 2, 3) +HANDLE_VP_TO_OC(vp_sub, Sub) + +// llvm.vp.udiv(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_udiv, 2, 3) +HANDLE_VP_TO_OC(vp_udiv, UDiv) + +// llvm.vp.urem(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_urem, 2, 3) +HANDLE_VP_TO_OC(vp_urem, URem) + +// llvm.vp.xor(x,y,mask,vlen) +REGISTER_VP_INTRINSIC(vp_xor, 2, 3) +HANDLE_VP_TO_OC(vp_xor, Xor) + +#undef REGISTER_VP_INTRINSIC +#undef HANDLE_VP_TO_OC diff --git a/llvm/include/llvm/MC/MCObjectFileInfo.h b/llvm/include/llvm/MC/MCObjectFileInfo.h index b40facc8c0c80..c61350e554cd2 100644 --- a/llvm/include/llvm/MC/MCObjectFileInfo.h +++ b/llvm/include/llvm/MC/MCObjectFileInfo.h @@ -221,6 +221,7 @@ class 
MCObjectFileInfo {
public:
void InitMCObjectFileInfo(const Triple &TT, bool PIC, MCContext &ctx, bool LargeCodeModel = false);
+ MCContext &getContext() const { return *Ctx; }
bool getSupportsWeakOmittedEHFrame() const {
return SupportsWeakOmittedEHFrame;
diff --git a/llvm/include/llvm/Support/LockFileManager.h b/llvm/include/llvm/Support/LockFileManager.h
index 2efeca3b62001..ab66621e67566 100644
--- a/llvm/include/llvm/Support/LockFileManager.h
+++ b/llvm/include/llvm/Support/LockFileManager.h
@@ -78,8 +78,8 @@ class LockFileManager {
/// For a shared lock, wait until the owner releases the lock.
/// Total timeout for the file to appear is ~1.5 minutes.
- /// \param MaxSeconds the maximum wait time per iteration in seconds.
- WaitForUnlockResult waitForUnlock(const unsigned MaxSeconds = 40);
+ /// \param MaxSeconds the maximum total wait time in seconds.
+ WaitForUnlockResult waitForUnlock(const unsigned MaxSeconds = 90);
/// Remove the lock file. This may delete a different lock file than
/// the one previously read if there is a race.
diff --git a/llvm/include/llvm/Support/OptimalLayout.h b/llvm/include/llvm/Support/OptimalLayout.h
new file mode 100644
index 0000000000000..870dc78791bb3
--- /dev/null
+++ b/llvm/include/llvm/Support/OptimalLayout.h
@@ -0,0 +1,130 @@
+//===-- OptimalLayout.h - Optimal data layout algorithm -----------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file provides an interface for laying out a sequence of fields
+/// as a struct in a way that attempts to minimize the total space
+/// requirements of the struct.
+///
+/// The word "optimal" is a misnomer in several ways. First, minimizing
+/// space usage doesn't necessarily yield optimal performance because it
+/// may decrease locality. Second, there is no known efficient algorithm
+/// that guarantees a minimal layout for arbitrary inputs. Nonetheless,
+/// this algorithm is likely to produce much more compact layouts than
+/// would be produced by just allocating space in a buffer.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_OPTIMALLAYOUT_H
+#define LLVM_SUPPORT_OPTIMALLAYOUT_H
+
+#include "llvm/Support/Alignment.h"
+#include "llvm/ADT/ArrayRef.h"
+#include
+
+namespace llvm {
+
+/// A field in a structure.
+struct OptimalLayoutField {
+ /// A special value for Offset indicating that the field can be moved
+ /// anywhere.
+ static constexpr uint64_t FlexibleOffset = ~(uint64_t)0;
+
+ OptimalLayoutField(const void *Id, uint64_t Size, Align Alignment,
+ uint64_t FixedOffset = FlexibleOffset)
+ : Offset(FixedOffset), Size(Size), Id(Id), Alignment(Alignment) {
+ assert(Size > 0 && "adding an empty field to the layout");
+ }
+
+ /// The offset of this field in the final layout. If this is
+ /// initialized to FlexibleOffset, layout will overwrite it with
+ /// the assigned offset of the field.
+ uint64_t Offset;
+
+ /// The required size of this field in bytes. Does not have to be
+ /// a multiple of Alignment. Must be non-zero.
+ uint64_t Size;
+
+ /// An opaque value which uniquely identifies this field.
+ const void *Id;
+
+ /// Private scratch space for the algorithm. The implementation
+ /// must treat this as uninitialized memory on entry.
+ void *Scratch; + + /// The required alignment of this field. + Align Alignment; + + /// Return true if this field has been assigned a fixed offset. + /// After layout, this will be true of all the fields. + bool hasFixedOffset() const { + return (Offset != FlexibleOffset); + } + + /// Given that this field has a fixed offset, return the offset + /// of the first byte following it. + uint64_t getEndOffset() const { + assert(hasFixedOffset()); + return Offset + Size; + } +}; + +/// Compute a layout for a struct containing the given fields, making a +/// best-effort attempt to minimize the amount of space required. +/// +/// Two features are supported which require a more careful solution +/// than the well-known "sort by decreasing alignment" solution: +/// +/// - Fields may be assigned a fixed offset in the layout. If there are +/// gaps among the fixed-offset fields, the algorithm may attempt +/// to allocate flexible-offset fields into those gaps. If that's +/// undesirable, the caller should "block out" those gaps by e.g. +/// just creating a single fixed-offset field that represents the +/// entire "header". +/// +/// - The size of a field is not required to be a multiple of, or even +/// greater than, the field's required alignment. The only constraint +/// on fields is that they must not be zero-sized. +/// +/// To simplify the implementation, any fixed-offset fields in the +/// layout must appear at the start of the field array, and they must +/// be ordered by increasing offset. +/// +/// The algorithm will produce a guaranteed-minimal layout with no +/// interior padding in the following "C-style" case: +/// +/// - every field's size is a multiple of its required alignment and +/// - either no fields have initially fixed offsets, or the fixed-offset +/// fields have no interior padding and end at an offset that is at +/// least as aligned as all the flexible-offset fields. +/// +/// Otherwise, while the algorithm will make a best-effort attempt to +/// avoid padding, it cannot guarantee a minimal layout, as there is +/// no known efficient algorithm for doing so. +/// +/// The layout produced by this algorithm may not be stable across LLVM +/// releases. Do not use this anywhere where ABI stability is required. +/// +/// Flexible-offset fields with the same size and alignment will be ordered +/// the same way they were in the initial array. Otherwise the current +/// algorithm makes no effort to preserve the initial order of +/// flexible-offset fields. +/// +/// On return, all fields will have been assigned a fixed offset, and the +/// array will be sorted in order of ascending offsets. Note that this +/// means that the fixed-offset fields may no longer form a strict prefix +/// if there's any padding before they end. +/// +/// The return value is the total size of the struct and its required +/// alignment. Note that the total size is not rounded up to a multiple +/// of the required alignment; clients which require this can do so easily. 
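Aside: a small usage sketch for the interface documented above and declared just below; the field sizes, alignments and tag ids are arbitrary:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/OptimalLayout.h"

using namespace llvm;

// Lay out three flexible-offset fields; the Tag bytes only serve as ids.
static void layoutExample() {
  static const char Tags[3] = {};
  SmallVector<OptimalLayoutField, 3> Fields;
  Fields.emplace_back(&Tags[0], /*Size=*/1, Align(1));
  Fields.emplace_back(&Tags[1], /*Size=*/8, Align(8));
  Fields.emplace_back(&Tags[2], /*Size=*/4, Align(4));

  std::pair<uint64_t, Align> SizeAndAlign = performOptimalLayout(Fields);
  // Every field now has a fixed offset and Fields is sorted by ascending
  // offset; SizeAndAlign.first is not rounded up to SizeAndAlign.second.
  (void)SizeAndAlign;
}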
+std::pair +performOptimalLayout(MutableArrayRef Fields); + +} // namespace llvm + +#endif diff --git a/llvm/include/llvm/Support/Path.h b/llvm/include/llvm/Support/Path.h index 97955f882d51e..f0b2810cd7a9b 100644 --- a/llvm/include/llvm/Support/Path.h +++ b/llvm/include/llvm/Support/Path.h @@ -468,10 +468,6 @@ StringRef remove_leading_dotslash(StringRef path, Style style = Style::native); bool remove_dots(SmallVectorImpl &path, bool remove_dot_dot = false, Style style = Style::native); -#if defined(_WIN32) -std::error_code widenPath(const Twine &Path8, SmallVectorImpl &Path16); -#endif - } // end namespace path } // end namespace sys } // end namespace llvm diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index e004550059d41..12bd1a77b18da 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -442,6 +442,18 @@ HANDLE_TARGET_OPCODE(G_UMULH) // the high half of the result. HANDLE_TARGET_OPCODE(G_SMULH) +/// Generic saturating unsigned addition. +HANDLE_TARGET_OPCODE(G_UADDSAT) + +/// Generic saturating signed addition. +HANDLE_TARGET_OPCODE(G_SADDSAT) + +/// Generic saturating unsigned subtraction. +HANDLE_TARGET_OPCODE(G_USUBSAT) + +/// Generic saturating signed subtraction. +HANDLE_TARGET_OPCODE(G_SSUBSAT) + /// Generic FP addition. HANDLE_TARGET_OPCODE(G_FADD) diff --git a/llvm/include/llvm/Support/TypeSize.h b/llvm/include/llvm/Support/TypeSize.h index d800317204223..f66c043b29fe7 100644 --- a/llvm/include/llvm/Support/TypeSize.h +++ b/llvm/include/llvm/Support/TypeSize.h @@ -146,6 +146,9 @@ class TypeSize { return (MinSize & 7) == 0; } + // Returns true if the type size is non-zero. + bool isNonZero() const { return MinSize != 0; } + // Casts to a uint64_t if this is a fixed-width size. // // This interface is deprecated and will be removed in a future version @@ -165,7 +168,7 @@ class TypeSize { // bail out early for scalable vectors and use getFixedSize() // } operator uint64_t() const { -#ifdef STRICT_IMPLICIT_CONVERSION_TYPESIZE +#ifdef STRICT_FIXED_SIZE_VECTORS return getFixedSize(); #else if (isScalable()) diff --git a/llvm/include/llvm/Support/Windows/WindowsSupport.h b/llvm/include/llvm/Support/Windows/WindowsSupport.h index bb7e79b860180..bd5a90c2c3f00 100644 --- a/llvm/include/llvm/Support/Windows/WindowsSupport.h +++ b/llvm/include/llvm/Support/Windows/WindowsSupport.h @@ -236,6 +236,12 @@ namespace windows { // UTF-8 regardless of the current code page setting. std::error_code GetCommandLineArguments(SmallVectorImpl &Args, BumpPtrAllocator &Alloc); + +/// Convert UTF-8 path to a suitable UTF-16 path for use with the Win32 Unicode +/// File API. +std::error_code widenPath(const Twine &Path8, SmallVectorImpl &Path16, + size_t MaxPathLen = MAX_PATH); + } // end namespace windows } // end namespace sys } // end namespace llvm. diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 6e7531496efcb..901c3d17457d6 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -469,6 +469,42 @@ def G_SMULH : GenericInstruction { let isCommutable = 1; } +//------------------------------------------------------------------------------ +// Saturating ops +//------------------------------------------------------------------------------ + +// Generic saturating unsigned addition. 
+def G_UADDSAT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2); + let hasSideEffects = 0; + let isCommutable = 1; +} + +// Generic saturating signed addition. +def G_SADDSAT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2); + let hasSideEffects = 0; + let isCommutable = 1; +} + +// Generic saturating unsigned subtraction. +def G_USUBSAT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2); + let hasSideEffects = 0; + let isCommutable = 0; +} + +// Generic saturating signed subtraction. +def G_SSUBSAT : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2); + let hasSideEffects = 0; + let isCommutable = 0; +} + //------------------------------------------------------------------------------ // Floating Point Unary Ops. //------------------------------------------------------------------------------ diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index baa31d0a09e5f..b0f189a3cd085 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -161,15 +161,47 @@ def undef_to_negative_one: GICombineRule< [{ return Helper.matchAnyExplicitUseIsUndef(*${root}); }]), (apply [{ Helper.replaceInstWithConstant(*${root}, -1); }])>; -def propagate_undef: GICombineRule< +// Instructions where if any source operand is undef, the instruction can be +// replaced with undef. +def propagate_undef_any_op: GICombineRule< (defs root:$root), (match (wip_match_opcode G_ADD, G_FPTOSI, G_FPTOUI, G_SUB, G_XOR):$root, [{ return Helper.matchAnyExplicitUseIsUndef(*${root}); }]), (apply [{ Helper.replaceInstWithUndef(*${root}); }])>; +// Instructions where if all source operands are undef, the instruction can be +// replaced with undef. +def propagate_undef_all_ops: GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return Helper.matchAllExplicitUsesAreUndef(*${root}); }]), + (apply [{ Helper.replaceInstWithUndef(*${root}); }])>; + +// Replace a G_SHUFFLE_VECTOR with an undef mask with a G_IMPLICIT_DEF. +def propagate_undef_shuffle_mask: GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return Helper.matchUndefShuffleVectorMask(*${root}); }]), + (apply [{ Helper.replaceInstWithUndef(*${root}); }])>; + +// Fold (cond ? x : x) -> x +def select_same_val: GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_SELECT):$root, + [{ return Helper.matchSelectSameVal(*${root}); }]), + (apply [{ return Helper.replaceSingleDefInstWithOperand(*${root}, 2); }]) +>; + +// FIXME: These should use the custom predicate feature once it lands. 
def undef_combines : GICombineGroup<[undef_to_fp_zero, undef_to_int_zero, - undef_to_negative_one, propagate_undef]>; + undef_to_negative_one, + propagate_undef_any_op, + propagate_undef_all_ops, + propagate_undef_shuffle_mask]>; + +def identity_combines : GICombineGroup<[select_same_val]>; def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl]>; def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain, - combines_for_extload, combine_indexed_load_store, undef_combines]>; + combines_for_extload, combine_indexed_load_store, undef_combines, + identity_combines]>; diff --git a/llvm/include/llvm/Target/TargetLoweringObjectFile.h b/llvm/include/llvm/Target/TargetLoweringObjectFile.h index 2a5ac9a4de9a0..6d571c0ba1e25 100644 --- a/llvm/include/llvm/Target/TargetLoweringObjectFile.h +++ b/llvm/include/llvm/Target/TargetLoweringObjectFile.h @@ -37,8 +37,6 @@ class MCValue; class TargetMachine; class TargetLoweringObjectFile : public MCObjectFileInfo { - MCContext *Ctx = nullptr; - /// Name-mangler for global names. Mangler *Mang = nullptr; @@ -67,7 +65,6 @@ class TargetLoweringObjectFile : public MCObjectFileInfo { operator=(const TargetLoweringObjectFile &) = delete; virtual ~TargetLoweringObjectFile(); - MCContext &getContext() const { return *Ctx; } Mangler &getMangler() const { return *Mang; } /// This method must be called before any actual lowering is done. This diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index fdf1313e5491b..c2e14d14d3511 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -237,6 +237,9 @@ class TargetMachine { void setSupportsDefaultOutlining(bool Enable) { Options.SupportsDefaultOutlining = Enable; } + void setSupportsDebugEntryValues(bool Enable) { + Options.SupportsDebugEntryValues = Enable; + } bool shouldPrintMachineCode() const { return Options.PrintMachineCode; } diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h index 9378e290bed14..7282040a352fa 100644 --- a/llvm/include/llvm/Target/TargetOptions.h +++ b/llvm/include/llvm/Target/TargetOptions.h @@ -134,8 +134,8 @@ namespace llvm { EmulatedTLS(false), ExplicitEmulatedTLS(false), EnableIPRA(false), EmitStackSizeSection(false), EnableMachineOutliner(false), SupportsDefaultOutlining(false), EmitAddrsig(false), - EmitCallSiteInfo(false), EnableDebugEntryValues(false), - ForceDwarfFrameSection(false) {} + EmitCallSiteInfo(false), SupportsDebugEntryValues(false), + EnableDebugEntryValues(false), ForceDwarfFrameSection(false) {} /// PrintMachineCode - This flag is enabled when the -print-machineinstrs /// option is specified on the command line, and should enable debugging @@ -286,8 +286,16 @@ namespace llvm { /// info, and it is restricted only to optimized code. This can be used for /// something else, so that should be controlled in the frontend. unsigned EmitCallSiteInfo : 1; - /// Emit debug info about parameter's entry values. - unsigned EnableDebugEntryValues : 1; + /// Set if the target supports the debug entry values by default. + unsigned SupportsDebugEntryValues : 1; + /// When set to true, the EnableDebugEntryValues option forces production + /// of debug entry values even if the target does not officially support + /// it. Useful for testing purposes only. This flag should never be checked + /// directly, always use \ref ShouldEmitDebugEntryValues instead. 
+ unsigned EnableDebugEntryValues : 1; + /// NOTE: There are targets that still do not support the debug entry values + /// production. + bool ShouldEmitDebugEntryValues() const; /// Emit DWARF debug frame section. unsigned ForceDwarfFrameSection : 1; diff --git a/llvm/include/llvm/TextAPI/MachO/InterfaceFile.h b/llvm/include/llvm/TextAPI/MachO/InterfaceFile.h index 1a9711cfb4057..f86851143b8d6 100644 --- a/llvm/include/llvm/TextAPI/MachO/InterfaceFile.h +++ b/llvm/include/llvm/TextAPI/MachO/InterfaceFile.h @@ -278,11 +278,6 @@ class InterfaceFile { return ParentUmbrellas; } - /// Get the parent umbrella framework. - const std::vector> getParentUmbrellas() const { - return ParentUmbrellas; - } - /// Add an allowable client. /// /// Mach-O Dynamic libraries have the concept of allowable clients that are diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index 41d3c2f9ec6dc..2113197c067db 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -29,7 +29,7 @@ // automatically capture a potential dependence from Q to P. This dependence // will cause P to be reevaluated whenever Q changes in the future. // -// The Attributor will only reevaluated abstract attributes that might have +// The Attributor will only reevaluate abstract attributes that might have // changed since the last iteration. That means that the Attribute will not // revisit all instructions/blocks/functions in the module but only query // an update from a subset of the abstract attributes. @@ -110,11 +110,13 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/ConstantRange.h" +#include "llvm/IR/KnowledgeRetention.h" #include "llvm/IR/PassManager.h" #include "llvm/Transforms/Utils/CallGraphUpdater.h" namespace llvm { +struct Attributor; struct AbstractAttribute; struct InformationCache; struct AAIsDead; @@ -152,8 +154,8 @@ struct IRPosition { /// The positions we distinguish in the IR. /// - /// The values are chosen such that the KindOrArgNo member has a value >= 1 - /// if it is an argument or call site argument while a value < 1 indicates the + /// The values are chosen such that the KindOrArgNo member has a value >= 0 + /// if it is an argument or call site argument while a value < 0 indicates the /// respective kind of that value. enum Kind : int { IRP_INVALID = -6, ///< An invalid position. @@ -273,18 +275,11 @@ struct IRPosition { /// Return the associated function, if any. Function *getAssociatedFunction() const { - if (auto *CB = dyn_cast(AnchorVal)) - return CB->getCalledFunction(); assert(KindOrArgNo != IRP_INVALID && "Invalid position does not have an anchor scope!"); - Value &V = getAnchorValue(); - if (isa(V)) - return &cast(V); - if (isa(V)) - return cast(V).getParent(); - if (isa(V)) - return cast(V).getFunction(); - return nullptr; + if (auto *CB = dyn_cast(AnchorVal)) + return CB->getCalledFunction(); + return getAnchorScope(); } /// Return the associated argument, if any. @@ -398,7 +393,8 @@ struct IRPosition { /// e.g., the function position if this is an /// argument position, should be ignored. bool hasAttr(ArrayRef AKs, - bool IgnoreSubsumingPositions = false) const; + bool IgnoreSubsumingPositions = false, + Attributor *A = nullptr) const; /// Return the attributes of any kind in \p AKs existing in the IR at a /// position that will affect this one. 
While each position can only have a @@ -410,7 +406,8 @@ struct IRPosition { /// argument position, should be ignored. void getAttrs(ArrayRef AKs, SmallVectorImpl &Attrs, - bool IgnoreSubsumingPositions = false) const; + bool IgnoreSubsumingPositions = false, + Attributor *A = nullptr) const; /// Remove the attribute of kind \p AKs existing in the IR at this position. void removeAttrs(ArrayRef AKs) const { @@ -470,11 +467,20 @@ struct IRPosition { bool getAttrsFromIRAttr(Attribute::AttrKind AK, SmallVectorImpl &Attrs) const; + /// Return the attributes of kind \p AK existing in the IR as operand bundles + /// of an llvm.assume. + bool getAttrsFromAssumes(Attribute::AttrKind AK, + SmallVectorImpl &Attrs, + Attributor &A) const; + protected: /// The value this position is anchored at. Value *AnchorVal; - /// The argument number, if non-negative, or the position "kind". + /// If AnchorVal is Argument or CallBase then this number should be + /// non-negative and it denotes the argument or call site argument index + /// respectively. Otherwise, it denotes the kind of this IRPosition according + /// to Kind above. int KindOrArgNo; }; @@ -611,6 +617,9 @@ struct InformationCache { /// Return datalayout used in the module. const DataLayout &getDL() { return DL; } + /// Return the map containing all the knowledge we have from `llvm.assume`s. + const RetainedKnowledgeMap &getKnowledgeMap() const { return KnowledgeMap; } + private: /// A map type from functions to opcode to instruction maps. using FuncInstOpcodeMapTy = DenseMap; @@ -631,6 +640,9 @@ struct InformationCache { /// MustBeExecutedContextExplorer MustBeExecutedContextExplorer Explorer; + /// A map with knowledge retained in `llvm.assume` instructions. + RetainedKnowledgeMap KnowledgeMap; + /// Getters for analysis. AnalysisGetter &AG; @@ -894,7 +906,7 @@ struct Attributor { /// /// This method will evaluate \p Pred on all (transitive) uses of the /// associated value and return true if \p Pred holds every time. - bool checkForAllUses(const function_ref &Pred, + bool checkForAllUses(function_ref Pred, const AbstractAttribute &QueryingAA, const Value &V, DepClassTy LivenessDepClass = DepClassTy::OPTIONAL); @@ -1006,7 +1018,7 @@ struct Attributor { /// all call sites are known, hence the function has internal linkage. /// If true is returned, \p AllCallSitesKnown is set if all possible call /// sites of the function have been visited. - bool checkForAllCallSites(const function_ref &Pred, + bool checkForAllCallSites(function_ref Pred, const AbstractAttribute &QueryingAA, bool RequireAllCallSites, bool &AllCallSitesKnown); @@ -1017,22 +1029,21 @@ struct Attributor { /// matched with their respective return instructions. Returns true if \p Pred /// holds on all of them. bool checkForAllReturnedValuesAndReturnInsts( - const function_ref &)> - &Pred, + function_ref &)> Pred, const AbstractAttribute &QueryingAA); /// Check \p Pred on all values potentially returned by the function /// associated with \p QueryingAA. /// /// This is the context insensitive version of the method above. - bool checkForAllReturnedValues(const function_ref &Pred, + bool checkForAllReturnedValues(function_ref Pred, const AbstractAttribute &QueryingAA); /// Check \p Pred on all instructions with an opcode present in \p Opcodes. /// /// This method will evaluate \p Pred on all instructions with an opcode /// present in \p Opcode and return true if \p Pred holds on all of them.
- bool checkForAllInstructions(const function_ref &Pred, + bool checkForAllInstructions(function_ref Pred, const AbstractAttribute &QueryingAA, const ArrayRef &Opcodes, bool CheckBBLivenessOnly = false); @@ -1040,9 +1051,8 @@ struct Attributor { /// Check \p Pred on all call-like instructions (=CallBased derived). /// /// See checkForAllCallLikeInstructions(...) for more information. - bool - checkForAllCallLikeInstructions(const function_ref &Pred, - const AbstractAttribute &QueryingAA) { + bool checkForAllCallLikeInstructions(function_ref Pred, + const AbstractAttribute &QueryingAA) { return checkForAllInstructions(Pred, QueryingAA, {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr, @@ -1054,9 +1064,8 @@ struct Attributor { /// This method will evaluate \p Pred on all instructions that read or write /// to memory present in the information cache and return true if \p Pred /// holds on all of them. - bool checkForAllReadWriteInstructions( - const llvm::function_ref &Pred, - AbstractAttribute &QueryingAA); + bool checkForAllReadWriteInstructions(function_ref Pred, + AbstractAttribute &QueryingAA); /// Return the data layout associated with the anchor scope. const DataLayout &getDataLayout() const { return InfoCache.DL; } @@ -1069,7 +1078,7 @@ struct Attributor { /// all call sites are known, hence the function has internal linkage. /// If true is returned, \p AllCallSitesKnown is set if all possible call /// sites of the function have been visited. - bool checkForAllCallSites(const function_ref &Pred, + bool checkForAllCallSites(function_ref Pred, const Function &Fn, bool RequireAllCallSites, const AbstractAttribute *QueryingAA, bool &AllCallSitesKnown); @@ -1717,7 +1726,8 @@ struct IRAttribute : public IRPosition, public Base { /// See AbstractAttribute::initialize(...). virtual void initialize(Attributor &A) override { const IRPosition &IRP = this->getIRPosition(); - if (isa(IRP.getAssociatedValue()) || hasAttr(getAttrKind())) { + if (isa(IRP.getAssociatedValue()) || + hasAttr(getAttrKind(), /* IgnoreSubsumingPositions */ false, &A)) { this->getState().indicateOptimisticFixpoint(); return; } @@ -1918,8 +1928,8 @@ struct AAReturnedValues /// Note: Unlike the Attributor::checkForAllReturnedValuesAndReturnInsts /// method, this one will not filter dead return instructions. virtual bool checkForAllReturnedValuesAndReturnInsts( - const function_ref &)> - &Pred) const = 0; + function_ref &)> Pred) + const = 0; using iterator = MapVector>::iterator; @@ -2291,7 +2301,8 @@ struct DerefState : AbstractState { /// Add accessed bytes to the map. void addAccessedBytes(int64_t Offset, uint64_t Size) { - AccessedBytesMap[Offset] = std::max(AccessedBytesMap[Offset], Size); + uint64_t &AccessedBytes = AccessedBytesMap[Offset]; + AccessedBytes = std::max(AccessedBytes, Size); // Known bytes might increase. computeKnownDerefBytesFromAccessedMap(); @@ -2723,8 +2734,9 @@ struct AAMemoryLocation /// underlying accessed memory pointer) and it will return true if \p Pred /// holds every time. virtual bool checkForAllAccessesToMemoryKind( - const function_ref &Pred, + function_ref + Pred, MemoryLocationsKind MLK) const = 0; /// Create an abstract attribute view for the position \p IRP. 
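One small change in the Attributor.h hunks above worth calling out: DerefState::addAccessedBytes now binds a reference to the map slot instead of indexing AccessedBytesMap twice. A minimal self-contained sketch of that idiom, using std::map and a hypothetical function name purely for illustration (this is not the Attributor code itself):

    #include <algorithm>
    #include <cstdint>
    #include <map>

    // Keep the largest access size seen per offset while performing only one
    // map lookup per call; operator[] value-initializes a missing entry to 0.
    void recordAccess(std::map<int64_t, uint64_t> &AccessedBytesMap,
                      int64_t Offset, uint64_t Size) {
      uint64_t &AccessedBytes = AccessedBytesMap[Offset];
      AccessedBytes = std::max(AccessedBytes, Size);
    }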
diff --git a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h index 289ca9b39f347..8b03bcba10e4e 100644 --- a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -217,7 +217,6 @@ class PassManagerBuilder { void addLateLTOOptimizationPasses(legacy::PassManagerBase &PM); void addPGOInstrPasses(legacy::PassManagerBase &MPM, bool IsCS); void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM); - void addInstructionCombiningPass(legacy::PassManagerBase &MPM) const; public: /// populateFunctionPassManager - This fills in the function pass manager, diff --git a/llvm/include/llvm/Transforms/InstCombine/InstCombine.h b/llvm/include/llvm/Transforms/InstCombine/InstCombine.h index d7a6662510d32..0ad4f54fd465d 100644 --- a/llvm/include/llvm/Transforms/InstCombine/InstCombine.h +++ b/llvm/include/llvm/Transforms/InstCombine/InstCombine.h @@ -24,14 +24,13 @@ namespace llvm { class InstCombinePass : public PassInfoMixin { InstCombineWorklist Worklist; - const bool ExpensiveCombines; const unsigned MaxIterations; public: static StringRef name() { return "InstCombinePass"; } - explicit InstCombinePass(bool ExpensiveCombines = true); - explicit InstCombinePass(bool ExpensiveCombines, unsigned MaxIterations); + explicit InstCombinePass(); + explicit InstCombinePass(unsigned MaxIterations); PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; @@ -42,15 +41,13 @@ class InstCombinePass : public PassInfoMixin { /// will try to combine all instructions in the function. class InstructionCombiningPass : public FunctionPass { InstCombineWorklist Worklist; - const bool ExpensiveCombines; const unsigned MaxIterations; public: static char ID; // Pass identification, replacement for typeid - explicit InstructionCombiningPass(bool ExpensiveCombines = true); - explicit InstructionCombiningPass(bool ExpensiveCombines, - unsigned MaxIterations); + explicit InstructionCombiningPass(); + explicit InstructionCombiningPass(unsigned MaxIterations); void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnFunction(Function &F) override; @@ -68,9 +65,8 @@ class InstructionCombiningPass : public FunctionPass { // into: // %Z = add int 2, %X // -FunctionPass *createInstructionCombiningPass(bool ExpensiveCombines = true); -FunctionPass *createInstructionCombiningPass(bool ExpensiveCombines, - unsigned MaxIterations); +FunctionPass *createInstructionCombiningPass(); +FunctionPass *createInstructionCombiningPass(unsigned MaxIterations); } #endif diff --git a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h index 8e9d7b522c78b..8d0956033d9f8 100644 --- a/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h +++ b/llvm/include/llvm/Transforms/Utils/LowerMemIntrinsics.h @@ -23,12 +23,13 @@ class MemMoveInst; class MemSetInst; class TargetTransformInfo; class Value; +struct Align; /// Emit a loop implementing the semantics of llvm.memcpy where the size is not /// a compile-time constant. Loop will be insterted at \p InsertBefore. void createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, Value *CopyLen, - unsigned SrcAlign, unsigned DestAlign, + Align SrcAlign, Align DestAlign, bool SrcIsVolatile, bool DstIsVolatile, const TargetTransformInfo &TTI); @@ -36,11 +37,10 @@ void createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr, /// compile time constant. 
Loop is inserted at \p InsertBefore. void createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, ConstantInt *CopyLen, - unsigned SrcAlign, unsigned DestAlign, + Align SrcAlign, Align DestAlign, bool SrcIsVolatile, bool DstIsVolatile, const TargetTransformInfo &TTI); - /// Expand \p MemCpy as a loop. \p MemCpy is not deleted. void expandMemCpyAsLoop(MemCpyInst *MemCpy, const TargetTransformInfo &TTI); diff --git a/llvm/include/llvm/module.modulemap b/llvm/include/llvm/module.modulemap index 3c8639992fdd4..ff05f6251fb05 100644 --- a/llvm/include/llvm/module.modulemap +++ b/llvm/include/llvm/module.modulemap @@ -290,6 +290,7 @@ module LLVM_IR { textual header "IR/Metadata.def" textual header "IR/FixedMetadataKinds.def" textual header "IR/Value.def" + textual header "IR/VPIntrinsics.def" textual header "IR/RuntimeLibcalls.def" } diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index 1e4d7f7117851..ba1f550487006 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2340,10 +2340,9 @@ computePointerICmp(const DataLayout &DL, const TargetLibraryInfo *TLI, RHS = RHS->stripPointerCasts(); // A non-null pointer is not equal to a null pointer. - if (llvm::isKnownNonZero(LHS, DL, 0, nullptr, nullptr, nullptr, - IIQ.UseInstrInfo) && - isa(RHS) && - (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE)) + if (isa(RHS) && ICmpInst::isEquality(Pred) && + llvm::isKnownNonZero(LHS, DL, 0, nullptr, nullptr, nullptr, + IIQ.UseInstrInfo)) return ConstantInt::get(GetCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 7ae7a1fd54937..296f27799b945 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -596,19 +596,9 @@ static ValueLatticeElement getFromRangeMetadata(Instruction *BBI) { } bool LazyValueInfoImpl::solveBlockValue(Value *Val, BasicBlock *BB) { - if (isa(Val)) - return true; - - if (TheCache.hasCachedValueInfo(Val, BB)) { - // If we have a cached value, use that. - LLVM_DEBUG(dbgs() << " reuse BB '" << BB->getName() << "' val=" - << TheCache.getCachedValueInfo(Val, BB) << '\n'); - - // Since we're reusing a cached value, we don't need to update the - // OverDefinedCache. The cache will have been properly updated whenever the - // cached value was inserted. - return true; - } + assert(!isa(Val) && "Value should not be constant"); + assert(!TheCache.hasCachedValueInfo(Val, BB) && + "Value should not be in cache"); // Hold off inserting this value into the Cache in case we have to return // false and come back later. 
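The solveBlockValue change above turns what used to be early-outs (constants, cache hits) into asserts, which implies the caller is now expected to filter those cases before calling in. A toy, non-LLVM sketch of that caller-checks/callee-asserts split, with hypothetical names and std::map standing in for the lazy-value cache:

    #include <cassert>
    #include <map>
    #include <string>

    struct ToyCache {
      std::map<int, std::string> Cached;

      // Callee: preconditions are asserted rather than re-checked per call.
      std::string solve(int Key) {
        assert(Cached.find(Key) == Cached.end() && "Key should not be in cache");
        return "computed-" + std::to_string(Key);
      }

      // Caller: does the cheap cache lookup once, then delegates the real work.
      const std::string &get(int Key) {
        auto It = Cached.find(Key);
        if (It != Cached.end())
          return It->second; // Reuse the cached value.
        return Cached.emplace(Key, solve(Key)).first->second;
      }
    };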
@@ -1278,11 +1268,11 @@ static ValueLatticeElement getValueFromOverflowCondition( static ValueLatticeElement getValueFromCondition(Value *Val, Value *Cond, bool isTrueDest, - DenseMap &Visited); + SmallDenseMap &Visited); static ValueLatticeElement getValueFromConditionImpl(Value *Val, Value *Cond, bool isTrueDest, - DenseMap &Visited) { + SmallDenseMap &Visited) { if (ICmpInst *ICI = dyn_cast(Cond)) return getValueFromICmpCondition(Val, ICI, isTrueDest); @@ -1315,7 +1305,7 @@ getValueFromConditionImpl(Value *Val, Value *Cond, bool isTrueDest, static ValueLatticeElement getValueFromCondition(Value *Val, Value *Cond, bool isTrueDest, - DenseMap &Visited) { + SmallDenseMap &Visited) { auto I = Visited.find(Cond); if (I != Visited.end()) return I->second; @@ -1328,7 +1318,7 @@ getValueFromCondition(Value *Val, Value *Cond, bool isTrueDest, ValueLatticeElement getValueFromCondition(Value *Val, Value *Cond, bool isTrueDest) { assert(Cond && "precondition"); - DenseMap Visited; + SmallDenseMap Visited; return getValueFromCondition(Val, Cond, isTrueDest, Visited); } diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 3928fbd344764..fa4deea632d8f 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -105,14 +105,12 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, TLI.setShouldExtI32Return(ShouldExtI32Return); TLI.setShouldSignExtI32Param(ShouldSignExtI32Param); - if (T.getArch() == Triple::r600 || - T.getArch() == Triple::amdgcn) + if (T.isAMDGPU()) TLI.disableAllFunctions(); // There are no library implementations of memcpy and memset for AMD gpus and // these can be difficult to lower in the backend. - if (T.getArch() == Triple::r600 || - T.getArch() == Triple::amdgcn) { + if (T.isAMDGPU()) { TLI.setUnavailable(LibFunc_memcpy); TLI.setUnavailable(LibFunc_memset); TLI.setUnavailable(LibFunc_memset_pattern16); diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index f1adc037d17f0..ab6afa1a81dcc 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -215,6 +215,18 @@ void llvm::computeKnownBits(const Value *V, KnownBits &Known, Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo, ORE)); } +void llvm::computeKnownBits(const Value *V, const APInt &DemandedElts, + KnownBits &Known, const DataLayout &DL, + unsigned Depth, AssumptionCache *AC, + const Instruction *CxtI, const DominatorTree *DT, + OptimizationRemarkEmitter *ORE, bool UseInstrInfo) { + ::computeKnownBits(V, DemandedElts, Known, Depth, + Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo, ORE)); +} + +static KnownBits computeKnownBits(const Value *V, const APInt &DemandedElts, + unsigned Depth, const Query &Q); + static KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q); @@ -228,6 +240,17 @@ KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL, V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo, ORE)); } +KnownBits llvm::computeKnownBits(const Value *V, const APInt &DemandedElts, + const DataLayout &DL, unsigned Depth, + AssumptionCache *AC, const Instruction *CxtI, + const DominatorTree *DT, + OptimizationRemarkEmitter *ORE, + bool UseInstrInfo) { + return ::computeKnownBits( + V, DemandedElts, Depth, + Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo, ORE)); +} + bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, 
const DominatorTree *DT, @@ -275,6 +298,9 @@ bool llvm::isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, V, OrZero, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo)); } +static bool isKnownNonZero(const Value *V, const APInt &DemandedElts, + unsigned Depth, const Query &Q); + static bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q); bool llvm::isKnownNonZero(const Value *V, const DataLayout &DL, unsigned Depth, @@ -356,26 +382,27 @@ unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL, } static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1, - bool NSW, + bool NSW, const APInt &DemandedElts, KnownBits &KnownOut, KnownBits &Known2, unsigned Depth, const Query &Q) { - unsigned BitWidth = KnownOut.getBitWidth(); + computeKnownBits(Op1, DemandedElts, KnownOut, Depth + 1, Q); - // If an initial sequence of bits in the result is not needed, the - // corresponding bits in the operands are not needed. - KnownBits LHSKnown(BitWidth); - computeKnownBits(Op0, LHSKnown, Depth + 1, Q); - computeKnownBits(Op1, Known2, Depth + 1, Q); + // If one operand is unknown and we have no nowrap information, + // the result will be unknown independently of the second operand. + if (KnownOut.isUnknown() && !NSW) + return; - KnownOut = KnownBits::computeForAddSub(Add, NSW, LHSKnown, Known2); + computeKnownBits(Op0, DemandedElts, Known2, Depth + 1, Q); + KnownOut = KnownBits::computeForAddSub(Add, NSW, Known2, KnownOut); } static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW, - KnownBits &Known, KnownBits &Known2, - unsigned Depth, const Query &Q) { + const APInt &DemandedElts, KnownBits &Known, + KnownBits &Known2, unsigned Depth, + const Query &Q) { unsigned BitWidth = Known.getBitWidth(); - computeKnownBits(Op1, Known, Depth + 1, Q); - computeKnownBits(Op0, Known2, Depth + 1, Q); + computeKnownBits(Op1, DemandedElts, Known, Depth + 1, Q); + computeKnownBits(Op0, DemandedElts, Known2, Depth + 1, Q); bool isKnownNegative = false; bool isKnownNonNegative = false; @@ -1002,16 +1029,17 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known, /// amount. The results from calling KZF and KOF are conservatively combined for /// all permitted shift amounts. static void computeKnownBitsFromShiftOperator( - const Operator *I, KnownBits &Known, KnownBits &Known2, - unsigned Depth, const Query &Q, + const Operator *I, const APInt &DemandedElts, KnownBits &Known, + KnownBits &Known2, unsigned Depth, const Query &Q, function_ref KZF, function_ref KOF) { unsigned BitWidth = Known.getBitWidth(); - if (auto *SA = dyn_cast(I->getOperand(1))) { - unsigned ShiftAmt = SA->getLimitedValue(BitWidth-1); + computeKnownBits(I->getOperand(1), DemandedElts, Known, Depth + 1, Q); + if (Known.isConstant()) { + unsigned ShiftAmt = Known.getConstant().getLimitedValue(BitWidth - 1); - computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q); Known.Zero = KZF(Known.Zero, ShiftAmt); Known.One = KOF(Known.One, ShiftAmt); // If the known bits conflict, this must be an overflowing left shift, so @@ -1023,11 +1051,10 @@ static void computeKnownBitsFromShiftOperator( return; } - computeKnownBits(I->getOperand(1), Known, Depth + 1, Q); - // If the shift amount could be greater than or equal to the bit-width of the // LHS, the value could be poison, but bail out because the check below is - // expensive. TODO: Should we just carry on? + // expensive. 
+ // TODO: Should we just carry on? if (Known.getMaxValue().uge(BitWidth)) { Known.resetAll(); return; @@ -1051,12 +1078,13 @@ static void computeKnownBitsFromShiftOperator( // Early exit if we can't constrain any well-defined shift amount. if (!(ShiftAmtKZ & (PowerOf2Ceil(BitWidth) - 1)) && !(ShiftAmtKO & (PowerOf2Ceil(BitWidth) - 1))) { - ShifterOperandIsNonZero = isKnownNonZero(I->getOperand(1), Depth + 1, Q); + ShifterOperandIsNonZero = + isKnownNonZero(I->getOperand(1), DemandedElts, Depth + 1, Q); if (!*ShifterOperandIsNonZero) return; } - computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q); Known.Zero.setAllBits(); Known.One.setAllBits(); @@ -1073,7 +1101,7 @@ static void computeKnownBitsFromShiftOperator( if (ShiftAmt == 0) { if (!ShifterOperandIsNonZero.hasValue()) ShifterOperandIsNonZero = - isKnownNonZero(I->getOperand(1), Depth + 1, Q); + isKnownNonZero(I->getOperand(1), DemandedElts, Depth + 1, Q); if (*ShifterOperandIsNonZero) continue; } @@ -1121,7 +1149,7 @@ static void computeKnownBitsFromOperator(const Operator *I, if (!Known.Zero[0] && !Known.One[0] && match(I, m_c_BinOp(m_Value(X), m_Add(m_Deferred(X), m_Value(Y))))) { Known2.resetAll(); - computeKnownBits(Y, Known2, Depth + 1, Q); + computeKnownBits(Y, DemandedElts, Known2, Depth + 1, Q); if (Known2.countMinTrailingOnes() > 0) Known.Zero.setBit(0); } @@ -1149,8 +1177,8 @@ static void computeKnownBitsFromOperator(const Operator *I, } case Instruction::Mul: { bool NSW = Q.IIQ.hasNoSignedWrap(cast(I)); - computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, Known, - Known2, Depth, Q); + computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, DemandedElts, + Known, Known2, Depth, Q); break; } case Instruction::UDiv: { @@ -1301,7 +1329,8 @@ static void computeKnownBitsFromOperator(const Operator *I, return KOResult; }; - computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF); + computeKnownBitsFromShiftOperator(I, DemandedElts, Known, Known2, Depth, Q, + KZF, KOF); break; } case Instruction::LShr: { @@ -1317,7 +1346,8 @@ static void computeKnownBitsFromOperator(const Operator *I, return KnownOne.lshr(ShiftAmt); }; - computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF); + computeKnownBitsFromShiftOperator(I, DemandedElts, Known, Known2, Depth, Q, + KZF, KOF); break; } case Instruction::AShr: { @@ -1330,19 +1360,20 @@ static void computeKnownBitsFromOperator(const Operator *I, return KnownOne.ashr(ShiftAmt); }; - computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF); + computeKnownBitsFromShiftOperator(I, DemandedElts, Known, Known2, Depth, Q, + KZF, KOF); break; } case Instruction::Sub: { bool NSW = Q.IIQ.hasNoSignedWrap(cast(I)); computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW, - Known, Known2, Depth, Q); + DemandedElts, Known, Known2, Depth, Q); break; } case Instruction::Add: { bool NSW = Q.IIQ.hasNoSignedWrap(cast(I)); computeKnownBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW, - Known, Known2, Depth, Q); + DemandedElts, Known, Known2, Depth, Q); break; } case Instruction::SRem: @@ -1706,7 +1737,12 @@ static void computeKnownBitsFromOperator(const Operator *I, } break; case Instruction::ShuffleVector: { - auto *Shuf = cast(I); + auto *Shuf = dyn_cast(I); + // FIXME: Do we need to handle ConstantExpr involving shufflevectors? 
+ if (!Shuf) { + Known.resetAll(); + return; + } // For undef elements, we don't know anything about the common state of // the shuffle result. APInt DemandedLHS, DemandedRHS; @@ -1732,10 +1768,9 @@ static void computeKnownBitsFromOperator(const Operator *I, break; } case Instruction::InsertElement: { - auto *IEI = cast(I); - Value *Vec = IEI->getOperand(0); - Value *Elt = IEI->getOperand(1); - auto *CIdx = dyn_cast(IEI->getOperand(2)); + const Value *Vec = I->getOperand(0); + const Value *Elt = I->getOperand(1); + auto *CIdx = dyn_cast(I->getOperand(2)); // Early out if the index is non-constant or out-of-range. unsigned NumElts = DemandedElts.getBitWidth(); if (!CIdx || CIdx->getValue().uge(NumElts)) { @@ -1765,9 +1800,8 @@ static void computeKnownBitsFromOperator(const Operator *I, case Instruction::ExtractElement: { // Look through extract element. If the index is non-constant or // out-of-range demand all elements, otherwise just the extracted element. - auto* EEI = cast(I); - const Value* Vec = EEI->getVectorOperand(); - const Value* Idx = EEI->getIndexOperand(); + const Value *Vec = I->getOperand(0); + const Value *Idx = I->getOperand(1); auto *CIdx = dyn_cast(Idx); unsigned NumElts = Vec->getType()->getVectorNumElements(); APInt DemandedVecElts = APInt::getAllOnesValue(NumElts); @@ -1786,19 +1820,19 @@ static void computeKnownBitsFromOperator(const Operator *I, case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: computeKnownBitsAddSub(true, II->getArgOperand(0), - II->getArgOperand(1), false, Known, Known2, - Depth, Q); + II->getArgOperand(1), false, DemandedElts, + Known, Known2, Depth, Q); break; case Intrinsic::usub_with_overflow: case Intrinsic::ssub_with_overflow: computeKnownBitsAddSub(false, II->getArgOperand(0), - II->getArgOperand(1), false, Known, Known2, - Depth, Q); + II->getArgOperand(1), false, DemandedElts, + Known, Known2, Depth, Q); break; case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: computeKnownBitsMul(II->getArgOperand(0), II->getArgOperand(1), false, - Known, Known2, Depth, Q); + DemandedElts, Known, Known2, Depth, Q); break; } } @@ -1807,6 +1841,15 @@ static void computeKnownBitsFromOperator(const Operator *I, } } +/// Determine which bits of V are known to be either zero or one and return +/// them. +KnownBits computeKnownBits(const Value *V, const APInt &DemandedElts, + unsigned Depth, const Query &Q) { + KnownBits Known(getBitWidth(V->getType(), Q.DL)); + computeKnownBits(V, DemandedElts, Known, Depth, Q); + return Known; +} + /// Determine which bits of V are known to be either zero or one and return /// them. KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q) { @@ -2219,12 +2262,13 @@ static bool rangeMetadataExcludesValue(const MDNode* Ranges, const APInt& Value) } /// Return true if the given value is known to be non-zero when defined. For -/// vectors, return true if every element is known to be non-zero when +/// vectors, return true if every demanded element is known to be non-zero when /// defined. For pointers, if the context instruction and dominator tree are /// specified, perform context-sensitive analysis and return true if the /// pointer couldn't possibly be null at the specified instruction. /// Supports values with integer or pointer type and vectors of integers. 
-bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { +bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth, + const Query &Q) { if (auto *C = dyn_cast(V)) { if (C->isNullValue()) return false; @@ -2245,6 +2289,8 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { // non-zero to determine that the whole vector is known non-zero. if (auto *VecTy = dyn_cast(C->getType())) { for (unsigned i = 0, e = VecTy->getNumElements(); i != e; ++i) { + if (!DemandedElts[i]) + continue; Constant *Elt = C->getAggregateElement(i); if (!Elt || Elt->isNullValue()) return false; @@ -2345,7 +2391,8 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { // X | Y != 0 if X != 0 or Y != 0. Value *X = nullptr, *Y = nullptr; if (match(V, m_Or(m_Value(X), m_Value(Y)))) - return isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q); + return isKnownNonZero(X, DemandedElts, Depth, Q) || + isKnownNonZero(Y, DemandedElts, Depth, Q); // ext X != 0 if X != 0. if (isa(V) || isa(V)) @@ -2360,7 +2407,7 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { return isKnownNonZero(X, Depth, Q); KnownBits Known(BitWidth); - computeKnownBits(X, Known, Depth, Q); + computeKnownBits(X, DemandedElts, Known, Depth, Q); if (Known.One[0]) return true; } @@ -2372,7 +2419,7 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { if (BO->isExact()) return isKnownNonZero(X, Depth, Q); - KnownBits Known = computeKnownBits(X, Depth, Q); + KnownBits Known = computeKnownBits(X, DemandedElts, Depth, Q); if (Known.isNegative()) return true; @@ -2386,22 +2433,23 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { return true; // Are all the bits to be shifted out known zero? if (Known.countMinTrailingZeros() >= ShiftVal) - return isKnownNonZero(X, Depth, Q); + return isKnownNonZero(X, DemandedElts, Depth, Q); } } // div exact can only produce a zero if the dividend is zero. else if (match(V, m_Exact(m_IDiv(m_Value(X), m_Value())))) { - return isKnownNonZero(X, Depth, Q); + return isKnownNonZero(X, DemandedElts, Depth, Q); } // X + Y. else if (match(V, m_Add(m_Value(X), m_Value(Y)))) { - KnownBits XKnown = computeKnownBits(X, Depth, Q); - KnownBits YKnown = computeKnownBits(Y, Depth, Q); + KnownBits XKnown = computeKnownBits(X, DemandedElts, Depth, Q); + KnownBits YKnown = computeKnownBits(Y, DemandedElts, Depth, Q); // If X and Y are both non-negative (as signed values) then their sum is not // zero unless both X and Y are zero. if (XKnown.isNonNegative() && YKnown.isNonNegative()) - if (isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q)) + if (isKnownNonZero(X, DemandedElts, Depth, Q) || + isKnownNonZero(Y, DemandedElts, Depth, Q)) return true; // If X and Y are both negative (as signed values) then their sum is not @@ -2432,13 +2480,14 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { // If X and Y are non-zero then so is X * Y as long as the multiplication // does not overflow. if ((Q.IIQ.hasNoSignedWrap(BO) || Q.IIQ.hasNoUnsignedWrap(BO)) && - isKnownNonZero(X, Depth, Q) && isKnownNonZero(Y, Depth, Q)) + isKnownNonZero(X, DemandedElts, Depth, Q) && + isKnownNonZero(Y, DemandedElts, Depth, Q)) return true; } // (C ? X : Y) != 0 if X != 0 and Y != 0. 
else if (const SelectInst *SI = dyn_cast(V)) { - if (isKnownNonZero(SI->getTrueValue(), Depth, Q) && - isKnownNonZero(SI->getFalseValue(), Depth, Q)) + if (isKnownNonZero(SI->getTrueValue(), DemandedElts, Depth, Q) && + isKnownNonZero(SI->getFalseValue(), DemandedElts, Depth, Q)) return true; } // PHI @@ -2468,12 +2517,31 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { if (AllNonZeroConstants) return true; } + // ExtractElement + else if (const auto *EEI = dyn_cast(V)) { + const Value *Vec = EEI->getVectorOperand(); + const Value *Idx = EEI->getIndexOperand(); + auto *CIdx = dyn_cast(Idx); + unsigned NumElts = Vec->getType()->getVectorNumElements(); + APInt DemandedVecElts = APInt::getAllOnesValue(NumElts); + if (CIdx && CIdx->getValue().ult(NumElts)) + DemandedVecElts = APInt::getOneBitSet(NumElts, CIdx->getZExtValue()); + return isKnownNonZero(Vec, DemandedVecElts, Depth, Q); + } KnownBits Known(BitWidth); - computeKnownBits(V, Known, Depth, Q); + computeKnownBits(V, DemandedElts, Known, Depth, Q); return Known.One != 0; } +bool isKnownNonZero(const Value* V, unsigned Depth, const Query& Q) { + Type *Ty = V->getType(); + APInt DemandedElts = Ty->isVectorTy() + ? APInt::getAllOnesValue(Ty->getVectorNumElements()) + : APInt(1, 1); + return isKnownNonZero(V, DemandedElts, Depth, Q); +} + /// Return true if V2 == V1 + X, where X is known non-zero. static bool isAddOfNonZero(const Value *V1, const Value *V2, const Query &Q) { const BinaryOperator *BO = dyn_cast(V1); @@ -3496,8 +3564,8 @@ Value *llvm::isBytewiseValue(Value *V, const DataLayout &DL) { if (isa(V)) return UndefInt8; - const uint64_t Size = DL.getTypeStoreSize(V->getType()); - if (!Size) + // Return Undef for zero-sized type. + if (!DL.getTypeStoreSize(V->getType()).isNonZero()) return UndefInt8; Constant *C = dyn_cast(V); @@ -3815,7 +3883,7 @@ bool llvm::getConstantDataArrayInfo(const Value *V, Array = nullptr; } else { const DataLayout &DL = GV->getParent()->getDataLayout(); - uint64_t SizeInBytes = DL.getTypeStoreSize(GVTy); + uint64_t SizeInBytes = DL.getTypeStoreSize(GVTy).getFixedSize(); uint64_t Length = SizeInBytes / (ElementSize / 8); if (Length <= Offset) return false; @@ -4542,9 +4610,22 @@ bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V, // TODO: Some instructions are guaranteed to return neither undef // nor poison if their arguments are not poison/undef. - // TODO: Deal with other Constant subclasses. - if (isa(V) || isa(V)) - return true; + if (auto *C = dyn_cast(V)) { + // TODO: We can analyze ConstExpr by opcode to determine if there is any + // possibility of poison. + if (isa(C) || isa(C)) + return false; + + // TODO: Add ConstantFP and pointers. + if (isa(C) || isa(C) ) + return true; + + if (C->getType()->isVectorTy()) + return !C->containsUndefElement() && !C->containsConstantExpression(); + + // TODO: Recursively analyze aggregates or other constants. + return false; + } if (auto PN = dyn_cast(V)) { if (llvm::all_of(PN->incoming_values(), [](const Use &U) { @@ -6146,10 +6227,12 @@ getOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, const DataLayout &DL) { continue; } - // Otherwise, we have a sequential type like an array or vector. Multiply - // the index by the ElementSize. - uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType()); - Offset += Size * OpC->getSExtValue(); + // Otherwise, we have a sequential type like an array or fixed-length + // vector. Multiply the index by the ElementSize. 
+ TypeSize Size = DL.getTypeAllocSize(GTI.getIndexedType()); + if (Size.isScalable()) + return None; + Offset += Size.getFixedSize() * OpC->getSExtValue(); } return Offset; diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 7d0050ff8eaac..f44457f3b054b 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2090,21 +2090,10 @@ void AsmPrinter::emitXXStructorList(const DataLayout &DL, const Constant *List, // init priority. if (!isa(List)) return; - // Sanity check the structors list. - const ConstantArray *InitList = dyn_cast(List); - if (!InitList) return; // Not an array! - StructType *ETy = dyn_cast(InitList->getType()->getElementType()); - if (!ETy || ETy->getNumElements() != 3 || - !isa(ETy->getTypeAtIndex(0U)) || - !isa(ETy->getTypeAtIndex(1U)) || - !isa(ETy->getTypeAtIndex(2U))) - return; // Not (int, ptr, ptr). - // Gather the structors in a form that's convenient for sorting by priority. SmallVector Structors; - for (Value *O : InitList->operands()) { - ConstantStruct *CS = dyn_cast(O); - if (!CS) continue; // Malformed. + for (Value *O : cast(List)->operands()) { + auto *CS = cast(O); if (CS->getOperand(1)->isNullValue()) break; // Found a null terminator, skip the rest. ConstantInt *Priority = dyn_cast(CS->getOperand(0)); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 4a570dbbe339d..889303b4278ed 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -925,13 +925,12 @@ void DwarfCompileUnit::constructAbstractSubprogramScopeDIE( ContextCU->addDIEEntry(*AbsDef, dwarf::DW_AT_object_pointer, *ObjectPointer); } -/// Whether to use the GNU analog for a DWARF5 tag, attribute, or location atom. -static bool useGNUAnalogForDwarf5Feature(DwarfDebug *DD) { +bool DwarfCompileUnit::useGNUAnalogForDwarf5Feature() const { return DD->getDwarfVersion() == 4 && DD->tuneForGDB(); } dwarf::Tag DwarfCompileUnit::getDwarf5OrGNUTag(dwarf::Tag Tag) const { - if (!useGNUAnalogForDwarf5Feature(DD)) + if (!useGNUAnalogForDwarf5Feature()) return Tag; switch (Tag) { case dwarf::DW_TAG_call_site: @@ -945,7 +944,7 @@ dwarf::Tag DwarfCompileUnit::getDwarf5OrGNUTag(dwarf::Tag Tag) const { dwarf::Attribute DwarfCompileUnit::getDwarf5OrGNUAttr(dwarf::Attribute Attr) const { - if (!useGNUAnalogForDwarf5Feature(DD)) + if (!useGNUAnalogForDwarf5Feature()) return Attr; switch (Attr) { case dwarf::DW_AT_call_all_calls: @@ -967,7 +966,7 @@ DwarfCompileUnit::getDwarf5OrGNUAttr(dwarf::Attribute Attr) const { dwarf::LocationAtom DwarfCompileUnit::getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const { - if (!useGNUAnalogForDwarf5Feature(DD)) + if (!useGNUAnalogForDwarf5Feature()) return Loc; switch (Loc) { case dwarf::DW_OP_entry_value: @@ -981,6 +980,7 @@ DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE, DIE *CalleeDIE, bool IsTail, const MCSymbol *PCAddr, + const MCSymbol *CallAddr, unsigned CallReg) { // Insert a call site entry DIE within ScopeDIE. DIE &CallSiteDIE = createAndAddDIE(getDwarf5OrGNUTag(dwarf::DW_TAG_call_site), @@ -996,16 +996,33 @@ DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE, *CalleeDIE); } - if (IsTail) + if (IsTail) { // Attach DW_AT_call_tail_call to tail calls for standards compliance. 
addFlag(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_tail_call)); + // Attach the address of the branch instruction to allow the debugger to + // show where the tail call occurred. This attribute has no GNU analog. + // + // GDB works backwards from non-standard usage of DW_AT_low_pc (in DWARF4 + // mode -- equivalently, in DWARF5 mode, DW_AT_call_return_pc) at tail-call + // site entries to figure out the PC of tail-calling branch instructions. + // This means it doesn't need the compiler to emit DW_AT_call_pc, so we + // don't emit it here. + // + // There's no need to tie non-GDB debuggers to this non-standardness, as it + // adds unnecessary complexity to the debugger. For non-GDB debuggers, emit + // the standard DW_AT_call_pc info. + if (!useGNUAnalogForDwarf5Feature()) + addLabelAddress(CallSiteDIE, dwarf::DW_AT_call_pc, CallAddr); + } + // Attach the return PC to allow the debugger to disambiguate call paths // from one function to another. // // The return PC is only really needed when the call /isn't/ a tail call, but - // for some reason GDB always expects it. - if (!IsTail || DD->tuneForGDB()) { + // GDB expects it in DWARF4 mode, even for tail calls (see the comment above + // the DW_AT_call_pc emission logic for an explanation). + if (!IsTail || useGNUAnalogForDwarf5Feature()) { assert(PCAddr && "Missing return PC information for a call"); addLabelAddress(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_return_pc), PCAddr); diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 52e1c71a8e653..5d0afee4c3df0 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -230,6 +230,10 @@ class DwarfCompileUnit final : public DwarfUnit { void constructAbstractSubprogramScopeDIE(LexicalScope *Scope); + /// Whether to use the GNU analog for a DWARF5 tag, attribute, or location + /// atom. Only applicable when emitting otherwise DWARF4-compliant debug info. + bool useGNUAnalogForDwarf5Feature() const; + /// This takes a DWARF 5 tag and returns it or a GNU analog. dwarf::Tag getDwarf5OrGNUTag(dwarf::Tag Tag) const; @@ -245,10 +249,12 @@ class DwarfCompileUnit final : public DwarfUnit { /// For indirect calls \p CalleeDIE is set to nullptr. /// \p IsTail specifies whether the call is a tail call. /// \p PCAddr points to the PC value after the call instruction. + /// \p CallAddr points to the PC value at the call instruction (or is null). /// \p CallReg is a register location for an indirect call. For direct calls /// the \p CallReg is set to 0. DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, DIE *CalleeDIE, bool IsTail, - const MCSymbol *PCAddr, unsigned CallReg); + const MCSymbol *PCAddr, + const MCSymbol *CallAddr, unsigned CallReg); /// Construct call site parameter DIEs for the \p CallSiteDIE. The \p Params /// were collected by the \ref collectCallSiteParameters. 
/// Note: The order of parameters does not matter, since debuggers recognize diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 7efeb1a3736ad..94ceab2c3b82b 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -95,6 +95,10 @@ static cl::opt UseDwarfRangesBaseAddressSpecifier( "use-dwarf-ranges-base-address-specifier", cl::Hidden, cl::desc("Use base address specifiers in debug_ranges"), cl::init(false)); +static cl::opt EmitDwarfDebugEntryValues( + "emit-debug-entry-values", cl::Hidden, + cl::desc("Emit the debug entry values"), cl::init(false)); + static cl::opt GenerateARangeSection("generate-arange-section", cl::Hidden, cl::desc("Generate dwarf aranges"), @@ -419,6 +423,12 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) // a monolithic string offsets table without any header. UseSegmentedStringOffsetsTable = DwarfVersion >= 5; + // Emit call-site-param debug info for GDB and LLDB, if the target supports + // the debug entry values feature. It can also be enabled explicitly. + EmitDebugEntryValues = (Asm->TM.Options.ShouldEmitDebugEntryValues() && + (tuneForGDB() || tuneForLLDB())) || + EmitDwarfDebugEntryValues; + Asm->OutStreamer->getContext().setDwarfVersion(DwarfVersion); } @@ -562,6 +572,18 @@ struct FwdRegParamInfo { /// Register worklist for finding call site values. using FwdRegWorklist = MapVector>; +/// Append the expression \p Addition to \p Original and return the result. +static const DIExpression *combineDIExpressions(const DIExpression *Original, + const DIExpression *Addition) { + std::vector Elts = Addition->getElements().vec(); + // Avoid multiple DW_OP_stack_values. + if (Original->isImplicit() && Addition->isImplicit()) + erase_if(Elts, [](uint64_t Op) { return Op == dwarf::DW_OP_stack_value; }); + const DIExpression *CombinedExpr = + (Elts.size() > 0) ? DIExpression::append(Original, Elts) : Original; + return CombinedExpr; +} + /// Emit call site parameter entries that are described by the given value and /// debug expression. template @@ -581,9 +603,8 @@ static void finishCallSiteParams(ValT Val, const DIExpression *Expr, // parameter when walking through the instructions. Append that to the // base expression. const DIExpression *CombinedExpr = - ShouldCombineExpressions - ? DIExpression::append(Expr, Param.Expr->getElements()) - : Expr; + ShouldCombineExpressions ? combineDIExpressions(Expr, Param.Expr) + : Expr; assert((!CombinedExpr || CombinedExpr->isValid()) && "Combined debug expression is invalid"); @@ -613,15 +634,7 @@ static void addToFwdRegWorklist(FwdRegWorklist &Worklist, unsigned Reg, // instructions we may have already created an expression for the // parameter when walking through the instructions. Append that to the // new expression. - std::vector ParamElts = Param.Expr->getElements().vec(); - // Avoid multiple DW_OP_stack_values. - if (Expr->isImplicit() && Param.Expr->isImplicit()) - erase_if(ParamElts, - [](uint64_t Op) { return Op == dwarf::DW_OP_stack_value; }); - const DIExpression *CombinedExpr = - (Param.Expr->getNumElements() > 0) - ? DIExpression::append(Expr, ParamElts) - : Expr; + const DIExpression *CombinedExpr = combineDIExpressions(Expr, Param.Expr); ParamsForFwdReg.push_back({Param.ParamReg, CombinedExpr}); } } @@ -865,16 +878,21 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP, const MachineInstr *TopLevelCallMI = MI.isInsideBundle() ? 
&*getBundleStart(MI.getIterator()) : &MI; - // For tail calls, no return PC information is needed. - // For regular calls (and tail calls in GDB tuning), the return PC - // is needed to disambiguate paths in the call graph which could lead to - // some target function. + // For non-tail calls, the return PC is needed to disambiguate paths in + // the call graph which could lead to some target function. For tail + // calls, no return PC information is needed, unless tuning for GDB in + // DWARF4 mode in which case we fake a return PC for compatibility. const MCSymbol *PCAddr = - (IsTail && !tuneForGDB()) - ? nullptr - : const_cast(getLabelAfterInsn(TopLevelCallMI)); + (!IsTail || CU.useGNUAnalogForDwarf5Feature()) + ? const_cast(getLabelAfterInsn(TopLevelCallMI)) + : nullptr; + + // For tail calls, it's necessary to record the address of the branch + // instruction so that the debugger can show where the tail call occurred. + const MCSymbol *CallAddr = + IsTail ? getLabelBeforeInsn(TopLevelCallMI) : nullptr; - assert((IsTail || PCAddr) && "Call without return PC information"); + assert((IsTail || PCAddr) && "Non-tail call without return PC"); LLVM_DEBUG(dbgs() << "CallSiteEntry: " << MF.getName() << " -> " << (CalleeDecl ? CalleeDecl->getName() @@ -883,12 +901,11 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP, ->getName(CallReg))) << (IsTail ? " [IsTail]" : "") << "\n"); - DIE &CallSiteDIE = CU.constructCallSiteEntryDIE(ScopeDIE, CalleeDIE, - IsTail, PCAddr, CallReg); + DIE &CallSiteDIE = CU.constructCallSiteEntryDIE( + ScopeDIE, CalleeDIE, IsTail, PCAddr, CallAddr, CallReg); - // GDB and LLDB support call site parameter debug info. - if (Asm->TM.Options.EnableDebugEntryValues && - (tuneForGDB() || tuneForLLDB())) { + // Optionally emit call-site-param debug info. + if (emitDebugEntryValues()) { ParamSet Params; // Try to interpret values of call site parameters. collectCallSiteParameters(&MI, Params); @@ -1774,11 +1791,32 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU, // Process beginning of an instruction. void DwarfDebug::beginInstruction(const MachineInstr *MI) { + const MachineFunction &MF = *MI->getMF(); + const auto *SP = MF.getFunction().getSubprogram(); + bool NoDebug = + !SP || SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug; + + // When describing calls, we need a label for the call instruction. + // TODO: Add support for targets with delay slots. + if (!NoDebug && SP->areAllCallsDescribed() && + MI->isCandidateForCallSiteEntry(MachineInstr::AnyInBundle) && + !MI->hasDelaySlot()) { + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + bool IsTail = TII->isTailCall(*MI); + // For tail calls, we need the address of the branch instruction for + // DW_AT_call_pc. + if (IsTail) + requestLabelBeforeInsn(MI); + // For non-tail calls, we need the return address for the call for + // DW_AT_call_return_pc. Under GDB tuning, this information is needed for + // tail calls as well. + requestLabelAfterInsn(MI); + } + DebugHandlerBase::beginInstruction(MI); assert(CurMI); - const auto *SP = MI->getMF()->getFunction().getSubprogram(); - if (!SP || SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) + if (NoDebug) return; // Check if source location changes, but ignore DBG_VALUE and CFI locations. 
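Earlier in this file, the DwarfDebug constructor now derives EmitDebugEntryValues from TM.Options.ShouldEmitDebugEntryValues(), the debugger tuning, and the -emit-debug-entry-values override. A small self-contained model of how those inputs presumably combine; the body of ShouldEmitDebugEntryValues is not shown in this patch, so treating it as SupportsDebugEntryValues || EnableDebugEntryValues is an assumption, and all names below are illustrative:

    // Stand-in for the two TargetOptions bits added in this patch.
    struct EntryValueOpts {
      bool SupportsDebugEntryValues = false; // target enables the feature by default
      bool EnableDebugEntryValues = false;   // -debug-entry-values testing override

      // Assumption: the accessor simply ORs the target default with the override.
      bool shouldEmitDebugEntryValues() const {
        return SupportsDebugEntryValues || EnableDebugEntryValues;
      }
    };

    // Mirrors the constructor logic quoted above: emit call-site parameter info
    // for GDB/LLDB when the target (or the user) enables entry values, or
    // unconditionally under -emit-debug-entry-values.
    bool computeEmitDebugEntryValues(const EntryValueOpts &Opts, bool TuneForGDB,
                                     bool TuneForLLDB, bool ForceViaEmitFlag) {
      return (Opts.shouldEmitDebugEntryValues() && (TuneForGDB || TuneForLLDB)) ||
             ForceViaEmitFlag;
    }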
@@ -1792,11 +1830,6 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { unsigned LastAsmLine = Asm->OutStreamer->getContext().getCurrentDwarfLoc().getLine(); - // Request a label after the call in order to emit AT_return_pc information - // in call site entries. TODO: Add support for targets with delay slots. - if (SP->areAllCallsDescribed() && MI->isCall() && !MI->hasDelaySlot()) - requestLabelAfterInsn(MI); - if (DL == PrevInstLoc) { // If we have an ongoing unspecified location, nothing to do here. if (!DL) diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h index a44960589d89f..882fc739d792a 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -386,6 +386,11 @@ class DwarfDebug : public DebugHandlerBase { /// a monolithic sequence of string offsets. bool UseSegmentedStringOffsetsTable; + /// Enable production of call site parameters needed to print the debug entry + /// values. Useful for testing purposes when a debugger does not support the + /// feature yet. + bool EmitDebugEntryValues; + /// Separated Dwarf Variables /// In general these will all be for bits that are left in the /// original object file, rather than things that are meant @@ -708,6 +713,10 @@ class DwarfDebug : public DebugHandlerBase { return UseSegmentedStringOffsetsTable; } + bool emitDebugEntryValues() const { + return EmitDebugEntryValues; + } + bool shareAcrossDWOCUs() const; /// Returns the Dwarf Version. diff --git a/llvm/lib/CodeGen/CFIInstrInserter.cpp b/llvm/lib/CodeGen/CFIInstrInserter.cpp index 2fc436e63496d..ef548c84d3c00 100644 --- a/llvm/lib/CodeGen/CFIInstrInserter.cpp +++ b/llvm/lib/CodeGen/CFIInstrInserter.cpp @@ -18,7 +18,6 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SetOperations.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" @@ -77,10 +76,6 @@ class CFIInstrInserter : public MachineFunctionPass { unsigned IncomingCFARegister = 0; /// Value of cfa register valid at basic block exit. unsigned OutgoingCFARegister = 0; - /// Set of callee saved registers saved at basic block entry. - BitVector IncomingCSRSaved; - /// Set of callee saved registers saved at basic block exit. - BitVector OutgoingCSRSaved; /// If in/out cfa offset and register values for this block have already /// been set or not. bool Processed = false; @@ -113,8 +108,7 @@ class CFIInstrInserter : public MachineFunctionPass { return -MBBVector[MBB->getNumber()].IncomingCFAOffset; } - void reportCFAError(const MBBCFAInfo &Pred, const MBBCFAInfo &Succ); - void reportCSRError(const MBBCFAInfo &Pred, const MBBCFAInfo &Succ); + void report(const MBBCFAInfo &Pred, const MBBCFAInfo &Succ); /// Go through each MBB in a function and check that outgoing offset and /// register of its predecessors match incoming offset and register of that /// MBB, as well as that incoming offset and register of its successors match @@ -138,8 +132,6 @@ void CFIInstrInserter::calculateCFAInfo(MachineFunction &MF) { // function. unsigned InitialRegister = MF.getSubtarget().getFrameLowering()->getInitialCFARegister(MF); - const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - unsigned NumRegs = TRI.getNumRegs(); // Initialize MBBMap. 
for (MachineBasicBlock &MBB : MF) { @@ -149,8 +141,6 @@ void CFIInstrInserter::calculateCFAInfo(MachineFunction &MF) { MBBInfo.OutgoingCFAOffset = InitialOffset; MBBInfo.IncomingCFARegister = InitialRegister; MBBInfo.OutgoingCFARegister = InitialRegister; - MBBInfo.IncomingCSRSaved.resize(NumRegs); - MBBInfo.OutgoingCSRSaved.resize(NumRegs); MBBVector[MBB.getNumber()] = MBBInfo; } @@ -169,11 +159,8 @@ void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) { int SetOffset = MBBInfo.IncomingCFAOffset; // Outgoing cfa register set by the block. unsigned SetRegister = MBBInfo.IncomingCFARegister; - MachineFunction *MF = MBBInfo.MBB->getParent(); - const std::vector &Instrs = MF->getFrameInstructions(); - const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo(); - unsigned NumRegs = TRI.getNumRegs(); - BitVector CSRSaved(NumRegs), CSRRestored(NumRegs); + const std::vector &Instrs = + MBBInfo.MBB->getParent()->getFrameInstructions(); // Determine cfa offset and register set by the block. for (MachineInstr &MI : *MBBInfo.MBB) { @@ -194,15 +181,6 @@ void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) { SetRegister = CFI.getRegister(); SetOffset = CFI.getOffset(); break; - case MCCFIInstruction::OpOffset: - case MCCFIInstruction::OpRegister: - case MCCFIInstruction::OpRelOffset: - CSRSaved.set(CFI.getRegister()); - break; - case MCCFIInstruction::OpRestore: - case MCCFIInstruction::OpUndefined: - CSRRestored.set(CFI.getRegister()); - break; case MCCFIInstruction::OpRememberState: // TODO: Add support for handling cfi_remember_state. #ifndef NDEBUG @@ -221,7 +199,12 @@ void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) { break; // Other CFI directives do not affect CFA value. case MCCFIInstruction::OpSameValue: + case MCCFIInstruction::OpOffset: + case MCCFIInstruction::OpRelOffset: case MCCFIInstruction::OpEscape: + case MCCFIInstruction::OpRestore: + case MCCFIInstruction::OpUndefined: + case MCCFIInstruction::OpRegister: case MCCFIInstruction::OpWindowSave: case MCCFIInstruction::OpNegateRAState: case MCCFIInstruction::OpGnuArgsSize: @@ -235,11 +218,6 @@ void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) { // Update outgoing CFA info. MBBInfo.OutgoingCFAOffset = SetOffset; MBBInfo.OutgoingCFARegister = SetRegister; - - // Update outgoing CSR info. 
- MBBInfo.OutgoingCSRSaved = MBBInfo.IncomingCSRSaved; - MBBInfo.OutgoingCSRSaved |= CSRSaved; - MBBInfo.OutgoingCSRSaved.reset(CSRRestored); } void CFIInstrInserter::updateSuccCFAInfo(MBBCFAInfo &MBBInfo) { @@ -258,7 +236,6 @@ void CFIInstrInserter::updateSuccCFAInfo(MBBCFAInfo &MBBInfo) { if (!SuccInfo.Processed) { SuccInfo.IncomingCFAOffset = CurrentInfo.OutgoingCFAOffset; SuccInfo.IncomingCFARegister = CurrentInfo.OutgoingCFARegister; - SuccInfo.IncomingCSRSaved = CurrentInfo.OutgoingCSRSaved; Stack.push_back(Succ); } } @@ -310,23 +287,12 @@ bool CFIInstrInserter::insertCFIInstrs(MachineFunction &MF) { .addCFIIndex(CFIIndex); InsertedCFIInstr = true; } - - BitVector SetDifference = PrevMBBInfo->OutgoingCSRSaved; - SetDifference.reset(MBBInfo.IncomingCSRSaved); - for (int Reg : SetDifference.set_bits()) { - unsigned CFIIndex = - MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, Reg)); - BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex); - InsertedCFIInstr = true; - } PrevMBBInfo = &MBBInfo; } return InsertedCFIInstr; } -void CFIInstrInserter::reportCFAError(const MBBCFAInfo &Pred, - const MBBCFAInfo &Succ) { +void CFIInstrInserter::report(const MBBCFAInfo &Pred, const MBBCFAInfo &Succ) { errs() << "*** Inconsistent CFA register and/or offset between pred and succ " "***\n"; errs() << "Pred: " << Pred.MBB->getName() << " #" << Pred.MBB->getNumber() @@ -341,22 +307,6 @@ void CFIInstrInserter::reportCFAError(const MBBCFAInfo &Pred, << " incoming CFA Offset:" << Succ.IncomingCFAOffset << "\n"; } -void CFIInstrInserter::reportCSRError(const MBBCFAInfo &Pred, - const MBBCFAInfo &Succ) { - errs() << "*** Inconsistent CSR Saved between pred and succ in function " - << Pred.MBB->getParent()->getName() << " ***\n"; - errs() << "Pred: " << Pred.MBB->getName() << " #" << Pred.MBB->getNumber() - << " outgoing CSR Saved: "; - for (int Reg : Pred.OutgoingCSRSaved.set_bits()) - errs() << Reg << " "; - errs() << "\n"; - errs() << "Succ: " << Succ.MBB->getName() << " #" << Succ.MBB->getNumber() - << " incoming CSR Saved: "; - for (int Reg : Succ.IncomingCSRSaved.set_bits()) - errs() << Reg << " "; - errs() << "\n"; -} - unsigned CFIInstrInserter::verify(MachineFunction &MF) { unsigned ErrorNum = 0; for (auto *CurrMBB : depth_first(&MF)) { @@ -371,13 +321,7 @@ unsigned CFIInstrInserter::verify(MachineFunction &MF) { // we don't generate epilogues inside such blocks. if (SuccMBBInfo.MBB->succ_empty() && !SuccMBBInfo.MBB->isReturnBlock()) continue; - reportCFAError(CurrMBBInfo, SuccMBBInfo); - ErrorNum++; - } - // Check that IncomingCSRSaved of every successor matches the - // OutgoingCSRSaved of CurrMBB - if (SuccMBBInfo.IncomingCSRSaved != CurrMBBInfo.OutgoingCSRSaved) { - reportCSRError(CurrMBBInfo, SuccMBBInfo); + report(CurrMBBInfo, SuccMBBInfo); ErrorNum++; } } diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 1cccd6fade4ef..1fd5154cbf5be 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -7200,7 +7200,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) { } if (FreezeInst *FI = dyn_cast(I)) { - // br(freeze(icmp a, const)) -> br(icmp (freeze a), const) + // freeze(icmp a, const)) -> icmp (freeze a), const // This helps generate efficient conditional jumps. 
Instruction *CmpI = nullptr; if (ICmpInst *II = dyn_cast(FI->getOperand(0))) diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index 7acb84df582fa..d5dc49a91177b 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -380,7 +380,7 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { static cl::opt EnableDebugEntryValues( "debug-entry-values", - cl::desc("Emit debug info about parameter's entry values"), + cl::desc("Enable debug info for the debug entry values."), cl::init(false)); CGBINDOPT(EnableDebugEntryValues); diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index ecb46f401fb44..e9e4e2b30698a 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1490,6 +1490,54 @@ bool CombinerHelper::matchAnyExplicitUseIsUndef(MachineInstr &MI) { }); } +bool CombinerHelper::matchAllExplicitUsesAreUndef(MachineInstr &MI) { + return all_of(MI.explicit_uses(), [this](const MachineOperand &MO) { + return !MO.isReg() || + getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MO.getReg(), MRI); + }); +} + +bool CombinerHelper::matchUndefShuffleVectorMask(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + ArrayRef Mask = MI.getOperand(3).getShuffleMask(); + return all_of(Mask, [](int Elt) { return Elt < 0; }); +} + +bool CombinerHelper::matchEqualDefs(const MachineOperand &MOP1, + const MachineOperand &MOP2) { + if (!MOP1.isReg() || !MOP2.isReg()) + return false; + MachineInstr *I1 = getDefIgnoringCopies(MOP1.getReg(), MRI); + if (!I1) + return false; + MachineInstr *I2 = getDefIgnoringCopies(MOP2.getReg(), MRI); + if (!I2) + return false; + + // On the off-chance that there's some target instruction feeding into the + // select, let's use produceSameValue instead of isIdenticalTo. + return Builder.getTII().produceSameValue(*I1, *I2, &MRI); +} + +bool CombinerHelper::replaceSingleDefInstWithOperand(MachineInstr &MI, + unsigned OpIdx) { + assert(MI.getNumExplicitDefs() == 1 && "Expected one explicit def?"); + Register OldReg = MI.getOperand(0).getReg(); + Register Replacement = MI.getOperand(OpIdx).getReg(); + assert(canReplaceReg(OldReg, Replacement, MRI) && "Cannot replace register?"); + MI.eraseFromParent(); + replaceRegWith(MRI, OldReg, Replacement); + return true; +} + +bool CombinerHelper::matchSelectSameVal(MachineInstr &MI) { + assert(MI.getOpcode() == TargetOpcode::G_SELECT); + // Match (cond ? x : x) + return matchEqualDefs(MI.getOperand(2), MI.getOperand(3)) && + canReplaceReg(MI.getOperand(0).getReg(), MI.getOperand(2).getReg(), + MRI); +} + bool CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, double C) { assert(MI.getNumDefs() == 1 && "Expected only one def?"); Builder.setInstr(MI); diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp index 8e369fe9e31d4..213af320531c8 100644 --- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp +++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp @@ -65,12 +65,18 @@ KnownBits GISelKnownBits::getKnownBits(MachineInstr &MI) { } KnownBits GISelKnownBits::getKnownBits(Register R) { - KnownBits Known; - LLT Ty = MRI.getType(R); + const LLT Ty = MRI.getType(R); APInt DemandedElts = Ty.isVector() ? 
APInt::getAllOnesValue(Ty.getNumElements()) : APInt(1, 1); + return getKnownBits(R, DemandedElts); +} + +KnownBits GISelKnownBits::getKnownBits(Register R, const APInt &DemandedElts, + unsigned Depth) { // For now, we only maintain the cache during one request. assert(ComputeKnownBitsCache.empty() && "Cache should have been cleared"); + + KnownBits Known; computeKnownBitsImpl(R, Known, DemandedElts); ComputeKnownBitsCache.clear(); return Known; @@ -428,6 +434,7 @@ unsigned GISelKnownBits::computeNumSignBits(Register R, return 1; // No demanded elts, better to assume we don't know anything. LLT DstTy = MRI.getType(R); + const unsigned TyBits = DstTy.getScalarSizeInBits(); // Handle the case where this is called on a register that does not have a // type constraint. This is unlikely to occur except by looking through copies @@ -436,6 +443,7 @@ unsigned GISelKnownBits::computeNumSignBits(Register R, if (!DstTy.isValid()) return 1; + unsigned FirstAnswer = 1; switch (Opcode) { case TargetOpcode::COPY: { MachineOperand &Src = MI.getOperand(1); @@ -465,13 +473,34 @@ unsigned GISelKnownBits::computeNumSignBits(Register R, return NumSrcSignBits - (NumSrcBits - DstTyBits); break; } - default: + case TargetOpcode::G_INTRINSIC: + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + default: { + unsigned NumBits = + TL.computeNumSignBitsForTargetInstr(*this, R, DemandedElts, MRI, Depth); + if (NumBits > 1) + FirstAnswer = std::max(FirstAnswer, NumBits); break; } + } + + // Finally, if we can prove that the top bits of the result are 0's or 1's, + // use this information. + KnownBits Known = getKnownBits(R, DemandedElts, Depth); + APInt Mask; + if (Known.isNonNegative()) { // sign bit is 0 + Mask = Known.Zero; + } else if (Known.isNegative()) { // sign bit is 1; + Mask = Known.One; + } else { + // Nothing known. + return FirstAnswer; + } - // TODO: Handle target instructions - // TODO: Fall back to known bits - return 1; + // Okay, we know that the sign bit in Mask is set. Use CLO to determine + // the number of identical bits in the top of the input value. 
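  // Worked example (illustrative only): for an 8-bit value whose top three
  // bits are known to be zero, Known.Zero = 0b1110'0000, so Mask = 0b1110'0000,
  // the shift below is a no-op here (Mask is already TyBits wide), and
  // countLeadingOnes() yields 3 -- the sign bit plus two identical copies.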
+ Mask <<= Mask.getBitWidth() - TyBits; + return std::max(FirstAnswer, Mask.countLeadingOnes()); } unsigned GISelKnownBits::computeNumSignBits(Register R, unsigned Depth) { diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 22a1eae3f4806..567b2f9c50629 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1424,6 +1424,14 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, return translateOverflowIntrinsic(CI, TargetOpcode::G_UMULO, MIRBuilder); case Intrinsic::smul_with_overflow: return translateOverflowIntrinsic(CI, TargetOpcode::G_SMULO, MIRBuilder); + case Intrinsic::uadd_sat: + return translateBinaryOp(TargetOpcode::G_UADDSAT, CI, MIRBuilder); + case Intrinsic::sadd_sat: + return translateBinaryOp(TargetOpcode::G_SADDSAT, CI, MIRBuilder); + case Intrinsic::usub_sat: + return translateBinaryOp(TargetOpcode::G_USUBSAT, CI, MIRBuilder); + case Intrinsic::ssub_sat: + return translateBinaryOp(TargetOpcode::G_SSUBSAT, CI, MIRBuilder); case Intrinsic::fmuladd: { const TargetMachine &TM = MF->getTarget(); const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering(); diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index d7cbdbc3c9046..ac9b35aeb18c6 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -120,6 +120,9 @@ LegalizerHelper::legalizeInstrStep(MachineInstr &MI) { case WidenScalar: LLVM_DEBUG(dbgs() << ".. Widen scalar\n"); return widenScalar(MI, Step.TypeIdx, Step.NewType); + case Bitcast: + LLVM_DEBUG(dbgs() << ".. Bitcast type\n"); + return bitcast(MI, Step.TypeIdx, Step.NewType); case Lower: LLVM_DEBUG(dbgs() << ".. Lower\n"); return lower(MI, Step.TypeIdx, Step.NewType); @@ -1251,6 +1254,19 @@ void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy, MO.setReg(MoreReg); } +void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) { + MachineOperand &Op = MI.getOperand(OpIdx); + Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0)); +} + +void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) { + MachineOperand &MO = MI.getOperand(OpIdx); + Register CastDst = MRI.createGenericVirtualRegister(CastTy); + MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt()); + MIRBuilder.buildBitcast(MO, CastDst); + MO.setReg(CastDst); +} + LegalizerHelper::LegalizeResult LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { @@ -1390,11 +1406,12 @@ LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx, if (!DstTy.isScalar()) return UnableToLegalize; - if (WideTy.getSizeInBits() == SrcTy.getSizeInBits()) { + if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) { if (SrcTy.isPointer()) { const DataLayout &DL = MIRBuilder.getDataLayout(); if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) { - LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n"); + LLVM_DEBUG( + dbgs() << "Not casting non-integral address space integer\n"); return UnableToLegalize; } @@ -1402,6 +1419,14 @@ LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx, SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0); } + // Widen SrcTy to WideTy. 
This does not affect the result, but since the + // user requested this size, it is probably better handled than SrcTy and + // should reduce the total number of legalization artifacts + if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) { + SrcTy = WideTy; + SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0); + } + // Theres no unmerge type to target. Directly extract the bits from the // source type unsigned DstSize = DstTy.getSizeInBits(); @@ -1417,10 +1442,6 @@ LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx, return Legalized; } - // TODO - if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) - return UnableToLegalize; - // Extend the source to a wider type. LLT LCMTy = getLCMType(SrcTy, WideTy); @@ -2099,6 +2120,61 @@ LegalizerHelper::lowerBitcast(MachineInstr &MI) { return UnableToLegalize; } +LegalizerHelper::LegalizeResult +LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) { + MIRBuilder.setInstr(MI); + + switch (MI.getOpcode()) { + case TargetOpcode::G_LOAD: { + if (TypeIdx != 0) + return UnableToLegalize; + + Observer.changingInstr(MI); + bitcastDst(MI, CastTy, 0); + Observer.changedInstr(MI); + return Legalized; + } + case TargetOpcode::G_STORE: { + if (TypeIdx != 0) + return UnableToLegalize; + + Observer.changingInstr(MI); + bitcastSrc(MI, CastTy, 0); + Observer.changedInstr(MI); + return Legalized; + } + case TargetOpcode::G_SELECT: { + if (TypeIdx != 0) + return UnableToLegalize; + + if (MRI.getType(MI.getOperand(1).getReg()).isVector()) { + LLVM_DEBUG( + dbgs() << "bitcast action not implemented for vector select\n"); + return UnableToLegalize; + } + + Observer.changingInstr(MI); + bitcastSrc(MI, CastTy, 2); + bitcastSrc(MI, CastTy, 3); + bitcastDst(MI, CastTy, 0); + Observer.changedInstr(MI); + return Legalized; + } + case TargetOpcode::G_AND: + case TargetOpcode::G_OR: + case TargetOpcode::G_XOR: { + Observer.changingInstr(MI); + bitcastSrc(MI, CastTy, 1); + bitcastSrc(MI, CastTy, 2); + bitcastDst(MI, CastTy, 0); + Observer.changedInstr(MI); + return Legalized; + } + default: + return UnableToLegalize; + } +} + LegalizerHelper::LegalizeResult LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) { using namespace TargetOpcode; diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp index cad08a3959362..2658434fe9491 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp @@ -59,6 +59,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, LegalizeAction Action) { case MoreElements: OS << "MoreElements"; break; + case Bitcast: + OS << "Bitcast"; + break; case Lower: OS << "Lower"; break; @@ -173,6 +176,9 @@ static bool mutationIsSane(const LegalizeRule &Rule, return true; } + case Bitcast: { + return OldTy != NewTy && OldTy.getSizeInBits() == NewTy.getSizeInBits(); + } default: return true; } @@ -575,6 +581,7 @@ LegalizerInfo::findAction(const SizeAndActionsVec &Vec, const uint32_t Size) { LegalizeAction Action = Vec[VecIdx].second; switch (Action) { case Legal: + case Bitcast: case Lower: case Libcall: case Custom: diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp index ebd43d2281e17..e97be39ea70fe 100644 --- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp +++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp @@ -959,7 +959,11 @@ MachineInstrBuilder MachineIRBuilder::buildInstr(unsigned Opc, case TargetOpcode::G_SMIN: case 
TargetOpcode::G_SMAX: case TargetOpcode::G_UMIN: - case TargetOpcode::G_UMAX: { + case TargetOpcode::G_UMAX: + case TargetOpcode::G_UADDSAT: + case TargetOpcode::G_SADDSAT: + case TargetOpcode::G_USUBSAT: + case TargetOpcode::G_SSUBSAT: { // All these are binary ops. assert(DstOps.size() == 1 && "Invalid Dst"); assert(SrcOps.size() == 2 && "Invalid Srcs"); diff --git a/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp b/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp index 54e5d48edf276..fc9c802693abd 100644 --- a/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp +++ b/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp @@ -19,12 +19,11 @@ using namespace llvm; const unsigned RegisterBank::InvalidID = UINT_MAX; -const unsigned RegisterBank::InvalidHwMode = UINT_MAX; RegisterBank::RegisterBank( - unsigned ID, const char *Name, const unsigned *Sizes, + unsigned ID, const char *Name, unsigned Size, const uint32_t *CoveredClasses, unsigned NumRegClasses) - : ID(ID), Name(Name), Sizes(Sizes), HwMode(InvalidHwMode) { + : ID(ID), Name(Name), Size(Size) { ContainedRegClasses.resize(NumRegClasses); ContainedRegClasses.setBitsInMask(CoveredClasses); } @@ -64,8 +63,7 @@ bool RegisterBank::covers(const TargetRegisterClass &RC) const { } bool RegisterBank::isValid() const { - return ID != InvalidID && Name != nullptr && Sizes != nullptr && - HwMode != InvalidID && + return ID != InvalidID && Name != nullptr && Size != 0 && // A register bank that does not cover anything is useless. !ContainedRegClasses.empty(); } diff --git a/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp index 3a8d0a9d3c4fc..255ea693b5c4a 100644 --- a/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp +++ b/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp @@ -56,11 +56,8 @@ const unsigned RegisterBankInfo::InvalidMappingID = UINT_MAX - 1; // RegisterBankInfo implementation. //------------------------------------------------------------------------------ RegisterBankInfo::RegisterBankInfo(RegisterBank **RegBanks, - unsigned NumRegBanks, unsigned HwMode) + unsigned NumRegBanks) : RegBanks(RegBanks), NumRegBanks(NumRegBanks) { - // Initialize HwMode for all RegBanks - for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) - RegBanks[Idx]->HwMode = HwMode; #ifndef NDEBUG for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) { assert(RegBanks[Idx] != nullptr && "Invalid RegisterBank"); diff --git a/llvm/lib/CodeGen/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues.cpp index 94c5cc58ac1e3..9816bd8f97ec5 100644 --- a/llvm/lib/CodeGen/LiveDebugValues.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues.cpp @@ -110,7 +110,10 @@ static bool isRegOtherThanSPAndFP(const MachineOperand &Op, namespace { +// Max out the number of statically allocated elements in DefinedRegsSet, as +// this prevents fallback to std::set::count() operations. using DefinedRegsSet = SmallSet; + using VarLocSet = CoalescingBitVector; /// A type-checked pair of {Register Location (or 0), Index}, used to index @@ -482,7 +485,8 @@ class LiveDebugValues : public MachineFunctionPass { } }; - using VarLocInMBB = SmallDenseMap; + using VarLocInMBB = + SmallDenseMap>; struct TransferDebugPair { MachineInstr *TransferInst; ///< Instruction where this transfer occurs. LocIndex LocationID; ///< Location number for the transfer dest. @@ -562,10 +566,11 @@ class LiveDebugValues : public MachineFunctionPass { } }; - /// Collect all VarLoc IDs from \p CollectFrom for VarLocs which are located - /// in \p Reg, of kind RegisterKind. 
Insert collected IDs in \p Collected. - void collectIDsForReg(VarLocSet &Collected, uint32_t Reg, - const VarLocSet &CollectFrom) const; + /// Collect all VarLoc IDs from \p CollectFrom for VarLocs of kind + /// RegisterKind which are located in any reg in \p Regs. Insert collected IDs + /// into \p Collected. + void collectIDsForRegs(VarLocSet &Collected, const DefinedRegsSet &Regs, + const VarLocSet &CollectFrom) const; /// Get the registers which are used by VarLocs of kind RegisterKind tracked /// by \p CollectFrom. @@ -573,15 +578,17 @@ class LiveDebugValues : public MachineFunctionPass { SmallVectorImpl &UsedRegs) const; VarLocSet &getVarLocsInMBB(const MachineBasicBlock *MBB, VarLocInMBB &Locs) { - auto Result = Locs.try_emplace(MBB, Alloc); - return Result.first->second; + std::unique_ptr &VLS = Locs[MBB]; + if (!VLS) + VLS = std::make_unique(Alloc); + return *VLS.get(); } const VarLocSet &getVarLocsInMBB(const MachineBasicBlock *MBB, const VarLocInMBB &Locs) const { auto It = Locs.find(MBB); assert(It != Locs.end() && "MBB not in map"); - return It->second; + return *It->second.get(); } /// Tests whether this instruction is a spill to a stack location. @@ -770,16 +777,30 @@ LiveDebugValues::OpenRangesSet::getEntryValueBackup(DebugVariable Var) { return llvm::None; } -void LiveDebugValues::collectIDsForReg(VarLocSet &Collected, uint32_t Reg, - const VarLocSet &CollectFrom) const { - // The half-open interval [FirstIndexForReg, FirstInvalidIndex) contains all - // possible VarLoc IDs for VarLocs of kind RegisterKind which live in Reg. - uint64_t FirstIndexForReg = LocIndex::rawIndexForReg(Reg); - uint64_t FirstInvalidIndex = LocIndex::rawIndexForReg(Reg + 1); - // Iterate through that half-open interval and collect all the set IDs. - for (auto It = CollectFrom.find(FirstIndexForReg), End = CollectFrom.end(); - It != End && *It < FirstInvalidIndex; ++It) - Collected.set(*It); +void LiveDebugValues::collectIDsForRegs(VarLocSet &Collected, + const DefinedRegsSet &Regs, + const VarLocSet &CollectFrom) const { + assert(!Regs.empty() && "Nothing to collect"); + SmallVector SortedRegs; + for (Register Reg : Regs) + SortedRegs.push_back(Reg); + array_pod_sort(SortedRegs.begin(), SortedRegs.end()); + auto It = CollectFrom.find(LocIndex::rawIndexForReg(SortedRegs.front())); + auto End = CollectFrom.end(); + for (uint32_t Reg : SortedRegs) { + // The half-open interval [FirstIndexForReg, FirstInvalidIndex) contains all + // possible VarLoc IDs for VarLocs of kind RegisterKind which live in Reg. + uint64_t FirstIndexForReg = LocIndex::rawIndexForReg(Reg); + uint64_t FirstInvalidIndex = LocIndex::rawIndexForReg(Reg + 1); + It.advanceToLowerBound(FirstIndexForReg); + + // Iterate through that half-open interval and collect all the set IDs. + for (; It != End && *It < FirstInvalidIndex; ++It) + Collected.set(*It); + + if (It == End) + return; + } } void LiveDebugValues::getUsedRegs(const VarLocSet &CollectFrom, @@ -800,7 +821,7 @@ void LiveDebugValues::getUsedRegs(const VarLocSet &CollectFrom, // even if there aren't any VarLocs living in `FoundReg+1`, we're still // guaranteed to move on to the next register (or to end()). uint64_t NextRegIndex = LocIndex::rawIndexForReg(FoundReg + 1); - It = CollectFrom.find(NextRegIndex); + It.advanceToLowerBound(NextRegIndex); } } @@ -1073,9 +1094,7 @@ void LiveDebugValues::transferRegisterDef( unsigned SP = TLI->getStackPointerRegisterToSaveRestore(); // Find the regs killed by MI, and find regmasks of preserved regs. 
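  // Sketch of the intended flow (illustrative, not from the patch): register
  // defs and regmask-clobbered registers are first accumulated into DeadRegs,
  // and a single collectIDsForRegs() call then builds KillSet by walking the
  // sorted registers with one forward-only iterator over the VarLocSet,
  // instead of restarting a fresh search for every dead register.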
- // Max out the number of statically allocated elements in `DeadRegs`, as this - // prevents fallback to std::set::count() operations. - SmallSet DeadRegs; + DefinedRegsSet DeadRegs; SmallVector RegMasks; for (const MachineOperand &MO : MI.operands()) { // Determine whether the operand is a register def. @@ -1094,9 +1113,6 @@ void LiveDebugValues::transferRegisterDef( // Erase VarLocs which reside in one of the dead registers. For performance // reasons, it's critical to not iterate over the full set of open VarLocs. // Iterate over the set of dying/used regs instead. - VarLocSet KillSet(Alloc); - for (uint32_t DeadReg : DeadRegs) - collectIDsForReg(KillSet, DeadReg, OpenRanges.getVarLocs()); if (!RegMasks.empty()) { SmallVector UsedRegs; getUsedRegs(OpenRanges.getVarLocs(), UsedRegs); @@ -1118,14 +1134,20 @@ void LiveDebugValues::transferRegisterDef( return MachineOperand::clobbersPhysReg(RegMask, Reg); }); if (AnyRegMaskKillsReg) - collectIDsForReg(KillSet, Reg, OpenRanges.getVarLocs()); + DeadRegs.insert(Reg); } } + + if (DeadRegs.empty()) + return; + + VarLocSet KillSet(Alloc); + collectIDsForRegs(KillSet, DeadRegs, OpenRanges.getVarLocs()); OpenRanges.erase(KillSet, VarLocIDs); if (auto *TPC = getAnalysisIfAvailable()) { auto &TM = TPC->getTM(); - if (TM.Options.EnableDebugEntryValues) + if (TM.Options.ShouldEmitDebugEntryValues()) emitEntryValues(MI, OpenRanges, VarLocIDs, Transfers, KillSet); } } @@ -1479,10 +1501,11 @@ bool LiveDebugValues::join( // Just copy over the Out locs to incoming locs for the first visited // predecessor, and for all other predecessors join the Out locs. + VarLocSet &OutLocVLS = *OL->second.get(); if (!NumVisited) - InLocsT = OL->second; + InLocsT = OutLocVLS; else - InLocsT &= OL->second; + InLocsT &= OutLocVLS; LLVM_DEBUG({ if (!InLocsT.empty()) { @@ -1554,7 +1577,7 @@ void LiveDebugValues::flushPendingLocs(VarLocInMBB &PendingInLocs, for (auto &Iter : PendingInLocs) { // Map is keyed on a constant pointer, unwrap it so we can insert insts. auto &MBB = const_cast(*Iter.first); - VarLocSet &Pending = Iter.second; + VarLocSet &Pending = *Iter.second.get(); for (uint64_t ID : Pending) { // The ID location is live-in to MBB -- work out what kind of machine @@ -1630,7 +1653,7 @@ void LiveDebugValues::recordEntryValue(const MachineInstr &MI, VarLocMap &VarLocIDs) { if (auto *TPC = getAnalysisIfAvailable()) { auto &TM = TPC->getTM(); - if (!TM.Options.EnableDebugEntryValues) + if (!TM.Options.ShouldEmitDebugEntryValues()) return; } @@ -1703,7 +1726,7 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) { // Initialize per-block structures and scan for fragment overlaps. 
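  // Descriptive note (illustrative): each block's pending-location set is
  // heap-allocated up front in the loop below -- PendingInLocs maps the MBB to
  // a unique_ptr-owned VarLocSet built with the shared Alloc allocator -- and
  // the non-const getVarLocsInMBB() lazily creates a set on first use.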
for (auto &MBB : MF) { - PendingInLocs.try_emplace(&MBB, Alloc); + PendingInLocs[&MBB] = std::make_unique(Alloc); for (auto &MI : MBB) { if (MI.isDebugValue()) diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 36a027c987e1f..e4852d3069418 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -260,6 +260,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("liveout", MIToken::kw_liveout) .Case("address-taken", MIToken::kw_address_taken) .Case("landing-pad", MIToken::kw_landing_pad) + .Case("ehfunclet-entry", MIToken::kw_ehfunclet_entry) .Case("liveins", MIToken::kw_liveins) .Case("successors", MIToken::kw_successors) .Case("floatpred", MIToken::kw_floatpred) diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h index e76f6a7e21a39..c804e1604f7b3 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -114,6 +114,7 @@ struct MIToken { kw_liveout, kw_address_taken, kw_landing_pad, + kw_ehfunclet_entry, kw_liveins, kw_successors, kw_floatpred, diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 93af409ec8552..689e5afcf08e8 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -650,6 +650,7 @@ bool MIParser::parseBasicBlockDefinition( lex(); bool HasAddressTaken = false; bool IsLandingPad = false; + bool IsEHFuncletEntry = false; MachineBasicBlockSection SectionType = MBBS_None; unsigned Alignment = 0; BasicBlock *BB = nullptr; @@ -665,6 +666,10 @@ bool MIParser::parseBasicBlockDefinition( IsLandingPad = true; lex(); break; + case MIToken::kw_ehfunclet_entry: + IsEHFuncletEntry = true; + lex(); + break; case MIToken::kw_align: if (parseAlignment(Alignment)) return true; @@ -708,6 +713,7 @@ bool MIParser::parseBasicBlockDefinition( if (HasAddressTaken) MBB->setHasAddressTaken(); MBB->setIsEHPad(IsLandingPad); + MBB->setIsEHFuncletEntry(IsEHFuncletEntry); if (SectionType != MBBS_None) { MBB->setSectionType(SectionType); MF.setBBSectionsType(BasicBlockSection::List); diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 199d077ac66ee..cad0a8d0899a4 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -654,7 +654,7 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS, MFI.setStackSize(YamlMFI.StackSize); MFI.setOffsetAdjustment(YamlMFI.OffsetAdjustment); if (YamlMFI.MaxAlignment) - MFI.ensureMaxAlignment(YamlMFI.MaxAlignment); + MFI.ensureMaxAlignment(Align(YamlMFI.MaxAlignment)); MFI.setAdjustsStack(YamlMFI.AdjustsStack); MFI.setHasCalls(YamlMFI.HasCalls); if (YamlMFI.MaxCallFrameSize != ~0u) diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 2a150b1368df2..22f7e1644a48e 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -630,6 +630,11 @@ void MIPrinter::print(const MachineBasicBlock &MBB) { OS << "landing-pad"; HasAttributes = true; } + if (MBB.isEHFuncletEntry()) { + OS << (HasAttributes ? ", " : " ("); + OS << "ehfunclet-entry"; + HasAttributes = true; + } if (MBB.getAlignment() != Align(1)) { OS << (HasAttributes ? 
", " : " ("); OS << "align " << MBB.getAlignment().value(); diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 5ce43906dea1e..31491513bd96a 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -2506,14 +2506,9 @@ MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) { // its frequency and the frequency of the loop block. When it is too small, // don't add it to the loop chain. If there are outer loops, then this block // will be merged into the first outer loop chain for which this block is not - // cold anymore. - // - // If a block uses static profiling data (e.g. from '__builtin_expect()'), - // then the programmer is explicitly telling us which paths are hot and cold. - // There's no reason for the compiler to believe otherwise, unless - // '-fprofile-use' is specified. - if (F->getFunction().hasProfileData() || ForceLoopColdBlock || - L.hasStaticProfInfo()) { + // cold anymore. This needs precise profile data and we only do this when + // profile data is available. + if (F->getFunction().hasProfileData() || ForceLoopColdBlock) { BlockFrequency LoopFreq(0); for (auto LoopPred : L.getHeader()->predecessors()) if (!L.contains(LoopPred)) diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp index 76a1bd9f75afb..281d1bc15bfac 100644 --- a/llvm/lib/CodeGen/MachineFunction.cpp +++ b/llvm/lib/CodeGen/MachineFunction.cpp @@ -130,7 +130,7 @@ static inline unsigned getFnStackAlignment(const TargetSubtargetInfo *STI, const Function &F) { if (F.hasFnAttribute(Attribute::StackAlignment)) return F.getFnStackAlignment(); - return STI->getFrameLowering()->getStackAlignment(); + return STI->getFrameLowering()->getStackAlign().value(); } MachineFunction::MachineFunction(const Function &F, @@ -172,7 +172,7 @@ void MachineFunction::init() { F.hasFnAttribute(Attribute::StackAlignment)); if (F.hasFnAttribute(Attribute::StackAlignment)) - FrameInfo->ensureMaxAlignment(F.getFnStackAlignment()); + FrameInfo->ensureMaxAlignment(*F.getFnStackAlign()); ConstantPool = new (Allocator) MachineConstantPool(getDataLayout()); Alignment = STI->getTargetLowering()->getMinFunctionAlignment(); diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp index 67916fbe722ee..0c1439da9b299 100644 --- a/llvm/lib/CodeGen/MachineLoopInfo.cpp +++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp @@ -111,13 +111,6 @@ DebugLoc MachineLoop::getStartLoc() const { return DebugLoc(); } -bool MachineLoop::hasStaticProfInfo() const { - return llvm::any_of(blocks(), [](const MachineBasicBlock *MBB){ - const BasicBlock *BB = MBB->getBasicBlock(); - return BB && BB->getTerminator()->hasMetadata(LLVMContext::MD_prof); - }); -} - MachineBasicBlock * MachineLoopInfo::findLoopPreheader(MachineLoop *L, bool SpeculativePreheader) const { diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 4efa99a9887d3..41a53d1edde6c 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -2368,7 +2368,7 @@ int SMSchedule::earliestCycleInChain(const SDep &Dep) { continue; EarlyCycle = std::min(EarlyCycle, it->second); for (const auto &PI : PrevSU->Preds) - if (PI.getKind() == SDep::Order || Dep.getKind() == SDep::Output) + if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output) Worklist.push_back(PI); Visited.insert(PrevSU); } @@ -2391,7 +2391,7 @@ int SMSchedule::latestCycleInChain(const SDep 
&Dep) { continue; LateCycle = std::max(LateCycle, it->second); for (const auto &SI : SuccSU->Succs) - if (SI.getKind() == SDep::Order || Dep.getKind() == SDep::Output) + if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output) Worklist.push_back(SI); Visited.insert(SuccSU); } diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index b12ccc40eb61f..b3cece0223b5d 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -438,14 +438,12 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F, unsigned Size = RegInfo->getSpillSize(*RC); if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) { // Nope, just spill it anywhere convenient. - unsigned Align = RegInfo->getSpillAlignment(*RC); - unsigned StackAlign = TFI->getStackAlignment(); - + Align Alignment(RegInfo->getSpillAlignment(*RC)); // We may not be able to satisfy the desired alignment specification of // the TargetRegisterClass if the stack alignment is smaller. Use the // min. - Align = std::min(Align, StackAlign); - FrameIdx = MFI.CreateStackObject(Size, Align, true); + Alignment = std::min(Alignment, TFI->getStackAlign()); + FrameIdx = MFI.CreateStackObject(Size, Alignment, true); if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx; if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx; } else { diff --git a/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp b/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp index 0205e6193741d..7b851f1d40e5e 100644 --- a/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp +++ b/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp @@ -118,8 +118,8 @@ bool RegUsageInfoPropagation::runOnMachineFunction(MachineFunction &MF) { continue; LLVM_DEBUG( dbgs() - << "Call Instruction Before Register Usage Info Propagation : \n"); - LLVM_DEBUG(dbgs() << MI << "\n"); + << "Call Instruction Before Register Usage Info Propagation : \n" + << MI << "\n"); auto UpdateRegMask = [&](const Function &F) { const ArrayRef RegMask = PRUI->getRegUsageInfo(F); @@ -140,8 +140,9 @@ bool RegUsageInfoPropagation::runOnMachineFunction(MachineFunction &MF) { } LLVM_DEBUG( - dbgs() << "Call Instruction After Register Usage Info Propagation : " - << MI << '\n'); + dbgs() + << "Call Instruction After Register Usage Info Propagation : \n" + << MI << '\n'); } } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 254669b364708..574a80fcc3468 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -8325,13 +8325,15 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) { // fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive. // TODO - bigendian support once we have test coverage. // TODO - can we merge this with CombineConseutiveLoads/MatchLoadCombine? + // TODO - permit LHS EXTLOAD if extensions are shifted out. 
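  // Illustrative example (little-endian, not from the patch): given two
  // consecutive i32 loads LD0 at address A and LD1 at A+4, the fold turns
  //   fshr(LD1, LD0, 8)
  // into a single i32 load at A+1, since the 64-bit concatenation LD1:LD0
  // shifted right by 8 keeps exactly bytes A+1..A+4 in its low 32 bits.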
if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() && !DAG.getDataLayout().isBigEndian()) { auto *LHS = dyn_cast(N0); auto *RHS = dyn_cast(N1); if (LHS && RHS && LHS->isSimple() && RHS->isSimple() && LHS->getAddressSpace() == RHS->getAddressSpace() && - (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS)) { + (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) && + ISD::isNON_EXTLoad(LHS)) { if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) { SDLoc DL(RHS); uint64_t PtrOff = @@ -11023,7 +11025,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { // Attempt to pre-truncate BUILD_VECTOR sources. if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations && - TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType())) { + TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) && + // Avoid creating illegal types if running after type legalizer. + (!LegalTypes || TLI.isTypeLegal(VT.getScalarType()))) { SDLoc DL(N); EVT SVT = VT.getScalarType(); SmallVector TruncOps; @@ -11953,6 +11957,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); + bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros(); // Is the node an FMUL and contractable either due to global flags or // SDNodeFlags. @@ -12116,7 +12121,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) { // -> (fma (fneg y), z, (fma (fneg u), v, x)) if (CanFuse && N1.getOpcode() == PreferredFusedOpcode && isContractableFMUL(N1.getOperand(2)) && - N1->hasOneUse()) { + N1->hasOneUse() && NoSignedZero) { SDValue N20 = N1.getOperand(2).getOperand(0); SDValue N21 = N1.getOperand(2).getOperand(1); return DAG.getNode(PreferredFusedOpcode, SL, VT, @@ -13054,11 +13059,13 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) { } // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y) - if (isCheaperToUseNegatedFPOps(N0, N1)) - return DAG.getNode( - ISD::FDIV, SDLoc(N), VT, - TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize), - TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags); + if (isCheaperToUseNegatedFPOps(N0, N1)) { + SDValue Neg0 = + TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize); + SDValue Neg1 = + TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize); + return DAG.getNode(ISD::FDIV, SDLoc(N), VT, Neg0, Neg1, Flags); + } return SDValue(); } @@ -14222,118 +14229,142 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) { return true; } -/// Try to combine a load/store with a add/sub of the base pointer node into a -/// post-indexed load/store. The transformation folded the add/subtract into the -/// new indexed load/store effectively and all of its uses are redirected to the -/// new load/store. 
-bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { - if (Level < AfterLegalizeDAG) +static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse, + SDValue &BasePtr, SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG, + const TargetLowering &TLI) { + if (PtrUse == N || + (PtrUse->getOpcode() != ISD::ADD && PtrUse->getOpcode() != ISD::SUB)) return false; - bool IsLoad = true; - bool IsMasked = false; - SDValue Ptr; - if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, IsMasked, - Ptr, TLI)) + if (!TLI.getPostIndexedAddressParts(N, PtrUse, BasePtr, Offset, AM, DAG)) return false; - if (Ptr.getNode()->hasOneUse()) + // Don't create a indexed load / store with zero offset. + if (isNullConstant(Offset)) return false; - for (SDNode *Op : Ptr.getNode()->uses()) { - if (Op == N || - (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)) - continue; + if (isa(BasePtr) || isa(BasePtr)) + return false; - SDValue BasePtr; - SDValue Offset; - ISD::MemIndexedMode AM = ISD::UNINDEXED; - if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) { - // Don't create a indexed load / store with zero offset. - if (isNullConstant(Offset)) - continue; + SmallPtrSet Visited; + for (SDNode *Use : BasePtr.getNode()->uses()) { + if (Use == Ptr.getNode()) + continue; - // Try turning it into a post-indexed load / store except when - // 1) All uses are load / store ops that use it as base ptr (and - // it may be folded as addressing mmode). - // 2) Op must be independent of N, i.e. Op is neither a predecessor - // nor a successor of N. Otherwise, if Op is folded that would - // create a cycle. + // No if there's a later user which could perform the index instead. + if (isa(Use)) { + bool IsLoad = true; + bool IsMasked = false; + SDValue OtherPtr; + if (getCombineLoadStoreParts(Use, ISD::POST_INC, ISD::POST_DEC, IsLoad, + IsMasked, OtherPtr, TLI)) { + SmallVector Worklist; + Worklist.push_back(Use); + if (SDNode::hasPredecessorHelper(N, Visited, Worklist)) + return false; + } + } - if (isa(BasePtr) || isa(BasePtr)) - continue; + // If all the uses are load / store addresses, then don't do the + // transformation. + if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) { + for (SDNode *UseUse : Use->uses()) + if (canFoldInAddressingMode(Use, UseUse, DAG, TLI)) + return false; + } + } + return true; +} - // Check for #1. - bool TryNext = false; - for (SDNode *Use : BasePtr.getNode()->uses()) { - if (Use == Ptr.getNode()) - continue; +static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad, + bool &IsMasked, SDValue &Ptr, + SDValue &BasePtr, SDValue &Offset, + ISD::MemIndexedMode &AM, + SelectionDAG &DAG, + const TargetLowering &TLI) { + if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, + IsMasked, Ptr, TLI) || + Ptr.getNode()->hasOneUse()) + return nullptr; + + // Try turning it into a post-indexed load / store except when + // 1) All uses are load / store ops that use it as base ptr (and + // it may be folded as addressing mmode). + // 2) Op must be independent of N, i.e. Op is neither a predecessor + // nor a successor of N. Otherwise, if Op is folded that would + // create a cycle. + for (SDNode *Op : Ptr->uses()) { + // Check for #1. + if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI)) + continue; - // If all the uses are load / store addresses, then don't do the - // transformation. 
- if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) { - bool RealUse = false; - for (SDNode *UseUse : Use->uses()) { - if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI)) - RealUse = true; - } + // Check for #2. + SmallPtrSet Visited; + SmallVector Worklist; + // Ptr is predecessor to both N and Op. + Visited.insert(Ptr.getNode()); + Worklist.push_back(N); + Worklist.push_back(Op); + if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) && + !SDNode::hasPredecessorHelper(Op, Visited, Worklist)) + return Op; + } + return nullptr; +} - if (!RealUse) { - TryNext = true; - break; - } - } - } +/// Try to combine a load/store with a add/sub of the base pointer node into a +/// post-indexed load/store. The transformation folded the add/subtract into the +/// new indexed load/store effectively and all of its uses are redirected to the +/// new load/store. +bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) { + if (Level < AfterLegalizeDAG) + return false; - if (TryNext) - continue; + bool IsLoad = true; + bool IsMasked = false; + SDValue Ptr; + SDValue BasePtr; + SDValue Offset; + ISD::MemIndexedMode AM = ISD::UNINDEXED; + SDNode *Op = getPostIndexedLoadStoreOp(N, IsLoad, IsMasked, Ptr, BasePtr, + Offset, AM, DAG, TLI); + if (!Op) + return false; - // Check for #2. - SmallPtrSet Visited; - SmallVector Worklist; - // Ptr is predecessor to both N and Op. - Visited.insert(Ptr.getNode()); - Worklist.push_back(N); - Worklist.push_back(Op); - if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) && - !SDNode::hasPredecessorHelper(Op, Visited, Worklist)) { - SDValue Result; - if (!IsMasked) - Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, - Offset, AM) - : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), + SDValue Result; + if (!IsMasked) + Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr, + Offset, AM) + : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N), + BasePtr, Offset, AM); + else + Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), + BasePtr, Offset, AM) + : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), BasePtr, Offset, AM); - else - Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N), - BasePtr, Offset, AM) - : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N), - BasePtr, Offset, AM); - ++PostIndexedNodes; - ++NodesCombined; - LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); - dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); - dbgs() << '\n'); - WorklistRemover DeadNodes(*this); - if (IsLoad) { - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); - } else { - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1)); - } - - // Finally, since the node is now dead, remove it from the graph. - deleteAndRecombine(N); - - // Replace the uses of Use with uses of the updated base value. - DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0), - Result.getValue(IsLoad ? 
1 : 0)); - deleteAndRecombine(Op); - return true; - } - } + ++PostIndexedNodes; + ++NodesCombined; + LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG); + dbgs() << "\nWith: "; Result.getNode()->dump(&DAG); + dbgs() << '\n'); + WorklistRemover DeadNodes(*this); + if (IsLoad) { + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0)); + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2)); + } else { + DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1)); } - return false; + // Finally, since the node is now dead, remove it from the graph. + deleteAndRecombine(N); + + // Replace the uses of Use with uses of the updated base value. + DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0), + Result.getValue(IsLoad ? 1 : 0)); + deleteAndRecombine(Op); + return true; } /// Return the base-pointer arithmetic from an indexed \p LD. @@ -19749,16 +19780,6 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() && N1.isUndef() && Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) { - auto ScaleShuffleMask = [](ArrayRef Mask, int Scale) { - if (Scale == 1) - return SmallVector(Mask.begin(), Mask.end()); - - SmallVector NewMask; - for (int M : Mask) - for (int s = 0; s != Scale; ++s) - NewMask.push_back(M < 0 ? -1 : Scale * M + s); - return NewMask; - }; SDValue BC0 = peekThroughOneUseBitcasts(N0); if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) { @@ -19778,10 +19799,10 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { // Scale the shuffle masks to the smaller scalar type. ShuffleVectorSDNode *InnerSVN = cast(BC0); - SmallVector InnerMask = - ScaleShuffleMask(InnerSVN->getMask(), InnerScale); - SmallVector OuterMask = - ScaleShuffleMask(SVN->getMask(), OuterScale); + SmallVector InnerMask; + SmallVector OuterMask; + scaleShuffleMask(InnerScale, InnerSVN->getMask(), InnerMask); + scaleShuffleMask(OuterScale, SVN->getMask(), OuterMask); // Merge the shuffle masks. SmallVector NewMask; diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index ada09092c4782..461d481c822f5 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1572,6 +1572,27 @@ bool FastISel::selectBitCast(const User *I) { return true; } +bool FastISel::selectFreeze(const User *I) { + Register Reg = getRegForValue(I->getOperand(0)); + if (!Reg) + // Unhandled operand. + return false; + + EVT ETy = TLI.getValueType(DL, I->getOperand(0)->getType()); + if (ETy == MVT::Other || !TLI.isTypeLegal(ETy)) + // Unhandled type, bail out. + return false; + + MVT Ty = ETy.getSimpleVT(); + const TargetRegisterClass *TyRegClass = TLI.getRegClassFor(Ty); + Register ResultReg = createResultReg(TyRegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::COPY), ResultReg).addReg(Reg); + + updateValueMap(I, ResultReg); + return true; +} + // Remove local value instructions starting from the instruction after // SavedLastLocalValue to the current function insert point. 
void FastISel::removeDeadLocalValueCode(MachineInstr *SavedLastLocalValue) @@ -1913,6 +1934,9 @@ bool FastISel::selectOperator(const User *I, unsigned Opcode) { case Instruction::ExtractValue: return selectExtractValue(I); + case Instruction::Freeze: + return selectFreeze(I); + case Instruction::PHI: llvm_unreachable("FastISel shouldn't visit PHI nodes!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index a69f1e534284f..e35d47555918d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -85,7 +85,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); - unsigned StackAlign = TFI->getStackAlignment(); DA = DAG->getDivergenceAnalysis(); // Check whether the function can return without sret-demotion. @@ -130,19 +129,19 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, // Initialize the mapping of values to registers. This is only set up for // instruction values that are used outside of the block that defines // them. + const Align StackAlign = TFI->getStackAlign(); for (const BasicBlock &BB : *Fn) { for (const Instruction &I : BB) { if (const AllocaInst *AI = dyn_cast(&I)) { Type *Ty = AI->getAllocatedType(); - unsigned Align = - std::max((unsigned)MF->getDataLayout().getPrefTypeAlignment(Ty), - AI->getAlignment()); + Align Alignment = + max(MF->getDataLayout().getPrefTypeAlign(Ty), AI->getAlign()); // Static allocas can be folded into the initial stack frame // adjustment. For targets that don't realign the stack, don't // do this if there is an extra alignment requirement. if (AI->isStaticAlloca() && - (TFI->isStackRealignable() || (Align <= StackAlign))) { + (TFI->isStackRealignable() || (Alignment <= StackAlign))) { const ConstantInt *CUI = cast(AI->getArraySize()); uint64_t TySize = MF->getDataLayout().getTypeAllocSize(Ty).getKnownMinSize(); @@ -154,10 +153,10 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, if (Iter != CatchObjects.end() && TLI->needsFixedCatchObjects()) { FrameIndex = MF->getFrameInfo().CreateFixedObject( TySize, 0, /*IsImmutable=*/false, /*isAliased=*/true); - MF->getFrameInfo().setObjectAlignment(FrameIndex, Align); + MF->getFrameInfo().setObjectAlignment(FrameIndex, Alignment); } else { - FrameIndex = - MF->getFrameInfo().CreateStackObject(TySize, Align, false, AI); + FrameIndex = MF->getFrameInfo().CreateStackObject(TySize, Alignment, + false, AI); } // Scalable vectors may need a special StackID to distinguish @@ -176,10 +175,9 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, // FIXME: Overaligned static allocas should be grouped into // a single dynamic allocation instead of using a separate // stack allocation for each one. - if (Align <= StackAlign) - Align = 0; // Inform the Frame Information that we have variable-sized objects. - MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, AI); + MF->getFrameInfo().CreateVariableSizedObject( + Alignment <= StackAlign ? 
0 : Alignment.value(), AI); } } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 428dc83e2fd3d..60174fc9da8f8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -2456,6 +2456,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FLOG10: case ISD::FNEARBYINT: case ISD::FNEG: + case ISD::FREEZE: case ISD::FRINT: case ISD::FROUND: case ISD::FSIN: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 3955490a3f85e..0248b5121e3f2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -201,6 +201,10 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { case ISD::VECREDUCE_UMIN: Res = PromoteIntRes_VECREDUCE(N); break; + + case ISD::FREEZE: + Res = PromoteIntRes_FREEZE(N); + break; } // If the result is null then the sub-method took care of registering it. @@ -401,6 +405,12 @@ static EVT getShiftAmountTyForConstant(EVT VT, const TargetLowering &TLI, return ShiftVT; } +SDValue DAGTypeLegalizer::PromoteIntRes_FREEZE(SDNode *N) { + SDValue V = GetPromotedInteger(N->getOperand(0)); + return DAG.getNode(ISD::FREEZE, SDLoc(N), + V.getValueType(), V); +} + SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) { SDValue Op = GetPromotedInteger(N->getOperand(0)); EVT OVT = N->getValueType(0); @@ -1868,6 +1878,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) { case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break; case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; + case ISD::FREEZE: SplitRes_FREEZE(N, Lo, Hi); break; case ISD::BITCAST: ExpandRes_BITCAST(N, Lo, Hi); break; case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 8126c42e44988..aee4ab1fcd619 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -309,6 +309,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N); SDValue PromoteIntRes_FP_TO_XINT(SDNode *N); SDValue PromoteIntRes_FP_TO_FP16(SDNode *N); + SDValue PromoteIntRes_FREEZE(SDNode *N); SDValue PromoteIntRes_INT_EXTEND(SDNode *N); SDValue PromoteIntRes_LOAD(LoadSDNode *N); SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N); @@ -961,6 +962,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitRes_SELECT (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitRes_FREEZE (SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVSETCC(const SDNode *N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index ad3e02f9921a2..8231a320b4f3e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -558,3 +558,12 @@ void DAGTypeLegalizer::SplitRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi) { Lo = DAG.getUNDEF(LoVT); Hi = DAG.getUNDEF(HiVT); } + +void DAGTypeLegalizer::SplitRes_FREEZE(SDNode *N, SDValue &Lo, SDValue &Hi) { + SDValue L, H; + SDLoc dl(N); + 
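  // Descriptive note: GetSplitOp hands back the already-split low/high halves
  // of the operand; each half is then re-frozen below so both halves of the
  // result are themselves FREEZE nodes.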
GetSplitOp(N->getOperand(0), L, H); + + Lo = DAG.getNode(ISD::FREEZE, dl, L.getValueType(), L); + Hi = DAG.getNode(ISD::FREEZE, dl, H.getValueType(), H); +} diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 5a4b4c615bc06..afed415af5ed9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -20,10 +20,11 @@ //===----------------------------------------------------------------------===// #include "LegalizeTypes.h" +#include "llvm/Analysis/MemoryLocation.h" #include "llvm/IR/DataLayout.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/raw_ostream.h" #include "llvm/Support/TypeSize.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; #define DEBUG_TYPE "legalize-types" @@ -88,6 +89,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::FLOG2: case ISD::FNEARBYINT: case ISD::FNEG: + case ISD::FREEZE: case ISD::FP_EXTEND: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: @@ -878,6 +880,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FLOG2: case ISD::FNEARBYINT: case ISD::FNEG: + case ISD::FREEZE: case ISD::FP_EXTEND: case ISD::FP_ROUND: case ISD::FP_TO_SINT: @@ -1627,11 +1630,6 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl); } - EVT MemoryVT = MGT->getMemoryVT(); - EVT LoMemVT, HiMemVT; - // Split MemoryVT - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); - SDValue PassThruLo, PassThruHi; if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(PassThru, PassThruLo, PassThruHi); @@ -1644,10 +1642,10 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, else std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl); - MachineMemOperand *MMO = DAG.getMachineFunction(). - getMachineMemOperand(MGT->getPointerInfo(), - MachineMemOperand::MOLoad, LoMemVT.getStoreSize(), - Alignment, MGT->getAAInfo(), MGT->getRanges()); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + MGT->getPointerInfo(), MachineMemOperand::MOLoad, + MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(), + MGT->getRanges()); SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale}; Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo, @@ -2376,13 +2374,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, SDValue Index = N->getIndex(); SDValue Scale = N->getScale(); SDValue Data = N->getValue(); - EVT MemoryVT = N->getMemoryVT(); unsigned Alignment = N->getOriginalAlignment(); SDLoc DL(N); // Split all operands - EVT LoMemVT, HiMemVT; - std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT); SDValue DataLo, DataHi; if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector) @@ -2409,20 +2404,14 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N, std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL); SDValue Lo; - MachineMemOperand *MMO = DAG.getMachineFunction(). 
- getMachineMemOperand(N->getPointerInfo(), - MachineMemOperand::MOStore, LoMemVT.getStoreSize(), - Alignment, N->getAAInfo(), N->getRanges()); + MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand( + N->getPointerInfo(), MachineMemOperand::MOStore, + MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges()); SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Scale}; Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(), DL, OpsLo, MMO, N->getIndexType()); - MMO = DAG.getMachineFunction(). - getMachineMemOperand(N->getPointerInfo(), - MachineMemOperand::MOStore, HiMemVT.getStoreSize(), - Alignment, N->getAAInfo(), N->getRanges()); - // The order of the Scatter operation after split is well defined. The "Hi" // part comes after the "Lo". So these two operations should be chained one // after another. @@ -2844,6 +2833,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::CTTZ: case ISD::CTTZ_ZERO_UNDEF: case ISD::FNEG: + case ISD::FREEZE: case ISD::FCANONICALIZE: Res = WidenVecRes_Unary(N); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 5b673486af153..58311069c09ae 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4048,13 +4048,10 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, return FirstAnswer; } - // Okay, we know that the sign bit in Mask is set. Use CLZ to determine + // Okay, we know that the sign bit in Mask is set. Use CLO to determine // the number of identical bits in the top of the input value. - Mask = ~Mask; Mask <<= Mask.getBitWidth()-VTBits; - // Return # leading zeros. We use 'min' here in case Val was zero before - // shifting. We don't want to return '64' as for an i32 "0". - return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros())); + return std::max(FirstAnswer, Mask.countLeadingOnes()); } bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 29f15095af583..1ad86208ed4ea 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -415,10 +415,13 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL, // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the // intermediate operands. EVT BuiltVectorTy = - EVT::getVectorVT(*DAG.getContext(), IntermediateVT.getScalarType(), - (IntermediateVT.isVector() - ? IntermediateVT.getVectorNumElements() * NumParts - : NumIntermediates)); + IntermediateVT.isVector() + ? EVT::getVectorVT( + *DAG.getContext(), IntermediateVT.getScalarType(), + IntermediateVT.getVectorElementCount() * NumParts) + : EVT::getVectorVT(*DAG.getContext(), + IntermediateVT.getScalarType(), + NumIntermediates); Val = DAG.getNode(IntermediateVT.isVector() ? 
ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR, DL, BuiltVectorTy, Ops); @@ -4007,8 +4010,7 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); auto &DL = DAG.getDataLayout(); uint64_t TySize = DL.getTypeAllocSize(Ty); - unsigned Align = - std::max((unsigned)DL.getPrefTypeAlignment(Ty), I.getAlignment()); + MaybeAlign Alignment = max(DL.getPrefTypeAlign(Ty), I.getAlign()); SDValue AllocSize = getValue(I.getArraySize()); @@ -4023,25 +4025,26 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) { // Handle alignment. If the requested alignment is less than or equal to // the stack alignment, ignore it. If the size is greater than or equal to // the stack alignment, we note this in the DYNAMIC_STACKALLOC node. - unsigned StackAlign = - DAG.getSubtarget().getFrameLowering()->getStackAlignment(); - if (Align <= StackAlign) - Align = 0; + Align StackAlign = DAG.getSubtarget().getFrameLowering()->getStackAlign(); + if (Alignment <= StackAlign) + Alignment = None; + const uint64_t StackAlignMask = StackAlign.value() - 1U; // Round the size of the allocation up to the stack alignment size // by add SA-1 to the size. This doesn't overflow because we're computing // an address inside an alloca. SDNodeFlags Flags; Flags.setNoUnsignedWrap(true); AllocSize = DAG.getNode(ISD::ADD, dl, AllocSize.getValueType(), AllocSize, - DAG.getConstant(StackAlign - 1, dl, IntPtr), Flags); + DAG.getConstant(StackAlignMask, dl, IntPtr), Flags); // Mask out the low bits for alignment purposes. - AllocSize = - DAG.getNode(ISD::AND, dl, AllocSize.getValueType(), AllocSize, - DAG.getConstant(~(uint64_t)(StackAlign - 1), dl, IntPtr)); + AllocSize = DAG.getNode(ISD::AND, dl, AllocSize.getValueType(), AllocSize, + DAG.getConstant(~StackAlignMask, dl, IntPtr)); - SDValue Ops[] = {getRoot(), AllocSize, DAG.getConstant(Align, dl, IntPtr)}; + SDValue Ops[] = { + getRoot(), AllocSize, + DAG.getConstant(Alignment ? 
Alignment->value() : 0, dl, IntPtr)}; SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other); SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, dl, VTs, Ops); setValue(&I, DSA); @@ -10687,6 +10690,22 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) { } void SelectionDAGBuilder::visitFreeze(const FreezeInst &I) { - SDValue N = getValue(I.getOperand(0)); - setValue(&I, N); + SDNodeFlags Flags; + + SDValue Op = getValue(I.getOperand(0)); + if (I.getOperand(0)->getType()->isAggregateType()) { + EVT VT = Op.getValueType(); + SmallVector Values; + for (unsigned i = 0; i < Op.getNumOperands(); ++i) { + SDValue Arg(Op.getNode(), i); + SDValue UnNodeValue = DAG.getNode(ISD::FREEZE, getCurSDLoc(), VT, Arg, Flags); + Values.push_back(UnNodeValue); + } + SDValue MergedValue = DAG.getMergeValues(Values, getCurSDLoc()); + setValue(&I, MergedValue); + } else { + SDValue UnNodeValue = DAG.getNode(ISD::FREEZE, getCurSDLoc(), Op.getValueType(), + Op, Flags); + setValue(&I, UnNodeValue); + } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 0fd132f03af8b..aca462f566746 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -392,6 +392,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::GC_TRANSITION_START: return "gc_transition.start"; case ISD::GC_TRANSITION_END: return "gc_transition.end"; case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset"; + case ISD::FREEZE: return "freeze"; // Bit manipulation case ISD::ABS: return "abs"; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 51b2439dddc3d..6fa6bde047c54 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -2290,6 +2290,14 @@ void SelectionDAGISel::Select_UNDEF(SDNode *N) { CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0)); } +void SelectionDAGISel::Select_FREEZE(SDNode *N) { + // TODO: We don't have FREEZE pseudo-instruction in MachineInstr-level now. + // If FREEZE instruction is added later, the code below must be changed as + // well. + CurDAG->SelectNodeTo(N, TargetOpcode::COPY, N->getValueType(0), + N->getOperand(0)); +} + /// GetVBR - decode a vbr encoding whose top bit is set. 
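// --- Editorial aside (illustrative sketch, not part of this patch) ---
// The hunks above plumb ISD::FREEZE through vector type legalization, the DAG
// builder, the node dumper and instruction selection, where the TODO notes it
// is currently selected as a plain COPY. For orientation, the IR-level
// construct that SelectionDAGBuilder::visitFreeze translates is created as
// below (assumes an existing IRBuilder<>; the helper name is illustrative):
#include "llvm/IR/IRBuilder.h"
static llvm::Value *freezeForReuse(llvm::IRBuilder<> &Builder,
                                   llvm::Value *MaybePoison) {
  // The frozen value is some fixed bit pattern even if MaybePoison is undef or
  // poison, so it can be used several times without propagating poison further.
  return Builder.CreateFreeze(MaybePoison, "frozen");
}
// --- End editorial aside ---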
LLVM_ATTRIBUTE_ALWAYS_INLINE static inline uint64_t GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) { @@ -2826,6 +2834,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch, case ISD::UNDEF: Select_UNDEF(NodeToMatch); return; + case ISD::FREEZE: + Select_FREEZE(NodeToMatch); + return; } assert(!NodeToMatch->isMachineOpcode() && "Node already selected!"); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 6148b24e3e000..3f0c6443211ee 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -2783,6 +2783,12 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op, return 1; } +unsigned TargetLowering::computeNumSignBitsForTargetInstr( + GISelKnownBits &Analysis, Register R, const APInt &DemandedElts, + const MachineRegisterInfo &MRI, unsigned Depth) const { + return 1; +} + bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth) const { @@ -5695,8 +5701,7 @@ TargetLowering::getNegatibleCost(SDValue Op, SelectionDAG &DAG, } SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, - bool LegalOperations, - bool ForCodeSize, + bool LegalOps, bool OptForSize, unsigned Depth) const { // fneg is removable even if it has multiple uses. if (Op.getOpcode() == ISD::FNEG) @@ -5704,13 +5709,19 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, assert(Depth <= SelectionDAG::MaxRecursionDepth && "getNegatedExpression doesn't match getNegatibleCost"); + + // Pre-increment recursion depth for use in recursive calls. + ++Depth; const SDNodeFlags Flags = Op->getFlags(); + EVT VT = Op.getValueType(); + unsigned Opcode = Op.getOpcode(); + SDLoc DL(Op); - switch (Op.getOpcode()) { + switch (Opcode) { case ISD::ConstantFP: { APFloat V = cast(Op)->getValueAPF(); V.changeSign(); - return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType()); + return DAG.getConstantFP(V, DL, VT); } case ISD::BUILD_VECTOR: { SmallVector Ops; @@ -5721,60 +5732,52 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, } APFloat V = cast(C)->getValueAPF(); V.changeSign(); - Ops.push_back(DAG.getConstantFP(V, SDLoc(Op), C.getValueType())); + Ops.push_back(DAG.getConstantFP(V, DL, C.getValueType())); } - return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Ops); + return DAG.getBuildVector(VT, DL, Ops); } case ISD::FADD: { + SDValue X = Op.getOperand(0), Y = Op.getOperand(1); assert((DAG.getTarget().Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) && "Expected NSZ fp-flag"); - // fold (fneg (fadd A, B)) -> (fsub (fneg A), B) - NegatibleCost V0 = getNegatibleCost(Op.getOperand(0), DAG, LegalOperations, - ForCodeSize, Depth + 1); - if (V0 != NegatibleCost::Expensive) - return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), - getNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1), - Op.getOperand(1), Flags); - // fold (fneg (fadd A, B)) -> (fsub (fneg B), A) - return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), - getNegatedExpression(Op.getOperand(1), DAG, - LegalOperations, ForCodeSize, - Depth + 1), - Op.getOperand(0), Flags); - } - case ISD::FSUB: - // fold (fneg (fsub 0, B)) -> B - if (ConstantFPSDNode *N0CFP = - isConstOrConstSplatFP(Op.getOperand(0), /*AllowUndefs*/ true)) - if (N0CFP->isZero()) - return 
Op.getOperand(1); + // fold (fneg (fadd X, Y)) -> (fsub (fneg X), Y) + NegatibleCost CostX = getNegatibleCost(X, DAG, LegalOps, OptForSize, Depth); + if (CostX != NegatibleCost::Expensive) + return DAG.getNode( + ISD::FSUB, DL, VT, + getNegatedExpression(X, DAG, LegalOps, OptForSize, Depth), Y, Flags); - // fold (fneg (fsub A, B)) -> (fsub B, A) - return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(), - Op.getOperand(1), Op.getOperand(0), Flags); + // fold (fneg (fadd X, Y)) -> (fsub (fneg Y), X) + return DAG.getNode( + ISD::FSUB, DL, VT, + getNegatedExpression(Y, DAG, LegalOps, OptForSize, Depth), X, Flags); + } + case ISD::FSUB: { + SDValue X = Op.getOperand(0), Y = Op.getOperand(1); + // fold (fneg (fsub 0, Y)) -> Y + if (ConstantFPSDNode *C = isConstOrConstSplatFP(X, /*AllowUndefs*/ true)) + if (C->isZero()) + return Y; + // fold (fneg (fsub X, Y)) -> (fsub Y, X) + return DAG.getNode(ISD::FSUB, DL, VT, Y, X, Flags); + } case ISD::FMUL: case ISD::FDIV: { + SDValue X = Op.getOperand(0), Y = Op.getOperand(1); // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) - NegatibleCost V0 = getNegatibleCost(Op.getOperand(0), DAG, LegalOperations, - ForCodeSize, Depth + 1); - if (V0 != NegatibleCost::Expensive) - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - getNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1), - Op.getOperand(1), Flags); + NegatibleCost CostX = getNegatibleCost(X, DAG, LegalOps, OptForSize, Depth); + if (CostX != NegatibleCost::Expensive) + return DAG.getNode( + Opcode, DL, VT, + getNegatedExpression(X, DAG, LegalOps, OptForSize, Depth), Y, Flags); // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y)) return DAG.getNode( - Op.getOpcode(), SDLoc(Op), Op.getValueType(), Op.getOperand(0), - getNegatedExpression(Op.getOperand(1), DAG, LegalOperations, - ForCodeSize, Depth + 1), - Flags); + Opcode, DL, VT, X, + getNegatedExpression(Y, DAG, LegalOps, OptForSize, Depth), Flags); } case ISD::FMA: case ISD::FMAD: { @@ -5782,39 +5785,30 @@ SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, Flags.hasNoSignedZeros()) && "Expected NSZ fp-flag"); - SDValue Neg2 = getNegatedExpression(Op.getOperand(2), DAG, LegalOperations, - ForCodeSize, Depth + 1); - - NegatibleCost V0 = getNegatibleCost(Op.getOperand(0), DAG, LegalOperations, - ForCodeSize, Depth + 1); - NegatibleCost V1 = getNegatibleCost(Op.getOperand(1), DAG, LegalOperations, - ForCodeSize, Depth + 1); - if (V0 > V1) { + SDValue X = Op.getOperand(0), Y = Op.getOperand(1), Z = Op.getOperand(2); + SDValue NegZ = getNegatedExpression(Z, DAG, LegalOps, OptForSize, Depth); + NegatibleCost CostX = getNegatibleCost(X, DAG, LegalOps, OptForSize, Depth); + NegatibleCost CostY = getNegatibleCost(Y, DAG, LegalOps, OptForSize, Depth); + if (CostX > CostY) { // fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z)) - SDValue Neg0 = getNegatedExpression( - Op.getOperand(0), DAG, LegalOperations, ForCodeSize, Depth + 1); - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), Neg0, - Op.getOperand(1), Neg2, Flags); + SDValue NegX = getNegatedExpression(X, DAG, LegalOps, OptForSize, Depth); + return DAG.getNode(Opcode, DL, VT, NegX, Y, NegZ, Flags); } // fold (fneg (fma X, Y, Z)) -> (fma X, (fneg Y), (fneg Z)) - SDValue Neg1 = getNegatedExpression(Op.getOperand(1), DAG, LegalOperations, - ForCodeSize, Depth + 1); - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - Op.getOperand(0), Neg1, Neg2, Flags); + SDValue NegY = getNegatedExpression(Y, DAG, 
LegalOps, OptForSize, Depth); + return DAG.getNode(Opcode, DL, VT, X, NegY, NegZ, Flags); } case ISD::FP_EXTEND: case ISD::FSIN: - return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), - getNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1)); + return DAG.getNode(Opcode, DL, VT, + getNegatedExpression(Op.getOperand(0), DAG, LegalOps, + OptForSize, Depth)); case ISD::FP_ROUND: - return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(), - getNegatedExpression(Op.getOperand(0), DAG, - LegalOperations, ForCodeSize, - Depth + 1), + return DAG.getNode(ISD::FP_ROUND, DL, VT, + getNegatedExpression(Op.getOperand(0), DAG, LegalOps, + OptForSize, Depth), Op.getOperand(1)); } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 436857d6b2150..27440b2dd02c3 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1657,7 +1657,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { case ExtractValue: return ISD::MERGE_VALUES; case InsertValue: return ISD::MERGE_VALUES; case LandingPad: return 0; - case Freeze: return 0; + case Freeze: return ISD::FREEZE; } llvm_unreachable("Unknown instruction type encountered!"); diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 8f1c342202d6d..dedb6735b4708 100644 --- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -108,6 +108,7 @@ void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx, TM = &TgtM; CodeModel::Model CM = TgtM.getCodeModel(); + InitializeELF(TgtM.Options.UseInitArray); switch (TgtM.getTargetTriple().getArch()) { case Triple::arm: diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp index d794a261ecb2f..4866d4c171c0e 100644 --- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp +++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp @@ -45,3 +45,9 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const { bool TargetOptions::HonorSignDependentRoundingFPMath() const { return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption; } + +/// NOTE: There are targets that still do not support the debug entry values +/// production. 
+bool TargetOptions::ShouldEmitDebugEntryValues() const { + return SupportsDebugEntryValues || EnableDebugEntryValues; +} diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp index 264982983fc84..f93c4b87729b6 100644 --- a/llvm/lib/CodeGen/ValueTypes.cpp +++ b/llvm/lib/CodeGen/ValueTypes.cpp @@ -22,7 +22,8 @@ EVT EVT::changeExtendedTypeToInteger() const { EVT EVT::changeExtendedVectorElementTypeToInteger() const { LLVMContext &Context = LLVMTy->getContext(); EVT IntTy = getIntegerVT(Context, getScalarSizeInBits()); - return getVectorVT(Context, IntTy, getVectorNumElements()); + return getVectorVT(Context, IntTy, getVectorNumElements(), + isScalableVector()); } EVT EVT::getExtendedIntegerVT(LLVMContext &Context, unsigned BitWidth) { @@ -32,10 +33,19 @@ EVT EVT::getExtendedIntegerVT(LLVMContext &Context, unsigned BitWidth) { return VT; } -EVT EVT::getExtendedVectorVT(LLVMContext &Context, EVT VT, - unsigned NumElements) { +EVT EVT::getExtendedVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, + bool IsScalable) { EVT ResultVT; - ResultVT.LLVMTy = VectorType::get(VT.getTypeForEVT(Context), NumElements); + ResultVT.LLVMTy = + VectorType::get(VT.getTypeForEVT(Context), NumElements, IsScalable); + assert(ResultVT.isExtended() && "Type is not extended!"); + return ResultVT; +} + +EVT EVT::getExtendedVectorVT(LLVMContext &Context, EVT VT, ElementCount EC) { + EVT ResultVT; + ResultVT.LLVMTy = + VectorType::get(VT.getTypeForEVT(Context), {EC.Min, EC.Scalable}); assert(ResultVT.isExtended() && "Type is not extended!"); return ResultVT; } @@ -92,6 +102,14 @@ bool EVT::isExtended2048BitVector() const { return isExtendedVector() && getExtendedSizeInBits() == 2048; } +bool EVT::isExtendedFixedLengthVector() const { + return isExtendedVector() && !cast(LLVMTy)->isScalable(); +} + +bool EVT::isExtendedScalableVector() const { + return isExtendedVector() && cast(LLVMTy)->isScalable(); +} + EVT EVT::getExtendedVectorElementType() const { assert(isExtended() && "Type is not extended!"); return EVT::getEVT(cast(LLVMTy)->getElementType()); @@ -102,6 +120,11 @@ unsigned EVT::getExtendedVectorNumElements() const { return cast(LLVMTy)->getNumElements(); } +ElementCount EVT::getExtendedVectorElementCount() const { + assert(isExtended() && "Type is not extended!"); + return cast(LLVMTy)->getElementCount(); +} + TypeSize EVT::getExtendedSizeInBits() const { assert(isExtended() && "Type is not extended!"); if (IntegerType *ITy = dyn_cast(LLVMTy)) diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp index 8464c04f801ec..6444af9046b8d 100644 --- a/llvm/lib/DWARFLinker/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp @@ -1056,6 +1056,10 @@ unsigned DWARFLinker::DIECloner::cloneAddressAttribute( if (Die.getTag() == dwarf::DW_TAG_call_site) Addr = (Info.OrigCallReturnPc ? Info.OrigCallReturnPc : Addr) + Info.PCOffset; + } else if (AttrSpec.Attr == dwarf::DW_AT_call_pc) { + // Relocate the address of a branch instruction within a call site entry. + if (Die.getTag() == dwarf::DW_TAG_call_site) + Addr = (Info.OrigCallPc ? 
Info.OrigCallPc : Addr) + Info.PCOffset; } Die.addValue(DIEAlloc, static_cast(AttrSpec.Attr), @@ -1914,6 +1918,14 @@ static uint64_t getDwoId(const DWARFDie &CUDie, const DWARFUnit &Unit) { return 0; } +static std::string remapPath(StringRef Path, + const objectPrefixMap &ObjectPrefixMap) { + for (const auto &Entry : ObjectPrefixMap) + if (Path.startswith(Entry.first)) + return (Twine(Entry.second) + Path.substr(Entry.first.size())).str(); + return Path.str(); +} + bool DWARFLinker::registerModuleReference( DWARFDie CUDie, const DWARFUnit &Unit, const DwarfFile &File, OffsetsStringPool &StringPool, UniquingStringPool &UniquingStringPool, @@ -1923,6 +1935,8 @@ bool DWARFLinker::registerModuleReference( CUDie.find({dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}), ""); if (PCMfile.empty()) return false; + if (Options.ObjectPrefixMap) + PCMfile = remapPath(PCMfile, *Options.ObjectPrefixMap); // Clang module DWARF skeleton CUs abuse this for the path to the module. uint64_t DwoId = getDwoId(CUDie, Unit); diff --git a/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp index 45b462378cbbe..8c4b640bcd193 100644 --- a/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp +++ b/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp @@ -389,11 +389,9 @@ ArrayRef TypeStreamMerger::remapIndices(const CVType &OriginalType, MutableArrayRef Storage) { unsigned Align = OriginalType.RecordData.size() & 3; - unsigned AlignedSize = alignTo(OriginalType.RecordData.size(), 4); - assert(Storage.size() == AlignedSize && + assert(Storage.size() == alignTo(OriginalType.RecordData.size(), 4) && "The storage buffer size is not a multiple of 4 bytes which will " "cause misalignment in the output TPI stream!"); - (void)AlignedSize; SmallVector Refs; discoverTypeIndices(OriginalType.RecordData, Refs); diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index 66384745b58da..366f0479c93a6 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -255,8 +255,8 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData, return DirDescriptors.takeError(); // Get the directory entries, according to the format described above. - int DirEntryCount = DebugLineData.getU8(OffsetPtr); - for (int I = 0; I != DirEntryCount; ++I) { + uint64_t DirEntryCount = DebugLineData.getULEB128(OffsetPtr); + for (uint64_t I = 0; I != DirEntryCount; ++I) { for (auto Descriptor : *DirDescriptors) { DWARFFormValue Value(Descriptor.Form); switch (Descriptor.Type) { @@ -283,8 +283,8 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData, return FileDescriptors.takeError(); // Get the file entries, according to the format described above. 
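// --- Editorial aside (illustrative, not part of this patch) ---
// DWARF v5 encodes directory_entry_count and file_name_entry_count as ULEB128,
// not as a fixed u8, which is why the counts above and below switch from
// getU8() to getULEB128(). LLVM already provides llvm::decodeULEB128 in
// llvm/Support/LEB128.h; the stand-alone sketch below only shows the encoding
// these DataExtractor calls consume (names are illustrative):
#include <cstdint>
static uint64_t sketchDecodeULEB128(const uint8_t *P, unsigned &BytesRead) {
  uint64_t Value = 0;
  unsigned Shift = 0;
  const uint8_t *Start = P;
  uint8_t Byte;
  do {
    Byte = *P++;                            // low 7 bits are payload
    Value |= uint64_t(Byte & 0x7f) << Shift;
    Shift += 7;
  } while (Byte & 0x80);                    // high bit set => more bytes follow
  BytesRead = unsigned(P - Start);
  return Value;
}
// --- End editorial aside ---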
- int FileEntryCount = DebugLineData.getU8(OffsetPtr); - for (int I = 0; I != FileEntryCount; ++I) { + uint64_t FileEntryCount = DebugLineData.getULEB128(OffsetPtr); + for (uint64_t I = 0; I != FileEntryCount; ++I) { DWARFDebugLine::FileNameEntry FileEntry; for (auto Descriptor : *FileDescriptors) { DWARFFormValue Value(Descriptor.Form); @@ -1209,11 +1209,15 @@ bool DWARFDebugLine::Prologue::getFileNameByIndex( if (!Name) return false; StringRef FileName = *Name; - if (Kind == FileLineInfoKind::Default || + if (Kind == FileLineInfoKind::RawValue || isPathAbsoluteOnWindowsOrPosix(FileName)) { Result = std::string(FileName); return true; } + if (Kind == FileLineInfoKind::BaseNameOnly) { + Result = std::string(llvm::sys::path::filename(FileName)); + return true; + } SmallString<16> FilePath; StringRef IncludeDir; diff --git a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp index 223b6630d69d4..10352237763c9 100644 --- a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp +++ b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp @@ -73,8 +73,6 @@ void DIPrinter::print(const DILineInfo &Info, bool Inlined) { std::string Filename = Info.FileName; if (Filename == DILineInfo::BadString) Filename = DILineInfo::Addr2LineBadString; - else if (Basenames) - Filename = std::string(llvm::sys::path::filename(Filename)); if (!Verbose) { OS << Filename << ":" << Info.Line; if (Style == OutputStyle::LLVM) diff --git a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp index b4d49d9ff9582..9835fc039f5cb 100644 --- a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp +++ b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp @@ -35,12 +35,6 @@ using namespace llvm; using namespace object; using namespace symbolize; -static DILineInfoSpecifier -getDILineInfoSpecifier(FunctionNameKind FNKind) { - return DILineInfoSpecifier( - DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, FNKind); -} - ErrorOr> SymbolizableObjectFile::create(const object::ObjectFile *Obj, std::unique_ptr DICtx, @@ -251,16 +245,16 @@ bool SymbolizableObjectFile::shouldOverrideWithSymbolTable( DILineInfo SymbolizableObjectFile::symbolizeCode(object::SectionedAddress ModuleOffset, - FunctionNameKind FNKind, + DILineInfoSpecifier LineInfoSpecifier, bool UseSymbolTable) const { if (ModuleOffset.SectionIndex == object::SectionedAddress::UndefSection) ModuleOffset.SectionIndex = getModuleSectionIndexForAddress(ModuleOffset.Address); - DILineInfo LineInfo = DebugInfoContext->getLineInfoForAddress( - ModuleOffset, getDILineInfoSpecifier(FNKind)); + DILineInfo LineInfo = + DebugInfoContext->getLineInfoForAddress(ModuleOffset, LineInfoSpecifier); // Override function name from symbol table if necessary. 
- if (shouldOverrideWithSymbolTable(FNKind, UseSymbolTable)) { + if (shouldOverrideWithSymbolTable(LineInfoSpecifier.FNKind, UseSymbolTable)) { std::string FunctionName; uint64_t Start, Size; if (getNameFromSymbolTable(SymbolRef::ST_Function, ModuleOffset.Address, @@ -272,20 +266,20 @@ SymbolizableObjectFile::symbolizeCode(object::SectionedAddress ModuleOffset, } DIInliningInfo SymbolizableObjectFile::symbolizeInlinedCode( - object::SectionedAddress ModuleOffset, FunctionNameKind FNKind, - bool UseSymbolTable) const { + object::SectionedAddress ModuleOffset, + DILineInfoSpecifier LineInfoSpecifier, bool UseSymbolTable) const { if (ModuleOffset.SectionIndex == object::SectionedAddress::UndefSection) ModuleOffset.SectionIndex = getModuleSectionIndexForAddress(ModuleOffset.Address); DIInliningInfo InlinedContext = DebugInfoContext->getInliningInfoForAddress( - ModuleOffset, getDILineInfoSpecifier(FNKind)); + ModuleOffset, LineInfoSpecifier); // Make sure there is at least one frame in context. if (InlinedContext.getNumberOfFrames() == 0) InlinedContext.addFrame(DILineInfo()); // Override the function name in lower frame with name from symbol table. - if (shouldOverrideWithSymbolTable(FNKind, UseSymbolTable)) { + if (shouldOverrideWithSymbolTable(LineInfoSpecifier.FNKind, UseSymbolTable)) { std::string FunctionName; uint64_t Start, Size; if (getNameFromSymbolTable(SymbolRef::ST_Function, ModuleOffset.Address, diff --git a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h index b5b9793a44d99..ee5e7e745674a 100644 --- a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h +++ b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h @@ -35,10 +35,10 @@ class SymbolizableObjectFile : public SymbolizableModule { bool UntagAddresses); DILineInfo symbolizeCode(object::SectionedAddress ModuleOffset, - FunctionNameKind FNKind, + DILineInfoSpecifier LineInfoSpecifier, bool UseSymbolTable) const override; DIInliningInfo symbolizeInlinedCode(object::SectionedAddress ModuleOffset, - FunctionNameKind FNKind, + DILineInfoSpecifier LineInfoSpecifier, bool UseSymbolTable) const override; DIGlobal symbolizeData(object::SectionedAddress ModuleOffset) const override; std::vector diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp index 19a07f94a068c..768c306dfe382 100644 --- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp +++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp @@ -51,8 +51,9 @@ LLVMSymbolizer::symbolizeCodeCommon(SymbolizableModule *Info, if (Opts.RelativeAddresses) ModuleOffset.Address += Info->getModulePreferredBase(); - DILineInfo LineInfo = Info->symbolizeCode(ModuleOffset, Opts.PrintFunctions, - Opts.UseSymbolTable); + DILineInfo LineInfo = Info->symbolizeCode( + ModuleOffset, DILineInfoSpecifier(Opts.PathStyle, Opts.PrintFunctions), + Opts.UseSymbolTable); if (Opts.Demangle) LineInfo.FunctionName = DemangleName(LineInfo.FunctionName, Info); return LineInfo; @@ -103,7 +104,8 @@ LLVMSymbolizer::symbolizeInlinedCode(const std::string &ModuleName, ModuleOffset.Address += Info->getModulePreferredBase(); DIInliningInfo InlinedContext = Info->symbolizeInlinedCode( - ModuleOffset, Opts.PrintFunctions, Opts.UseSymbolTable); + ModuleOffset, DILineInfoSpecifier(Opts.PathStyle, Opts.PrintFunctions), + Opts.UseSymbolTable); if (Opts.Demangle) { for (int i = 0, n = InlinedContext.getNumberOfFrames(); i < n; i++) { auto *Frame = InlinedContext.getMutableFrame(i); diff --git 
a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp index 3905ce9bf5aca..a8e88a9785c59 100644 --- a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp @@ -165,10 +165,12 @@ void CompileOnDemandLayer::emit(MaterializationResponsibility R, return; } - R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables), - JITDylibLookupFlags::MatchAllSymbols)); - R.replace(lazyReexports(LCTMgr, PDR.getISManager(), PDR.getImplDylib(), - std::move(Callables), AliaseeImpls)); + if (!NonCallables.empty()) + R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables), + JITDylibLookupFlags::MatchAllSymbols)); + if (!Callables.empty()) + R.replace(lazyReexports(LCTMgr, PDR.getISManager(), PDR.getImplDylib(), + std::move(Callables), AliaseeImpls)); } CompileOnDemandLayer::PerDylibResources & diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp index c651fe68cb155..552a7f2ab4f69 100644 --- a/llvm/lib/ExecutionEngine/Orc/Core.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp @@ -10,10 +10,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/Config/llvm-config.h" +#include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/OrcError.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/Format.h" #include #if LLVM_ENABLE_THREADS @@ -22,122 +20,6 @@ #define DEBUG_TYPE "orc" -using namespace llvm; - -namespace { - -#ifndef NDEBUG - -cl::opt PrintHidden("debug-orc-print-hidden", cl::init(true), - cl::desc("debug print hidden symbols defined by " - "materialization units"), - cl::Hidden); - -cl::opt PrintCallable("debug-orc-print-callable", cl::init(true), - cl::desc("debug print callable symbols defined by " - "materialization units"), - cl::Hidden); - -cl::opt PrintData("debug-orc-print-data", cl::init(true), - cl::desc("debug print data symbols defined by " - "materialization units"), - cl::Hidden); - -#endif // NDEBUG - -// SetPrinter predicate that prints every element. -template struct PrintAll { - bool operator()(const T &E) { return true; } -}; - -bool anyPrintSymbolOptionSet() { -#ifndef NDEBUG - return PrintHidden || PrintCallable || PrintData; -#else - return false; -#endif // NDEBUG -} - -bool flagsMatchCLOpts(const JITSymbolFlags &Flags) { -#ifndef NDEBUG - // Bail out early if this is a hidden symbol and we're not printing hiddens. - if (!PrintHidden && !Flags.isExported()) - return false; - - // Return true if this is callable and we're printing callables. - if (PrintCallable && Flags.isCallable()) - return true; - - // Return true if this is data and we're printing data. - if (PrintData && !Flags.isCallable()) - return true; - - // otherwise return false. - return false; -#else - return false; -#endif // NDEBUG -} - -// Prints a sequence of items, filtered by an user-supplied predicate. 
-template > -class SequencePrinter { -public: - SequencePrinter(const Sequence &S, char OpenSeq, char CloseSeq, - Pred ShouldPrint = Pred()) - : S(S), OpenSeq(OpenSeq), CloseSeq(CloseSeq), - ShouldPrint(std::move(ShouldPrint)) {} - - void printTo(llvm::raw_ostream &OS) const { - bool PrintComma = false; - OS << OpenSeq; - for (auto &E : S) { - if (ShouldPrint(E)) { - if (PrintComma) - OS << ','; - OS << ' ' << E; - PrintComma = true; - } - } - OS << ' ' << CloseSeq; - } - -private: - const Sequence &S; - char OpenSeq; - char CloseSeq; - mutable Pred ShouldPrint; -}; - -template -SequencePrinter printSequence(const Sequence &S, char OpenSeq, - char CloseSeq, Pred P = Pred()) { - return SequencePrinter(S, OpenSeq, CloseSeq, std::move(P)); -} - -// Render a SequencePrinter by delegating to its printTo method. -template -llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, - const SequencePrinter &Printer) { - Printer.printTo(OS); - return OS; -} - -struct PrintSymbolFlagsMapElemsMatchingCLOpts { - bool operator()(const orc::SymbolFlagsMap::value_type &KV) { - return flagsMatchCLOpts(KV.second); - } -}; - -struct PrintSymbolMapElemsMatchingCLOpts { - bool operator()(const orc::SymbolMap::value_type &KV) { - return flagsMatchCLOpts(KV.second.getFlags()); - } -}; - -} // end anonymous namespace - namespace llvm { namespace orc { @@ -152,162 +34,6 @@ RegisterDependenciesFunction NoDependenciesToRegister = void MaterializationUnit::anchor() {} -raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym) { - return OS << *Sym; -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols) { - return OS << printSequence(Symbols, '{', '}', PrintAll()); -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolNameVector &Symbols) { - return OS << printSequence(Symbols, '[', ']', PrintAll()); -} - -raw_ostream &operator<<(raw_ostream &OS, const JITSymbolFlags &Flags) { - if (Flags.hasError()) - OS << "[*ERROR*]"; - if (Flags.isCallable()) - OS << "[Callable]"; - else - OS << "[Data]"; - if (Flags.isWeak()) - OS << "[Weak]"; - else if (Flags.isCommon()) - OS << "[Common]"; - - if (!Flags.isExported()) - OS << "[Hidden]"; - - return OS; -} - -raw_ostream &operator<<(raw_ostream &OS, const JITEvaluatedSymbol &Sym) { - return OS << format("0x%016" PRIx64, Sym.getAddress()) << " " - << Sym.getFlags(); -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap::value_type &KV) { - return OS << "(\"" << KV.first << "\", " << KV.second << ")"; -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolMap::value_type &KV) { - return OS << "(\"" << KV.first << "\": " << KV.second << ")"; -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap &SymbolFlags) { - return OS << printSequence(SymbolFlags, '{', '}', - PrintSymbolFlagsMapElemsMatchingCLOpts()); -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolMap &Symbols) { - return OS << printSequence(Symbols, '{', '}', - PrintSymbolMapElemsMatchingCLOpts()); -} - -raw_ostream &operator<<(raw_ostream &OS, - const SymbolDependenceMap::value_type &KV) { - return OS << "(" << KV.first->getName() << ", " << KV.second << ")"; -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps) { - return OS << printSequence(Deps, '{', '}', - PrintAll()); -} - -raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU) { - OS << "MU@" << &MU << " (\"" << MU.getName() << "\""; - if (anyPrintSymbolOptionSet()) - OS << ", " << MU.getSymbols(); - return OS << ")"; -} - -raw_ostream 
&operator<<(raw_ostream &OS, const LookupKind &K) { - switch (K) { - case LookupKind::Static: - return OS << "Static"; - case LookupKind::DLSym: - return OS << "DLSym"; - } - llvm_unreachable("Invalid lookup kind"); -} - -raw_ostream &operator<<(raw_ostream &OS, - const JITDylibLookupFlags &JDLookupFlags) { - switch (JDLookupFlags) { - case JITDylibLookupFlags::MatchExportedSymbolsOnly: - return OS << "MatchExportedSymbolsOnly"; - case JITDylibLookupFlags::MatchAllSymbols: - return OS << "MatchAllSymbols"; - } - llvm_unreachable("Invalid JITDylib lookup flags"); -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LookupFlags) { - switch (LookupFlags) { - case SymbolLookupFlags::RequiredSymbol: - return OS << "RequiredSymbol"; - case SymbolLookupFlags::WeaklyReferencedSymbol: - return OS << "WeaklyReferencedSymbol"; - } - llvm_unreachable("Invalid symbol lookup flags"); -} - -raw_ostream &operator<<(raw_ostream &OS, - const SymbolLookupSet::value_type &KV) { - return OS << "(" << KV.first << ", " << KV.second << ")"; -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet &LookupSet) { - return OS << printSequence(LookupSet, '{', '}', - PrintAll()); -} - -raw_ostream &operator<<(raw_ostream &OS, - const JITDylibSearchOrder &SearchOrder) { - OS << "["; - if (!SearchOrder.empty()) { - assert(SearchOrder.front().first && - "JITDylibList entries must not be null"); - OS << " (\"" << SearchOrder.front().first->getName() << "\", " - << SearchOrder.begin()->second << ")"; - for (auto &KV : - make_range(std::next(SearchOrder.begin(), 1), SearchOrder.end())) { - assert(KV.first && "JITDylibList entries must not be null"); - OS << ", (\"" << KV.first->getName() << "\", " << KV.second << ")"; - } - } - OS << " ]"; - return OS; -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases) { - OS << "{"; - for (auto &KV : Aliases) - OS << " " << *KV.first << ": " << KV.second.Aliasee << " " - << KV.second.AliasFlags; - OS << " }"; - return OS; -} - -raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S) { - switch (S) { - case SymbolState::Invalid: - return OS << "Invalid"; - case SymbolState::NeverSearched: - return OS << "Never-Searched"; - case SymbolState::Materializing: - return OS << "Materializing"; - case SymbolState::Resolved: - return OS << "Resolved"; - case SymbolState::Emitted: - return OS << "Emitted"; - case SymbolState::Ready: - return OS << "Ready"; - } - llvm_unreachable("Invalid state"); -} - FailedToMaterialize::FailedToMaterialize( std::shared_ptr Symbols) : Symbols(std::move(Symbols)) { @@ -516,8 +242,15 @@ void MaterializationResponsibility::failMaterialization() { void MaterializationResponsibility::replace( std::unique_ptr MU) { - for (auto &KV : MU->getSymbols()) + // If the replacement MU is empty then return. 
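// (Editorial note: this early return mirrors the CompileOnDemandLayer change
// earlier in the patch, which now only calls R.replace() when the reexport /
// lazy-reexport symbol sets are non-empty; replacing a responsibility set with
// a MaterializationUnit that defines nothing is presumably a pointless no-op.)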
+ if (MU->getSymbols().empty()) + return; + + for (auto &KV : MU->getSymbols()) { + assert(SymbolFlags.count(KV.first) && + "Replacing definition outside this responsibility set"); SymbolFlags.erase(KV.first); + } if (MU->getInitializerSymbol() == InitSymbol) InitSymbol = nullptr; @@ -934,7 +667,11 @@ void JITDylib::replace(std::unique_ptr MU) { "Unexpected materializer entry in map"); SymI->second.setAddress(SymI->second.getAddress()); SymI->second.setMaterializerAttached(true); - UnmaterializedInfos[KV.first] = UMI; + + auto &UMIEntry = UnmaterializedInfos[KV.first]; + assert((!UMIEntry || !UMIEntry->MU) && + "Replacing symbol with materializer still attached"); + UMIEntry = UMI; } return nullptr; @@ -2295,5 +2032,13 @@ void ExecutionSession::runOutstandingMUs() { } } +#ifndef NDEBUG +void ExecutionSession::dumpDispatchInfo(JITDylib &JD, MaterializationUnit &MU) { + runSessionLocked([&]() { + dbgs() << "Dispatching " << MU << " for " << JD.getName() << "\n"; + }); +} +#endif // NDEBUG + } // End namespace orc. } // End namespace llvm. diff --git a/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp b/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp index 8d4a8107a71ca..b816363cc547e 100644 --- a/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp +++ b/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp @@ -7,17 +7,293 @@ //===----------------------------------------------------------------------===// #include "llvm/ExecutionEngine/Orc/DebugUtils.h" + +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/Format.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/raw_ostream.h" #define DEBUG_TYPE "orc" +using namespace llvm; + +namespace { + +#ifndef NDEBUG + +cl::opt PrintHidden("debug-orc-print-hidden", cl::init(true), + cl::desc("debug print hidden symbols defined by " + "materialization units"), + cl::Hidden); + +cl::opt PrintCallable("debug-orc-print-callable", cl::init(true), + cl::desc("debug print callable symbols defined by " + "materialization units"), + cl::Hidden); + +cl::opt PrintData("debug-orc-print-data", cl::init(true), + cl::desc("debug print data symbols defined by " + "materialization units"), + cl::Hidden); + +#endif // NDEBUG + +// SetPrinter predicate that prints every element. +template struct PrintAll { + bool operator()(const T &E) { return true; } +}; + +bool anyPrintSymbolOptionSet() { +#ifndef NDEBUG + return PrintHidden || PrintCallable || PrintData; +#else + return false; +#endif // NDEBUG +} + +bool flagsMatchCLOpts(const JITSymbolFlags &Flags) { +#ifndef NDEBUG + // Bail out early if this is a hidden symbol and we're not printing hiddens. + if (!PrintHidden && !Flags.isExported()) + return false; + + // Return true if this is callable and we're printing callables. + if (PrintCallable && Flags.isCallable()) + return true; + + // Return true if this is data and we're printing data. + if (PrintData && !Flags.isCallable()) + return true; + + // otherwise return false. + return false; +#else + return false; +#endif // NDEBUG +} + +// Prints a sequence of items, filtered by an user-supplied predicate. 
+template > +class SequencePrinter { +public: + SequencePrinter(const Sequence &S, char OpenSeq, char CloseSeq, + Pred ShouldPrint = Pred()) + : S(S), OpenSeq(OpenSeq), CloseSeq(CloseSeq), + ShouldPrint(std::move(ShouldPrint)) {} + + void printTo(llvm::raw_ostream &OS) const { + bool PrintComma = false; + OS << OpenSeq; + for (auto &E : S) { + if (ShouldPrint(E)) { + if (PrintComma) + OS << ','; + OS << ' ' << E; + PrintComma = true; + } + } + OS << ' ' << CloseSeq; + } + +private: + const Sequence &S; + char OpenSeq; + char CloseSeq; + mutable Pred ShouldPrint; +}; + +template +SequencePrinter printSequence(const Sequence &S, char OpenSeq, + char CloseSeq, Pred P = Pred()) { + return SequencePrinter(S, OpenSeq, CloseSeq, std::move(P)); +} + +// Render a SequencePrinter by delegating to its printTo method. +template +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const SequencePrinter &Printer) { + Printer.printTo(OS); + return OS; +} + +struct PrintSymbolFlagsMapElemsMatchingCLOpts { + bool operator()(const orc::SymbolFlagsMap::value_type &KV) { + return flagsMatchCLOpts(KV.second); + } +}; + +struct PrintSymbolMapElemsMatchingCLOpts { + bool operator()(const orc::SymbolMap::value_type &KV) { + return flagsMatchCLOpts(KV.second.getFlags()); + } +}; + +} // end anonymous namespace + namespace llvm { namespace orc { +raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym) { + return OS << *Sym; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols) { + return OS << printSequence(Symbols, '{', '}', PrintAll()); +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolNameVector &Symbols) { + return OS << printSequence(Symbols, '[', ']', PrintAll()); +} + +raw_ostream &operator<<(raw_ostream &OS, const JITSymbolFlags &Flags) { + if (Flags.hasError()) + OS << "[*ERROR*]"; + if (Flags.isCallable()) + OS << "[Callable]"; + else + OS << "[Data]"; + if (Flags.isWeak()) + OS << "[Weak]"; + else if (Flags.isCommon()) + OS << "[Common]"; + + if (!Flags.isExported()) + OS << "[Hidden]"; + + return OS; +} + +raw_ostream &operator<<(raw_ostream &OS, const JITEvaluatedSymbol &Sym) { + return OS << format("0x%016" PRIx64, Sym.getAddress()) << " " + << Sym.getFlags(); +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap::value_type &KV) { + return OS << "(\"" << KV.first << "\", " << KV.second << ")"; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolMap::value_type &KV) { + return OS << "(\"" << KV.first << "\": " << KV.second << ")"; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap &SymbolFlags) { + return OS << printSequence(SymbolFlags, '{', '}', + PrintSymbolFlagsMapElemsMatchingCLOpts()); +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolMap &Symbols) { + return OS << printSequence(Symbols, '{', '}', + PrintSymbolMapElemsMatchingCLOpts()); +} + +raw_ostream &operator<<(raw_ostream &OS, + const SymbolDependenceMap::value_type &KV) { + return OS << "(" << KV.first->getName() << ", " << KV.second << ")"; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps) { + return OS << printSequence(Deps, '{', '}', + PrintAll()); +} + +raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU) { + OS << "MU@" << &MU << " (\"" << MU.getName() << "\""; + if (anyPrintSymbolOptionSet()) + OS << ", " << MU.getSymbols(); + return OS << ")"; +} + +raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K) { + switch (K) { + case LookupKind::Static: + return OS << "Static"; + case 
LookupKind::DLSym: + return OS << "DLSym"; + } + llvm_unreachable("Invalid lookup kind"); +} + +raw_ostream &operator<<(raw_ostream &OS, + const JITDylibLookupFlags &JDLookupFlags) { + switch (JDLookupFlags) { + case JITDylibLookupFlags::MatchExportedSymbolsOnly: + return OS << "MatchExportedSymbolsOnly"; + case JITDylibLookupFlags::MatchAllSymbols: + return OS << "MatchAllSymbols"; + } + llvm_unreachable("Invalid JITDylib lookup flags"); +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LookupFlags) { + switch (LookupFlags) { + case SymbolLookupFlags::RequiredSymbol: + return OS << "RequiredSymbol"; + case SymbolLookupFlags::WeaklyReferencedSymbol: + return OS << "WeaklyReferencedSymbol"; + } + llvm_unreachable("Invalid symbol lookup flags"); +} + +raw_ostream &operator<<(raw_ostream &OS, + const SymbolLookupSet::value_type &KV) { + return OS << "(" << KV.first << ", " << KV.second << ")"; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet &LookupSet) { + return OS << printSequence(LookupSet, '{', '}', + PrintAll()); +} + +raw_ostream &operator<<(raw_ostream &OS, + const JITDylibSearchOrder &SearchOrder) { + OS << "["; + if (!SearchOrder.empty()) { + assert(SearchOrder.front().first && + "JITDylibList entries must not be null"); + OS << " (\"" << SearchOrder.front().first->getName() << "\", " + << SearchOrder.begin()->second << ")"; + for (auto &KV : + make_range(std::next(SearchOrder.begin(), 1), SearchOrder.end())) { + assert(KV.first && "JITDylibList entries must not be null"); + OS << ", (\"" << KV.first->getName() << "\", " << KV.second << ")"; + } + } + OS << " ]"; + return OS; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases) { + OS << "{"; + for (auto &KV : Aliases) + OS << " " << *KV.first << ": " << KV.second.Aliasee << " " + << KV.second.AliasFlags; + OS << " }"; + return OS; +} + +raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S) { + switch (S) { + case SymbolState::Invalid: + return OS << "Invalid"; + case SymbolState::NeverSearched: + return OS << "Never-Searched"; + case SymbolState::Materializing: + return OS << "Materializing"; + case SymbolState::Resolved: + return OS << "Resolved"; + case SymbolState::Emitted: + return OS << "Emitted"; + case SymbolState::Ready: + return OS << "Ready"; + } + llvm_unreachable("Invalid state"); +} + DumpObjects::DumpObjects(std::string DumpDir, std::string IdentifierOverride) : DumpDir(std::move(DumpDir)), IdentifierOverride(std::move(IdentifierOverride)) { diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index 63a5b1f09c821..9451a57254931 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -175,7 +175,6 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { } Error notifyAdding(JITDylib &JD, const MaterializationUnit &MU) { - std::lock_guard Lock(PlatformSupportMutex); if (auto &InitSym = MU.getInitializerSymbol()) InitSymbols[&JD].add(InitSym); else { @@ -236,11 +235,13 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { } void registerInitFunc(JITDylib &JD, SymbolStringPtr InitName) { - std::lock_guard Lock(PlatformSupportMutex); - InitFunctions[&JD].add(InitName); + getExecutionSession().runSessionLocked([&]() { + InitFunctions[&JD].add(InitName); + }); } private: + Expected> getInitializers(JITDylib &JD) { if (auto Err = issueInitLookups(JD)) return std::move(Err); @@ -248,18 +249,17 @@ class GenericLLVMIRPlatformSupport : 
public LLJIT::PlatformSupport { DenseMap LookupSymbols; std::vector DFSLinkOrder; - { - std::lock_guard Lock(PlatformSupportMutex); - DFSLinkOrder = getDFSLinkOrder(JD); + getExecutionSession().runSessionLocked([&]() { + DFSLinkOrder = getDFSLinkOrder(JD); - for (auto *NextJD : DFSLinkOrder) { - auto IFItr = InitFunctions.find(NextJD); - if (IFItr != InitFunctions.end()) { - LookupSymbols[NextJD] = std::move(IFItr->second); - InitFunctions.erase(IFItr); + for (auto *NextJD : DFSLinkOrder) { + auto IFItr = InitFunctions.find(NextJD); + if (IFItr != InitFunctions.end()) { + LookupSymbols[NextJD] = std::move(IFItr->second); + InitFunctions.erase(IFItr); + } } - } - } + }); LLVM_DEBUG({ dbgs() << "JITDylib init order is [ "; @@ -300,21 +300,20 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { DenseMap LookupSymbols; std::vector DFSLinkOrder; - { - std::lock_guard Lock(PlatformSupportMutex); - DFSLinkOrder = getDFSLinkOrder(JD); - - for (auto *NextJD : DFSLinkOrder) { - auto &JDLookupSymbols = LookupSymbols[NextJD]; - auto DIFItr = DeInitFunctions.find(NextJD); - if (DIFItr != DeInitFunctions.end()) { - LookupSymbols[NextJD] = std::move(DIFItr->second); - DeInitFunctions.erase(DIFItr); - } - JDLookupSymbols.add(LLJITRunAtExits, + ES.runSessionLocked([&]() { + DFSLinkOrder = getDFSLinkOrder(JD); + + for (auto *NextJD : DFSLinkOrder) { + auto &JDLookupSymbols = LookupSymbols[NextJD]; + auto DIFItr = DeInitFunctions.find(NextJD); + if (DIFItr != DeInitFunctions.end()) { + LookupSymbols[NextJD] = std::move(DIFItr->second); + DeInitFunctions.erase(DIFItr); + } + JDLookupSymbols.add(LLJITRunAtExits, SymbolLookupFlags::WeaklyReferencedSymbol); } - } + }); auto LookupResult = Platform::lookupInitSymbols(ES, LookupSymbols); @@ -366,20 +365,19 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { /// JITDylibs that it depends on). 
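// (Editorial note: throughout GenericLLVMIRPlatformSupport this patch replaces
// the private PlatformSupportMutex (removed from the member list later in this
// hunk) with ExecutionSession::runSessionLocked(), so the init/deinit symbol
// bookkeeping is now serialized by the session lock rather than a separate
// mutex.)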
Error issueInitLookups(JITDylib &JD) { DenseMap RequiredInitSymbols; + std::vector DFSLinkOrder; - { - std::lock_guard Lock(PlatformSupportMutex); - - auto DFSLinkOrder = getDFSLinkOrder(JD); + getExecutionSession().runSessionLocked([&]() { + DFSLinkOrder = getDFSLinkOrder(JD); - for (auto *NextJD : DFSLinkOrder) { - auto ISItr = InitSymbols.find(NextJD); - if (ISItr != InitSymbols.end()) { - RequiredInitSymbols[NextJD] = std::move(ISItr->second); - InitSymbols.erase(ISItr); + for (auto *NextJD : DFSLinkOrder) { + auto ISItr = InitSymbols.find(NextJD); + if (ISItr != InitSymbols.end()) { + RequiredInitSymbols[NextJD] = std::move(ISItr->second); + InitSymbols.erase(ISItr); + } } - } - } + }); return Platform::lookupInitSymbols(getExecutionSession(), RequiredInitSymbols) @@ -435,7 +433,6 @@ class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport { return ThreadSafeModule(std::move(M), std::move(Ctx)); } - std::mutex PlatformSupportMutex; LLJIT &J; SymbolStringPtr InitFunctionPrefix; DenseMap InitSymbols; diff --git a/llvm/lib/ExecutionEngine/Orc/Layer.cpp b/llvm/lib/ExecutionEngine/Orc/Layer.cpp index 2bf27c44f7666..5930c6800c652 100644 --- a/llvm/lib/ExecutionEngine/Orc/Layer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/Layer.cpp @@ -8,6 +8,7 @@ #include "llvm/ExecutionEngine/Orc/Layer.h" +#include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/IR/Constants.h" #include "llvm/Object/MachO.h" diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 9a836677ef15b..d69398663aff5 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -9,6 +9,7 @@ #include "llvm/ExecutionEngine/Orc/MachOPlatform.h" #include "llvm/BinaryFormat/MachO.h" +#include "llvm/ExecutionEngine/Orc/DebugUtils.h" #include "llvm/Support/BinaryByteStream.h" #include "llvm/Support/Debug.h" @@ -163,7 +164,6 @@ Error MachOPlatform::notifyAdding(JITDylib &JD, const MaterializationUnit &MU) { if (!InitSym) return Error::success(); - std::lock_guard Lock(PlatformMutex); RegisteredInitSymbols[&JD].add(InitSym); LLVM_DEBUG({ dbgs() << "MachOPlatform: Registered init symbol " << *InitSym << " for MU " @@ -187,11 +187,10 @@ MachOPlatform::getInitializerSequence(JITDylib &JD) { std::vector DFSLinkOrder; while (true) { - // Lock the platform while we search for any initializer symbols to - // look up. + DenseMap NewInitSymbols; - { - std::lock_guard Lock(PlatformMutex); + + ES.runSessionLocked([&]() { DFSLinkOrder = getDFSLinkOrder(JD); for (auto *InitJD : DFSLinkOrder) { @@ -201,7 +200,7 @@ MachOPlatform::getInitializerSequence(JITDylib &JD) { RegisteredInitSymbols.erase(RISItr); } } - } + }); if (NewInitSymbols.empty()) break; @@ -228,7 +227,7 @@ MachOPlatform::getInitializerSequence(JITDylib &JD) { // Lock again to collect the initializers. 
InitializerSequence FullInitSeq; { - std::lock_guard Lock(PlatformMutex); + std::lock_guard Lock(InitSeqsMutex); for (auto *InitJD : reverse(DFSLinkOrder)) { LLVM_DEBUG({ dbgs() << "MachOPlatform: Appending inits for \"" << InitJD->getName() @@ -251,7 +250,7 @@ MachOPlatform::getDeinitializerSequence(JITDylib &JD) { DeinitializerSequence FullDeinitSeq; { - std::lock_guard Lock(PlatformMutex); + std::lock_guard Lock(InitSeqsMutex); for (auto *DeinitJD : DFSLinkOrder) { FullDeinitSeq.emplace_back(DeinitJD, MachOJITDylibDeinitializers()); } @@ -285,7 +284,7 @@ void MachOPlatform::registerInitInfo( MachOJITDylibInitializers::SectionExtent ModInits, MachOJITDylibInitializers::SectionExtent ObjCSelRefs, MachOJITDylibInitializers::SectionExtent ObjCClassList) { - std::lock_guard Lock(PlatformMutex); + std::lock_guard Lock(InitSeqsMutex); auto &InitSeq = InitSeqs[&JD]; @@ -384,7 +383,7 @@ void MachOPlatform::InitScraperPlugin::modifyPassConfig( else dbgs() << "none\n"; - dbgs() << "__mod_init_func: "; + dbgs() << " __mod_init_func: "; if (ModInits.NumPtrs) dbgs() << ModInits.NumPtrs << " pointer(s) at " << formatv("{0:x16}", ModInits.Address) << "\n"; diff --git a/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp index f51bd9d3b1c27..f660c4290e4ca 100644 --- a/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp +++ b/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp @@ -81,8 +81,12 @@ RTDyldObjectLinkingLayer::RTDyldObjectLinkingLayer( RTDyldObjectLinkingLayer::~RTDyldObjectLinkingLayer() { std::lock_guard Lock(RTDyldLayerMutex); - for (auto &MemMgr : MemMgrs) + for (auto &MemMgr : MemMgrs) { + for (auto *L : EventListeners) + L->notifyFreeingObject( + static_cast(reinterpret_cast(MemMgr.get()))); MemMgr->deregisterEHFrames(); + } } void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R, @@ -97,13 +101,7 @@ void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R, auto &ES = getExecutionSession(); - // Create a MemoryBufferRef backed MemoryBuffer (i.e. shallow) copy of the - // the underlying buffer to pass into RuntimeDyld. This allows us to hold - // ownership of the real underlying buffer and return it to the user once - // the object has been emitted. 
- auto ObjBuffer = MemoryBuffer::getMemBuffer(O->getMemBufferRef(), false); - - auto Obj = object::ObjectFile::createObjectFile(*ObjBuffer); + auto Obj = object::ObjectFile::createObjectFile(*O); if (!Obj) { getExecutionSession().reportError(Obj.takeError()); @@ -154,20 +152,39 @@ void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R, JITDylibSearchOrderResolver Resolver(*SharedR); jitLinkForORC( - **Obj, std::move(O), *MemMgr, Resolver, ProcessAllSections, - [this, K, SharedR, &Obj, InternalSymbols]( + object::OwningBinary(std::move(*Obj), std::move(O)), + *MemMgr, Resolver, ProcessAllSections, + [this, K, SharedR, MemMgr, InternalSymbols]( + const object::ObjectFile &Obj, std::unique_ptr LoadedObjInfo, std::map ResolvedSymbols) { - return onObjLoad(K, *SharedR, **Obj, std::move(LoadedObjInfo), + return onObjLoad(K, *SharedR, Obj, MemMgr, std::move(LoadedObjInfo), ResolvedSymbols, *InternalSymbols); }, - [this, K, SharedR, O = std::move(O)](Error Err) mutable { - onObjEmit(K, std::move(O), *SharedR, std::move(Err)); + [this, K, SharedR, MemMgr](object::OwningBinary Obj, + Error Err) mutable { + onObjEmit(K, *SharedR, std::move(Obj), MemMgr, std::move(Err)); }); } +void RTDyldObjectLinkingLayer::registerJITEventListener(JITEventListener &L) { + std::lock_guard Lock(RTDyldLayerMutex); + assert(llvm::none_of(EventListeners, + [&](JITEventListener *O) { return O == &L; }) && + "Listener has already been registered"); + EventListeners.push_back(&L); +} + +void RTDyldObjectLinkingLayer::unregisterJITEventListener(JITEventListener &L) { + std::lock_guard Lock(RTDyldLayerMutex); + auto I = llvm::find(EventListeners, &L); + assert(I != EventListeners.end() && "Listener not registered"); + EventListeners.erase(I); +} + Error RTDyldObjectLinkingLayer::onObjLoad( - VModuleKey K, MaterializationResponsibility &R, object::ObjectFile &Obj, + VModuleKey K, MaterializationResponsibility &R, + const object::ObjectFile &Obj, RuntimeDyld::MemoryManager *MemMgr, std::unique_ptr LoadedObjInfo, std::map Resolved, std::set &InternalSymbols) { @@ -252,12 +269,17 @@ Error RTDyldObjectLinkingLayer::onObjLoad( if (NotifyLoaded) NotifyLoaded(K, Obj, *LoadedObjInfo); + std::lock_guard Lock(RTDyldLayerMutex); + assert(!LoadedObjInfos.count(MemMgr) && "Duplicate loaded info for MemMgr"); + LoadedObjInfos[MemMgr] = std::move(LoadedObjInfo); + return Error::success(); } void RTDyldObjectLinkingLayer::onObjEmit( - VModuleKey K, std::unique_ptr ObjBuffer, - MaterializationResponsibility &R, Error Err) { + VModuleKey K, MaterializationResponsibility &R, + object::OwningBinary O, + RuntimeDyld::MemoryManager *MemMgr, Error Err) { if (Err) { getExecutionSession().reportError(std::move(Err)); R.failMaterialization(); @@ -270,6 +292,22 @@ void RTDyldObjectLinkingLayer::onObjEmit( return; } + std::unique_ptr Obj; + std::unique_ptr ObjBuffer; + std::tie(Obj, ObjBuffer) = O.takeBinary(); + + // Run EventListener notifyLoaded callbacks. 
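// (Editorial note: the listener support added in this file keys both
// notifyObjectLoaded() here and notifyFreeingObject() in the destructor on the
// per-object MemoryManager's address cast to JITEventListener::ObjectKey, and
// the LoadedObjInfos map carries the LoadedObjectInfo from onObjLoad() to this
// point so the callback only fires once emission has actually succeeded.)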
+ { + std::lock_guard Lock(RTDyldLayerMutex); + auto LOIItr = LoadedObjInfos.find(MemMgr); + assert(LOIItr != LoadedObjInfos.end() && "LoadedObjInfo missing"); + for (auto *L : EventListeners) + L->notifyObjectLoaded( + static_cast(reinterpret_cast(MemMgr)), *Obj, + *LOIItr->second); + LoadedObjInfos.erase(MemMgr); + } + if (NotifyEmitted) NotifyEmitted(K, std::move(ObjBuffer)); } diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index 2df71a5e5e741..5cc8ef58e906f 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -1190,16 +1190,16 @@ Error RuntimeDyldImpl::resolveExternalSymbols() { void RuntimeDyldImpl::finalizeAsync( std::unique_ptr This, - unique_function OnEmitted, - std::unique_ptr UnderlyingBuffer) { + unique_function, Error)> + OnEmitted, + object::OwningBinary O) { auto SharedThis = std::shared_ptr(std::move(This)); auto PostResolveContinuation = - [SharedThis, OnEmitted = std::move(OnEmitted), - UnderlyingBuffer = std::move(UnderlyingBuffer)]( + [SharedThis, OnEmitted = std::move(OnEmitted), O = std::move(O)]( Expected Result) mutable { if (!Result) { - OnEmitted(Result.takeError()); + OnEmitted(std::move(O), Result.takeError()); return; } @@ -1213,10 +1213,11 @@ void RuntimeDyldImpl::finalizeAsync( SharedThis->registerEHFrames(); std::string ErrMsg; if (SharedThis->MemMgr.finalizeMemory(&ErrMsg)) - OnEmitted(make_error(std::move(ErrMsg), + OnEmitted(std::move(O), + make_error(std::move(ErrMsg), inconvertibleErrorCode())); else - OnEmitted(Error::success()); + OnEmitted(std::move(O), Error::success()); }; JITSymbolResolver::LookupSet Symbols; @@ -1403,32 +1404,35 @@ void RuntimeDyld::deregisterEHFrames() { // FIXME: Kill this with fire once we have a new JIT linker: this is only here // so that we can re-use RuntimeDyld's implementation without twisting the // interface any further for ORC's purposes. 
-void jitLinkForORC(object::ObjectFile &Obj, - std::unique_ptr UnderlyingBuffer, - RuntimeDyld::MemoryManager &MemMgr, - JITSymbolResolver &Resolver, bool ProcessAllSections, - unique_function LoadedObj, - std::map)> - OnLoaded, - unique_function OnEmitted) { +void jitLinkForORC( + object::OwningBinary O, + RuntimeDyld::MemoryManager &MemMgr, JITSymbolResolver &Resolver, + bool ProcessAllSections, + unique_function< + Error(const object::ObjectFile &Obj, + std::unique_ptr LoadedObj, + std::map)> + OnLoaded, + unique_function, Error)> + OnEmitted) { RuntimeDyld RTDyld(MemMgr, Resolver); RTDyld.setProcessAllSections(ProcessAllSections); - auto Info = RTDyld.loadObject(Obj); + auto Info = RTDyld.loadObject(*O.getBinary()); if (RTDyld.hasError()) { - OnEmitted(make_error(RTDyld.getErrorString(), - inconvertibleErrorCode())); + OnEmitted(std::move(O), make_error(RTDyld.getErrorString(), + inconvertibleErrorCode())); return; } - if (auto Err = OnLoaded(std::move(Info), RTDyld.getSymbolTable())) - OnEmitted(std::move(Err)); + if (auto Err = + OnLoaded(*O.getBinary(), std::move(Info), RTDyld.getSymbolTable())) + OnEmitted(std::move(O), std::move(Err)); RuntimeDyldImpl::finalizeAsync(std::move(RTDyld.Dyld), std::move(OnEmitted), - std::move(UnderlyingBuffer)); + std::move(O)); } } // end namespace llvm diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index 3505dd77f875e..d1d2e432e7e87 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -549,9 +549,11 @@ class RuntimeDyldImpl { void resolveLocalRelocations(); - static void finalizeAsync(std::unique_ptr This, - unique_function OnEmitted, - std::unique_ptr UnderlyingBuffer); + static void finalizeAsync( + std::unique_ptr This, + unique_function, Error)> + OnEmitted, + object::OwningBinary O); void reassignSectionAddress(unsigned SectionID, uint64_t Addr); diff --git a/llvm/lib/Frontend/OpenMP/OMPContext.cpp b/llvm/lib/Frontend/OpenMP/OMPContext.cpp index 62e5d1d8c466c..748caa645a37e 100644 --- a/llvm/lib/Frontend/OpenMP/OMPContext.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPContext.cpp @@ -26,8 +26,9 @@ using namespace omp; OMPContext::OMPContext(bool IsDeviceCompilation, Triple TargetTriple) { // Add the appropriate device kind trait based on the triple and the // IsDeviceCompilation flag. - ActiveTraits.insert(IsDeviceCompilation ? TraitProperty::device_kind_nohost - : TraitProperty::device_kind_host); + ActiveTraits.set(unsigned(IsDeviceCompilation + ? 
TraitProperty::device_kind_nohost + : TraitProperty::device_kind_host)); switch (TargetTriple.getArch()) { case Triple::arm: case Triple::armeb: @@ -43,12 +44,12 @@ OMPContext::OMPContext(bool IsDeviceCompilation, Triple TargetTriple) { case Triple::ppc64le: case Triple::x86: case Triple::x86_64: - ActiveTraits.insert(TraitProperty::device_kind_cpu); + ActiveTraits.set(unsigned(TraitProperty::device_kind_cpu)); break; case Triple::amdgcn: case Triple::nvptx: case Triple::nvptx64: - ActiveTraits.insert(TraitProperty::device_kind_gpu); + ActiveTraits.set(unsigned(TraitProperty::device_kind_gpu)); break; default: break; @@ -58,7 +59,7 @@ OMPContext::OMPContext(bool IsDeviceCompilation, Triple TargetTriple) { #define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \ if (TraitSelector::TraitSelectorEnum == TraitSelector::device_arch) \ if (TargetTriple.getArch() == TargetTriple.getArchTypeForLLVMName(Str)) \ - ActiveTraits.insert(TraitProperty::Enum); + ActiveTraits.set(unsigned(TraitProperty::Enum)); #include "llvm/Frontend/OpenMP/OMPKinds.def" // TODO: What exactly do we want to see as device ISA trait? @@ -67,20 +68,22 @@ OMPContext::OMPContext(bool IsDeviceCompilation, Triple TargetTriple) { // LLVM is the "OpenMP vendor" but we could also interpret vendor as the // target vendor. - ActiveTraits.insert(TraitProperty::implementation_vendor_llvm); + ActiveTraits.set(unsigned(TraitProperty::implementation_vendor_llvm)); // The user condition true is accepted but not false. - ActiveTraits.insert(TraitProperty::user_condition_true); + ActiveTraits.set(unsigned(TraitProperty::user_condition_true)); // This is for sure some device. - ActiveTraits.insert(TraitProperty::device_kind_any); + ActiveTraits.set(unsigned(TraitProperty::device_kind_any)); LLVM_DEBUG({ dbgs() << "[" << DEBUG_TYPE << "] New OpenMP context with the following properties:\n"; - for (auto &Property : ActiveTraits) + for (const auto &SetBitsIt : ActiveTraits.set_bits()) { + TraitProperty Property = TraitProperty(SetBitsIt); dbgs() << "\t " << getOpenMPContextTraitPropertyFullName(Property) << "\n"; + } }); } @@ -122,17 +125,24 @@ static bool isStrictSubset(const VariantMatchInfo &VMI0, // If all required traits are a strict subset and the ordered vectors storing // the construct traits, we say it is a strict subset. Note that the latter // relation is not required to be strict. 
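An illustrative aside, not part of the patch: the strict-subset test that follows relies only on generic llvm::BitVector operations, the same primitives the trait sets above now use in place of a set container. A self-contained sketch of that pattern, with plain bit indices standing in for TraitProperty values:

  #include "llvm/ADT/BitVector.h"

  // True iff A is a strict subset of B: strictly fewer bits set, and every
  // bit set in A is also set in B.
  static bool isStrictBitSubset(const llvm::BitVector &A,
                                const llvm::BitVector &B) {
    if (A.count() >= B.count())
      return false;
    for (unsigned Bit : A.set_bits())
      if (!B.test(Bit))
        return false;
    return true;
  }

  // Usage: A = {1}, B = {1, 3}  ->  isStrictBitSubset(A, B) is true.
  //   llvm::BitVector A(8), B(8);
  //   A.set(1);  B.set(1);  B.set(3);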
- return set_is_strict_subset(VMI0.RequiredTraits, VMI1.RequiredTraits) && - isSubset(VMI0.ConstructTraits, VMI1.ConstructTraits); + if (VMI0.RequiredTraits.count() >= VMI1.RequiredTraits.count()) + return false; + for (const auto &SetBitsIt : VMI0.RequiredTraits.set_bits()) + if (!VMI1.RequiredTraits.test(SetBitsIt)) + return false; + if (!isSubset(VMI0.ConstructTraits, VMI1.ConstructTraits)) + return false; + return true; } static int isVariantApplicableInContextHelper( const VariantMatchInfo &VMI, const OMPContext &Ctx, SmallVectorImpl *ConstructMatches) { - for (TraitProperty Property : VMI.RequiredTraits) { + for (const auto &SetBitsIt : VMI.RequiredTraits.set_bits()) { + TraitProperty Property = TraitProperty(SetBitsIt); - bool IsActiveTrait = Ctx.ActiveTraits.count(Property); + bool IsActiveTrait = Ctx.ActiveTraits.test(unsigned(Property)); if (!IsActiveTrait) { LLVM_DEBUG(dbgs() << "[" << DEBUG_TYPE << "] Property " << getOpenMPContextTraitPropertyName(Property) @@ -181,7 +191,8 @@ static APInt getVariantMatchScore(const VariantMatchInfo &VMI, APInt Score(64, 1); unsigned NoConstructTraits = VMI.ConstructTraits.size(); - for (TraitProperty Property : VMI.RequiredTraits) { + for (const auto &SetBitsIt : VMI.RequiredTraits.set_bits()) { + TraitProperty Property = TraitProperty(SetBitsIt); // If there is a user score attached, use it. if (VMI.ScoreMap.count(Property)) { const APInt &UserScore = VMI.ScoreMap.lookup(Property); diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 408c28a9bae7d..de5b94403bd77 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -374,8 +374,14 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name.startswith("avx2.pblendd.") || // Added in 3.7 Name.startswith("avx.vbroadcastf128") || // Added in 4.0 Name == "avx2.vbroadcasti128" || // Added in 3.7 - Name.startswith("avx512.mask.broadcastf") || // Added in 6.0 - Name.startswith("avx512.mask.broadcasti") || // Added in 6.0 + Name.startswith("avx512.mask.broadcastf32x4.") || // Added in 6.0 + Name.startswith("avx512.mask.broadcastf64x2.") || // Added in 6.0 + Name.startswith("avx512.mask.broadcastf32x8.") || // Added in 6.0 + Name.startswith("avx512.mask.broadcastf64x4.") || // Added in 6.0 + Name.startswith("avx512.mask.broadcasti32x4.") || // Added in 6.0 + Name.startswith("avx512.mask.broadcasti64x2.") || // Added in 6.0 + Name.startswith("avx512.mask.broadcasti32x8.") || // Added in 6.0 + Name.startswith("avx512.mask.broadcasti64x4.") || // Added in 6.0 Name == "xop.vpcmov" || // Added in 3.8 Name == "xop.vpcmov.256" || // Added in 5.0 Name.startswith("avx512.mask.move.s") || // Added in 4.0 diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp index 3d25cb5bfbdf0..b8c57533568be 100644 --- a/llvm/lib/IR/ConstantRange.cpp +++ b/llvm/lib/IR/ConstantRange.cpp @@ -802,6 +802,8 @@ ConstantRange ConstantRange::binaryOp(Instruction::BinaryOps BinOp, return binaryAnd(Other); case Instruction::Or: return binaryOr(Other); + case Instruction::Xor: + return binaryXor(Other); // Note: floating point operations applied to abstract ranges are just // ideal integer operations with a lossy representation case Instruction::FAdd: @@ -1211,6 +1213,18 @@ ConstantRange::binaryOr(const ConstantRange &Other) const { return getNonEmpty(std::move(umax), APInt::getNullValue(getBitWidth())); } +ConstantRange ConstantRange::binaryXor(const ConstantRange &Other) const { + if (isEmptySet() || Other.isEmptySet()) + return getEmpty(); + + // Use 
APInt's implementation of XOR for single element ranges. + if (isSingleElement() && Other.isSingleElement()) + return {*getSingleElement() ^ *Other.getSingleElement()}; + + // TODO: replace this with something less conservative + return getFull(); +} + ConstantRange ConstantRange::shl(const ConstantRange &Other) const { if (isEmptySet() || Other.isEmptySet()) diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp index 78f98fd191c08..1ce17aa63bdbf 100644 --- a/llvm/lib/IR/IntrinsicInst.cpp +++ b/llvm/lib/IR/IntrinsicInst.cpp @@ -28,6 +28,8 @@ #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" + #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -178,6 +180,140 @@ bool ConstrainedFPIntrinsic::classof(const IntrinsicInst *I) { } } +ElementCount VPIntrinsic::getStaticVectorLength() const { + auto GetVectorLengthOfType = [](const Type *T) -> ElementCount { + auto VT = cast(T); + auto ElemCount = VT->getElementCount(); + return ElemCount; + }; + + auto VPMask = getMaskParam(); + return GetVectorLengthOfType(VPMask->getType()); +} + +Value *VPIntrinsic::getMaskParam() const { + auto maskPos = GetMaskParamPos(getIntrinsicID()); + if (maskPos) + return getArgOperand(maskPos.getValue()); + return nullptr; +} + +Value *VPIntrinsic::getVectorLengthParam() const { + auto vlenPos = GetVectorLengthParamPos(getIntrinsicID()); + if (vlenPos) + return getArgOperand(vlenPos.getValue()); + return nullptr; +} + +Optional VPIntrinsic::GetMaskParamPos(Intrinsic::ID IntrinsicID) { + switch (IntrinsicID) { + default: + return None; + +#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) \ + case Intrinsic::VPID: \ + return MASKPOS; +#include "llvm/IR/VPIntrinsics.def" + } +} + +Optional VPIntrinsic::GetVectorLengthParamPos(Intrinsic::ID IntrinsicID) { + switch (IntrinsicID) { + default: + return None; + +#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) \ + case Intrinsic::VPID: \ + return VLENPOS; +#include "llvm/IR/VPIntrinsics.def" + } +} + +bool VPIntrinsic::IsVPIntrinsic(Intrinsic::ID ID) { + switch (ID) { + default: + return false; + +#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) \ + case Intrinsic::VPID: \ + break; +#include "llvm/IR/VPIntrinsics.def" + } + return true; +} + +// Equivalent non-predicated opcode +unsigned VPIntrinsic::GetFunctionalOpcodeForVP(Intrinsic::ID ID) { + switch (ID) { + default: + return Instruction::Call; + +#define HANDLE_VP_TO_OC(VPID, OC) \ + case Intrinsic::VPID: \ + return Instruction::OC; +#include "llvm/IR/VPIntrinsics.def" + } +} + +Intrinsic::ID VPIntrinsic::GetForOpcode(unsigned OC) { + switch (OC) { + default: + return Intrinsic::not_intrinsic; + +#define HANDLE_VP_TO_OC(VPID, OC) \ + case Instruction::OC: \ + return Intrinsic::VPID; +#include "llvm/IR/VPIntrinsics.def" + } +} + +bool VPIntrinsic::canIgnoreVectorLengthParam() const { + using namespace PatternMatch; + + ElementCount EC = getStaticVectorLength(); + + // No vlen param - no lanes masked-off by it. + auto *VLParam = getVectorLengthParam(); + if (!VLParam) + return true; + + // Note that the VP intrinsic causes undefined behavior if the Explicit Vector + // Length parameter is strictly greater-than the number of vector elements of + // the operation. This function returns true when this is detected statically + // in the IR. 
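An illustrative aside, not part of the patch, before the body of canIgnoreVectorLengthParam continues below: how the new VPIntrinsic accessors are meant to compose from a client's point of view. The helper name is hypothetical, and it assumes VPIntrinsic participates in the usual isa/cast machinery declared in the companion header change.

  #include "llvm/IR/IntrinsicInst.h"
  using namespace llvm;

  static void inspectVPCall(IntrinsicInst &I) {
    if (!VPIntrinsic::IsVPIntrinsic(I.getIntrinsicID()))
      return;
    auto &VPI = cast<VPIntrinsic>(I);
    Value *Mask = VPI.getMaskParam();          // null when the op has no mask
    Value *EVL = VPI.getVectorLengthParam();   // null when the op has no %evl
    if (VPI.canIgnoreVectorLengthParam()) {
      // The explicit vector length provably covers the whole vector, so only
      // Mask decides which lanes are active.
    }
    (void)Mask;
    (void)EVL;
  }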
+ + // Check whether "W == vscale * EC.Min" + if (EC.Scalable) { + // Undig the DL + auto ParMod = this->getModule(); + if (!ParMod) + return false; + const auto &DL = ParMod->getDataLayout(); + + // Compare vscale patterns + uint64_t ParamFactor; + if (EC.Min > 1 && + match(VLParam, m_c_BinOp(m_ConstantInt(ParamFactor), m_VScale(DL)))) { + return ParamFactor >= EC.Min; + } + if (match(VLParam, m_VScale(DL))) { + return ParamFactor; + } + return false; + } + + // standard SIMD operation + auto VLConst = dyn_cast(VLParam); + if (!VLConst) + return false; + + uint64_t VLNum = VLConst->getZExtValue(); + if (VLNum >= EC.Min) + return true; + + return false; +} + Instruction::BinaryOps BinaryOpIntrinsic::getBinaryOp() const { switch (getIntrinsicID()) { case Intrinsic::uadd_with_overflow: diff --git a/llvm/lib/IR/KnowledgeRetention.cpp b/llvm/lib/IR/KnowledgeRetention.cpp index ec6f5712cee54..aa83d0fd6df89 100644 --- a/llvm/lib/IR/KnowledgeRetention.cpp +++ b/llvm/lib/IR/KnowledgeRetention.cpp @@ -206,6 +206,8 @@ static Value *getValueFromBundleOpInfo(IntrinsicInst &Assume, bool llvm::hasAttributeInAssume(CallInst &AssumeCI, Value *IsOn, StringRef AttrName, uint64_t *ArgVal, AssumeQuery AQR) { + assert(isa(AssumeCI) && + "this function is intended to be used on llvm.assume"); IntrinsicInst &Assume = cast(AssumeCI); assert(Assume.getIntrinsicID() == Intrinsic::assume && "this function is intended to be used on llvm.assume"); @@ -253,19 +255,19 @@ void llvm::fillMapFromAssume(CallInst &AssumeCI, RetainedKnowledgeMap &Result) { if (Key.first == nullptr && Key.second == Attribute::None) continue; if (!BundleHasArguement(Bundles, BOIE_Argument)) { - Result[Key] = {0, 0}; + Result[Key][&Assume] = {0, 0}; continue; } unsigned Val = cast( getValueFromBundleOpInfo(Assume, Bundles, BOIE_Argument)) ->getZExtValue(); auto Lookup = Result.find(Key); - if (Lookup == Result.end()) { - Result[Key] = {Val, Val}; + if (Lookup == Result.end() || !Lookup->second.count(&Assume)) { + Result[Key][&Assume] = {Val, Val}; continue; } - Lookup->second.Min = std::min(Val, Lookup->second.Min); - Lookup->second.Max = std::max(Val, Lookup->second.Max); + Lookup->second[&Assume].Min = std::min(Val, Lookup->second[&Assume].Min); + Lookup->second[&Assume].Max = std::max(Val, Lookup->second[&Assume].Max); } } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 68c920d4e54f4..c8871a1f3e641 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -4783,6 +4783,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { "Intrinsic does not support vectors", &Call); break; } + case Intrinsic::bswap: { + Type *Ty = Call.getType(); + unsigned Size = Ty->getScalarSizeInBits(); + Assert(Size % 16 == 0, "bswap must be an even number of bytes", &Call); + break; + } }; } diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt index de43626a14cca..06ee9b1c023b9 100644 --- a/llvm/lib/Support/CMakeLists.txt +++ b/llvm/lib/Support/CMakeLists.txt @@ -115,6 +115,7 @@ add_llvm_component_library(LLVMSupport MemoryBuffer.cpp MD5.cpp NativeFormatting.cpp + OptimalLayout.cpp Optional.cpp Parallel.cpp PluginLoader.cpp diff --git a/llvm/lib/Support/LockFileManager.cpp b/llvm/lib/Support/LockFileManager.cpp index a4793aa7c04fe..88489a658953f 100644 --- a/llvm/lib/Support/LockFileManager.cpp +++ b/llvm/lib/Support/LockFileManager.cpp @@ -17,12 +17,16 @@ #include "llvm/Support/Signals.h" #include "llvm/Support/raw_ostream.h" #include +#include #include #include +#include #include 
#include #include +#include #include + #ifdef _WIN32 #include #endif @@ -295,23 +299,29 @@ LockFileManager::waitForUnlock(const unsigned MaxSeconds) { if (getState() != LFS_Shared) return Res_Success; -#ifdef _WIN32 - unsigned long Interval = 1; -#else - struct timespec Interval; - Interval.tv_sec = 0; - Interval.tv_nsec = 1000000; -#endif + // Since we don't yet have an event-based method to wait for the lock file, + // implement randomized exponential backoff, similar to Ethernet collision + // algorithm. This improves performance on machines with high core counts + // when the file lock is heavily contended by multiple clang processes + const unsigned long MinWaitDurationMS = 10; + const unsigned long MaxWaitMultiplier = 50; // 500ms max wait + unsigned long WaitMultiplier = 1; + unsigned long ElapsedTimeSeconds = 0; + + std::random_device Device; + std::default_random_engine Engine(Device()); + + auto StartTime = std::chrono::steady_clock::now(); + do { + // FIXME: implement event-based waiting + // Sleep for the designated interval, to allow the owning process time to // finish up and remove the lock file. - // FIXME: Should we hook in to system APIs to get a notification when the - // lock file is deleted? -#ifdef _WIN32 - Sleep(Interval); -#else - nanosleep(&Interval, nullptr); -#endif + std::uniform_int_distribution Distribution(1, + WaitMultiplier); + unsigned long WaitDurationMS = MinWaitDurationMS * Distribution(Engine); + std::this_thread::sleep_for(std::chrono::milliseconds(WaitDurationMS)); if (sys::fs::access(LockFileName.c_str(), sys::fs::AccessMode::Exist) == errc::no_such_file_or_directory) { @@ -325,24 +335,16 @@ LockFileManager::waitForUnlock(const unsigned MaxSeconds) { if (!processStillExecuting((*Owner).first, (*Owner).second)) return Res_OwnerDied; - // Exponentially increase the time we wait for the lock to be removed. -#ifdef _WIN32 - Interval *= 2; -#else - Interval.tv_sec *= 2; - Interval.tv_nsec *= 2; - if (Interval.tv_nsec >= 1000000000) { - ++Interval.tv_sec; - Interval.tv_nsec -= 1000000000; + WaitMultiplier *= 2; + if (WaitMultiplier > MaxWaitMultiplier) { + WaitMultiplier = MaxWaitMultiplier; } -#endif - } while ( -#ifdef _WIN32 - Interval < MaxSeconds * 1000 -#else - Interval.tv_sec < (time_t)MaxSeconds -#endif - ); + + ElapsedTimeSeconds = std::chrono::duration_cast( + std::chrono::steady_clock::now() - StartTime) + .count(); + + } while (ElapsedTimeSeconds < MaxSeconds); // Give up. return Res_Timeout; diff --git a/llvm/lib/Support/OptimalLayout.cpp b/llvm/lib/Support/OptimalLayout.cpp new file mode 100644 index 0000000000000..93f912bf73a3b --- /dev/null +++ b/llvm/lib/Support/OptimalLayout.cpp @@ -0,0 +1,452 @@ +//===--- OptimalLayout.cpp - Optimal data layout algorithm ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the performOptimalLayout interface. 
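// As an illustrative usage sketch (not part of the patch; it assumes the
// OptimalLayoutField constructor declared in the corresponding header takes
// an opaque Id, a Size, an Alignment, and an optional fixed offset):
//
//   SmallVector<OptimalLayoutField, 4> Fields;
//   Fields.push_back(OptimalLayoutField(nullptr, /*Size=*/8, Align(8),
//                                       /*FixedOffset=*/0));   // fixed prefix
//   Fields.push_back(OptimalLayoutField(nullptr, /*Size=*/1, Align(1)));
//   Fields.push_back(OptimalLayoutField(nullptr, /*Size=*/4, Align(4)));
//   auto SizeAndAlign = performOptimalLayout(Fields);
//   // The 4-byte field lands at offset 8 and the 1-byte field at offset 12,
//   // so SizeAndAlign is {13, Align(8)} and no padding is introduced.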
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/OptimalLayout.h" + +using namespace llvm; + +#ifndef NDEBUG +static void checkValidLayout(ArrayRef Fields, uint64_t Size, + Align MaxAlign) { + uint64_t LastEnd = 0; + Align ComputedMaxAlign; + for (auto &Field : Fields) { + assert(Field.hasFixedOffset() && + "didn't assign a fixed offset to field"); + assert(isAligned(Field.Alignment, Field.Offset) && + "didn't assign a correctly-aligned offset to field"); + assert(Field.Offset >= LastEnd && + "didn't assign offsets in ascending order"); + LastEnd = Field.getEndOffset(); + assert(Field.Alignment <= MaxAlign && + "didn't compute MaxAlign correctly"); + ComputedMaxAlign = std::max(Field.Alignment, MaxAlign); + } + assert(LastEnd == Size && "didn't compute LastEnd correctly"); + assert(ComputedMaxAlign == MaxAlign && "didn't compute MaxAlign correctly"); +} +#endif + +std::pair +llvm::performOptimalLayout(MutableArrayRef Fields) { +#ifndef NDEBUG + // Do some simple precondition checks. + { + bool InFixedPrefix = true; + size_t LastEnd = 0; + for (auto &Field : Fields) { + assert(Field.Size > 0 && "field of zero size"); + if (Field.hasFixedOffset()) { + assert(InFixedPrefix && + "fixed-offset fields are not a strict prefix of array"); + assert(LastEnd <= Field.Offset && + "fixed-offset fields overlap or are not in order"); + LastEnd = Field.getEndOffset(); + assert(LastEnd > Field.Offset && + "overflow in fixed-offset end offset"); + } else { + InFixedPrefix = false; + } + } + } +#endif + + // Do an initial pass over the fields. + Align MaxAlign; + + // Find the first flexible-offset field, tracking MaxAlign. + auto FirstFlexible = Fields.begin(), E = Fields.end(); + while (FirstFlexible != E && FirstFlexible->hasFixedOffset()) { + MaxAlign = std::max(MaxAlign, FirstFlexible->Alignment); + ++FirstFlexible; + } + + // If there are no flexible fields, we're done. + if (FirstFlexible == E) { + uint64_t Size = 0; + if (!Fields.empty()) + Size = Fields.back().getEndOffset(); + +#ifndef NDEBUG + checkValidLayout(Fields, Size, MaxAlign); +#endif + return std::make_pair(Size, MaxAlign); + } + + // Walk over the flexible-offset fields, tracking MaxAlign and + // assigning them a unique number in order of their appearance. + // We'll use this unique number in the comparison below so that + // we can use array_pod_sort, which isn't stable. We won't use it + // past that point. + { + uintptr_t UniqueNumber = 0; + for (auto I = FirstFlexible; I != E; ++I) { + I->Scratch = reinterpret_cast(UniqueNumber++); + MaxAlign = std::max(MaxAlign, I->Alignment); + } + } + + // Sort the flexible elements in order of decreasing alignment, + // then decreasing size, and then the original order as recorded + // in Scratch. The decreasing-size aspect of this is only really + // important if we get into the gap-filling stage below, but it + // doesn't hurt here. + array_pod_sort(FirstFlexible, E, + [](const OptimalLayoutField *lhs, + const OptimalLayoutField *rhs) -> int { + // Decreasing alignment. + if (lhs->Alignment != rhs->Alignment) + return (lhs->Alignment < rhs->Alignment ? 1 : -1); + + // Decreasing size. + if (lhs->Size != rhs->Size) + return (lhs->Size < rhs->Size ? 1 : -1); + + // Original order. + auto lhsNumber = reinterpret_cast(lhs->Scratch); + auto rhsNumber = reinterpret_cast(rhs->Scratch); + if (lhsNumber != rhsNumber) + return (lhsNumber < rhsNumber ? 
-1 : 1); + + return 0; + }); + + // Do a quick check for whether that sort alone has given us a perfect + // layout with no interior padding. This is very common: if the + // fixed-layout fields have no interior padding, and they end at a + // sufficiently-aligned offset for all the flexible-layout fields, + // and the flexible-layout fields all have sizes that are multiples + // of their alignment, then this will reliably trigger. + { + bool HasPadding = false; + uint64_t LastEnd = 0; + + // Walk the fixed-offset fields. + for (auto I = Fields.begin(); I != FirstFlexible; ++I) { + assert(I->hasFixedOffset()); + if (LastEnd != I->Offset) { + HasPadding = true; + break; + } + LastEnd = I->getEndOffset(); + } + + // Walk the flexible-offset fields, optimistically assigning fixed + // offsets. Note that we maintain a strict division between the + // fixed-offset and flexible-offset fields, so if we end up + // discovering padding later in this loop, we can just abandon this + // work and we'll ignore the offsets we already assigned. + if (!HasPadding) { + for (auto I = FirstFlexible; I != E; ++I) { + auto Offset = alignTo(LastEnd, I->Alignment); + if (LastEnd != Offset) { + HasPadding = true; + break; + } + I->Offset = Offset; + LastEnd = I->getEndOffset(); + } + } + + // If we already have a perfect layout, we're done. + if (!HasPadding) { +#ifndef NDEBUG + checkValidLayout(Fields, LastEnd, MaxAlign); +#endif + return std::make_pair(LastEnd, MaxAlign); + } + } + + // The algorithm sketch at this point is as follows. + // + // Consider the padding gaps between fixed-offset fields in ascending + // order. Let LastEnd be the offset of the first byte following the + // field before the gap, or 0 if the gap is at the beginning of the + // structure. Find the "best" flexible-offset field according to the + // criteria below. If no such field exists, proceed to the next gap. + // Otherwise, add the field at the first properly-aligned offset for + // that field that is >= LastEnd, then update LastEnd and repeat in + // order to fill any remaining gap following that field. + // + // Next, let LastEnd to be the offset of the first byte following the + // last fixed-offset field, or 0 if there are no fixed-offset fields. + // While there are flexible-offset fields remaining, find the "best" + // flexible-offset field according to the criteria below, add it at + // the first properly-aligned offset for that field that is >= LastEnd, + // and update LastEnd to the first byte following the field. + // + // The "best" field is chosen by the following criteria, considered + // strictly in order: + // + // - When filling a gap betweeen fields, the field must fit. + // - A field is preferred if it requires less padding following LastEnd. + // - A field is preferred if it is more aligned. + // - A field is preferred if it is larger. + // - A field is preferred if it appeared earlier in the initial order. + // + // Minimizing leading padding is a greedy attempt to avoid padding + // entirely. Preferring more-aligned fields is an attempt to eliminate + // stricter constraints earlier, with the idea that weaker alignment + // constraints may be resolvable with less padding elsewhere. These + // These two rules are sufficient to ensure that we get the optimal + // layout in the "C-style" case. Preferring larger fields tends to take + // better advantage of large gaps and may be more likely to have a size + // that's a multiple of a useful alignment. 
Preferring the initial + // order may help somewhat with locality but is mostly just a way of + // ensuring deterministic output. + // + // Note that this algorithm does not guarantee a minimal layout. Picking + // a larger object greedily may leave a gap that cannot be filled as + // efficiently. Unfortunately, solving this perfectly is an NP-complete + // problem (by reduction from bin-packing: let B_i be the bin sizes and + // O_j be the object sizes; add fixed-offset fields such that the gaps + // between them have size B_i, and add flexible-offset fields with + // alignment 1 and size O_j; if the layout size is equal to the end of + // the last fixed-layout field, the objects fit in the bins; note that + // this doesn't even require the complexity of alignment). + + // The implementation below is essentially just an optimized version of + // scanning the list of remaining fields looking for the best, which + // would be O(n^2). In the worst case, it doesn't improve on that. + // However, in practice it'll just scan the array of alignment bins + // and consider the first few elements from one or two bins. The + // number of bins is bounded by a small constant: alignments are powers + // of two that are vanishingly unlikely to be over 64 and fairly unlikely + // to be over 8. And multiple elements only need to be considered when + // filling a gap between fixed-offset fields, which doesn't happen very + // often. We could use a data structure within bins that optimizes for + // finding the best-sized match, but it would require allocating memory + // and copying data, so it's unlikely to be worthwhile. + + + // Start by organizing the flexible-offset fields into bins according to + // their alignment. We expect a small enough number of bins that we + // don't care about the asymptotic costs of walking this. + struct AlignmentQueue { + /// The minimum size of anything currently in this queue. + uint64_t MinSize; + + /// The head of the queue. A singly-linked list. The order here should + /// be consistent with the earlier sort, i.e. the elements should be + /// monotonically descending in size and otherwise in the original order. + /// + /// We remove the queue from the array as soon as this is empty. + OptimalLayoutField *Head; + + /// The alignment requirement of the queue. + Align Alignment; + + static OptimalLayoutField *getNext(OptimalLayoutField *Cur) { + return static_cast(Cur->Scratch); + } + }; + SmallVector FlexibleFieldsByAlignment; + for (auto I = FirstFlexible; I != E; ) { + auto Head = I; + auto Alignment = I->Alignment; + + uint64_t MinSize = I->Size; + auto LastInQueue = I; + for (++I; I != E && I->Alignment == Alignment; ++I) { + LastInQueue->Scratch = I; + LastInQueue = I; + MinSize = std::min(MinSize, I->Size); + } + LastInQueue->Scratch = nullptr; + + FlexibleFieldsByAlignment.push_back({MinSize, Head, Alignment}); + } + +#ifndef NDEBUG + // Verify that we set the queues up correctly. 
+ auto checkQueues = [&]{ + bool FirstQueue = true; + Align LastQueueAlignment; + for (auto &Queue : FlexibleFieldsByAlignment) { + assert((FirstQueue || Queue.Alignment < LastQueueAlignment) && + "bins not in order of descending alignment"); + LastQueueAlignment = Queue.Alignment; + FirstQueue = false; + + assert(Queue.Head && "queue was empty"); + uint64_t LastSize = ~(uint64_t)0; + for (auto I = Queue.Head; I; I = Queue.getNext(I)) { + assert(I->Alignment == Queue.Alignment && "bad field in queue"); + assert(I->Size <= LastSize && "queue not in descending size order"); + LastSize = I->Size; + } + } + }; + checkQueues(); +#endif + + /// Helper function to remove a field from a queue. + auto spliceFromQueue = [&](AlignmentQueue *Queue, + OptimalLayoutField *Last, + OptimalLayoutField *Cur) { + assert(Last ? Queue->getNext(Last) == Cur : Queue->Head == Cur); + + // If we're removing Cur from a non-initial position, splice it out + // of the linked list. + if (Last) { + Last->Scratch = Cur->Scratch; + + // If Cur was the last field in the list, we need to update MinSize. + // We can just use the last field's size because the list is in + // descending order of size. + if (!Cur->Scratch) + Queue->MinSize = Last->Size; + + // Otherwise, replace the head. + } else { + if (auto NewHead = Queue->getNext(Cur)) + Queue->Head = NewHead; + + // If we just emptied the queue, destroy its bin. + else + FlexibleFieldsByAlignment.erase(Queue); + } + }; + + // Do layout into a local array. Doing this in-place on Fields is + // not really feasible. + SmallVector Layout; + Layout.reserve(Fields.size()); + + // The offset that we're currently looking to insert at (or after). + uint64_t LastEnd = 0; + + // Helper function to splice Cur out of the given queue and add it + // to the layout at the given offset. + auto addToLayout = [&](AlignmentQueue *Queue, + OptimalLayoutField *Last, + OptimalLayoutField *Cur, + uint64_t Offset) -> bool { + assert(Offset == alignTo(LastEnd, Cur->Alignment)); + + // Splice out. This potentially invalidates Queue. + spliceFromQueue(Queue, Last, Cur); + + // Add Cur to the layout. + Layout.push_back(*Cur); + Layout.back().Offset = Offset; + LastEnd = Layout.back().getEndOffset(); + + // Always return true so that we can be tail-called. + return true; + }; + + // Helper function to try to find a field in the given queue that'll + // fit starting at StartOffset but before EndOffset (if present). + // Note that this never fails if EndOffset is not provided. + auto tryAddFillerFromQueue = [&](AlignmentQueue *Queue, + uint64_t StartOffset, + Optional EndOffset) -> bool { + assert(Queue->Head); + assert(StartOffset == alignTo(LastEnd, Queue->Alignment)); + + // Figure out the maximum size that a field can be, and ignore this + // queue if there's nothing in it that small. + auto MaxViableSize = + (EndOffset ? *EndOffset - StartOffset : ~(uint64_t)0); + if (Queue->MinSize > MaxViableSize) return false; + + // Find the matching field. Note that this should always find + // something because of the MinSize check above. + for (OptimalLayoutField *Cur = Queue->Head, *Last = nullptr; + true; Last = Cur, Cur = Queue->getNext(Cur)) { + assert(Cur && "didn't find a match in queue despite its MinSize"); + if (Cur->Size <= MaxViableSize) + return addToLayout(Queue, Last, Cur, StartOffset); + } + + llvm_unreachable("didn't find a match in queue despite its MinSize"); + }; + + // Helper function to find the "best" flexible-offset field according + // to the criteria described above. 
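// Worked example of these criteria (an illustrative aside, not part of the
// patch): suppose LastEnd == 9 and the next fixed-offset field starts at 16,
// with flexible fields of (size 8, align 4), (size 4, align 4), (size 2,
// align 2) and (size 3, align 1) remaining.  At offset 9 only the align-1
// queue needs no leading padding, so the 3-byte field is placed at 9 and
// LastEnd becomes 12.  Offset 12 satisfies the align-4 queue; its 8-byte
// field no longer fits before 16, but the 4-byte field does, so it is placed
// at 12 and the gap closes exactly at offset 16.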
+ auto tryAddBestField = [&](Optional BeforeOffset) -> bool { + auto QueueB = FlexibleFieldsByAlignment.begin(); + auto QueueE = FlexibleFieldsByAlignment.end(); + + // Start by looking for the most-aligned queue that doesn't need any + // leading padding after LastEnd. + auto FirstQueueToSearch = QueueB; + for (; FirstQueueToSearch != QueueE; ++FirstQueueToSearch) { + if (isAligned(FirstQueueToSearch->Alignment, LastEnd)) + break; + } + + uint64_t Offset = LastEnd; + while (true) { + // Invariant: all of the queues in [FirstQueueToSearch, QueueE) + // require the same initial padding offset. + + // Search those queues in descending order of alignment for a + // satisfactory field. + for (auto Queue = FirstQueueToSearch; Queue != QueueE; ++Queue) { + if (tryAddFillerFromQueue(Queue, Offset, BeforeOffset)) + return true; + } + + // Okay, we don't need to scan those again. + QueueE = FirstQueueToSearch; + + // If we started from the first queue, we're done. + if (FirstQueueToSearch == QueueB) + return false; + + // Otherwise, scan backwards to find the most-aligned queue that + // still has minimal leading padding after LastEnd. + --FirstQueueToSearch; + Offset = alignTo(LastEnd, FirstQueueToSearch->Alignment); + while (FirstQueueToSearch != QueueB && + Offset == alignTo(LastEnd, FirstQueueToSearch[-1].Alignment)) + --FirstQueueToSearch; + } + }; + + // Phase 1: fill the gaps between fixed-offset fields with the best + // flexible-offset field that fits. + for (auto I = Fields.begin(); I != FirstFlexible; ++I) { + while (LastEnd != I->Offset) { + if (!tryAddBestField(I->Offset)) + break; + } + Layout.push_back(*I); + LastEnd = I->getEndOffset(); + } + +#ifndef NDEBUG + checkQueues(); +#endif + + // Phase 2: repeatedly add the best flexible-offset field until + // they're all gone. + while (!FlexibleFieldsByAlignment.empty()) { + bool Success = tryAddBestField(None); + assert(Success && "didn't find a field with no fixed limit?"); + (void) Success; + } + + // Copy the layout back into place. + assert(Layout.size() == Fields.size()); + memcpy(Fields.data(), Layout.data(), + Fields.size() * sizeof(OptimalLayoutField)); + +#ifndef NDEBUG + // Make a final check that the layout is valid. + checkValidLayout(Fields, LastEnd, MaxAlign); +#endif + + return std::make_pair(LastEnd, MaxAlign); +} diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index d634c123fbdc5..0eadefb689fd2 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -47,7 +47,7 @@ using namespace llvm; using llvm::sys::windows::UTF8ToUTF16; using llvm::sys::windows::CurCPToUTF16; using llvm::sys::windows::UTF16ToUTF8; -using llvm::sys::path::widenPath; +using llvm::sys::windows::widenPath; static bool is_separator(const wchar_t value) { switch (value) { @@ -61,64 +61,69 @@ static bool is_separator(const wchar_t value) { namespace llvm { namespace sys { -namespace path { +namespace windows { -// Convert a UTF-8 path to UTF-16. Also, if the absolute equivalent of the -// path is longer than CreateDirectory can tolerate, make it absolute and -// prefixed by '\\?\'. -std::error_code widenPath(const Twine &Path8, - SmallVectorImpl &Path16) { - const size_t MaxDirLen = MAX_PATH - 12; // Must leave room for 8.3 filename. +// Convert a UTF-8 path to UTF-16. Also, if the absolute equivalent of the path +// is longer than the limit that the Win32 Unicode File API can tolerate, make +// it an absolute normalized path prefixed by '\\?\'. 
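// Illustrative examples of the rewriting described above (an aside, not part
// of the patch; each input is assumed to exceed MaxPathLen once absolute):
//   C:\foo\.\bar\..\baz.txt      ->  \\?\C:\foo\baz.txt            (dots removed)
//   work\file  (cwd C:\Users\me) ->  \\?\C:\Users\me\work\file     (made absolute)
//   \\server\share\dir\file      ->  \\?\UNC\server\share\dir\file (UNC form)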
+std::error_code widenPath(const Twine &Path8, SmallVectorImpl &Path16, + size_t MaxPathLen) { + assert(MaxPathLen <= MAX_PATH); - // Several operations would convert Path8 to SmallString; more efficient to - // do it once up front. - SmallString<128> Path8Str; + // Several operations would convert Path8 to SmallString; more efficient to do + // it once up front. + SmallString Path8Str; Path8.toVector(Path8Str); - // If we made this path absolute, how much longer would it get? + if (std::error_code EC = UTF8ToUTF16(Path8Str, Path16)) + return EC; + + const bool IsAbsolute = llvm::sys::path::is_absolute(Path8); size_t CurPathLen; - if (llvm::sys::path::is_absolute(Twine(Path8Str))) + if (IsAbsolute) CurPathLen = 0; // No contribution from current_path needed. else { - CurPathLen = ::GetCurrentDirectoryW(0, NULL); + CurPathLen = ::GetCurrentDirectoryW( + 0, NULL); // Returns the size including the null terminator. if (CurPathLen == 0) return mapWindowsError(::GetLastError()); } - // Would the absolute path be longer than our limit? - if ((Path8Str.size() + CurPathLen) >= MaxDirLen && - !Path8Str.startswith("\\\\?\\")) { - SmallString<2*MAX_PATH> FullPath("\\\\?\\"); - if (CurPathLen) { - SmallString<80> CurPath; - if (std::error_code EC = llvm::sys::fs::current_path(CurPath)) - return EC; - FullPath.append(CurPath); - } - // Traverse the requested path, canonicalizing . and .. (because the \\?\ - // prefix is documented to treat them as real components). Ignore - // separators, which can be returned from the iterator if the path has a - // drive name. We don't need to call native() on the result since append() - // always attaches preferred_separator. - for (llvm::sys::path::const_iterator I = llvm::sys::path::begin(Path8Str), - E = llvm::sys::path::end(Path8Str); - I != E; ++I) { - if (I->size() == 1 && is_separator((*I)[0])) - continue; - if (I->size() == 1 && *I == ".") - continue; - if (I->size() == 2 && *I == "..") - llvm::sys::path::remove_filename(FullPath); - else - llvm::sys::path::append(FullPath, *I); - } - return UTF8ToUTF16(FullPath, Path16); + const char *const LongPathPrefix = "\\\\?\\"; + + if ((Path16.size() + CurPathLen) < MaxPathLen || + Path8Str.startswith(LongPathPrefix)) + return std::error_code(); + + if (!IsAbsolute) { + if (std::error_code EC = llvm::sys::fs::make_absolute(Path8Str)) + return EC; } - // Just use the caller's original path. - return UTF8ToUTF16(Path8Str, Path16); + // Remove '.' and '..' because long paths treat these as real path components. + llvm::sys::path::remove_dots(Path8Str, true); + + const StringRef RootName = llvm::sys::path::root_name(Path8Str); + assert(!RootName.empty() && + "Root name cannot be empty for an absolute path!"); + + // llvm::sys::path::remove_dots, used above, can leave a '/' after the root + // name and long paths must use '\' as the separator. + const size_t RootNameSize = RootName.size(); + if (RootNameSize < Path8Str.size() && Path8Str[RootNameSize] == '/') + Path8Str[RootNameSize] = '\\'; + + SmallString<2 * MAX_PATH> FullPath(LongPathPrefix); + if (RootName[1] != ':') { // Check if UNC. 
+ FullPath.append("UNC\\"); + FullPath.append(Path8Str.begin() + 2, Path8Str.end()); + } else + FullPath.append(Path8Str); + + return UTF8ToUTF16(FullPath, Path16); } -} // end namespace path + +} // end namespace windows namespace fs { @@ -227,7 +232,9 @@ std::error_code create_directory(const Twine &path, bool IgnoreExisting, perms Perms) { SmallVector path_utf16; - if (std::error_code ec = widenPath(path, path_utf16)) + // CreateDirectoryW has a lower maximum path length as it must leave room for + // an 8.3 filename. + if (std::error_code ec = widenPath(path, path_utf16, MAX_PATH - 12)) return ec; if (!::CreateDirectoryW(path_utf16.begin(), NULL)) { diff --git a/llvm/lib/Support/Windows/Program.inc b/llvm/lib/Support/Windows/Program.inc index f20538e40cc0f..48954ba047356 100644 --- a/llvm/lib/Support/Windows/Program.inc +++ b/llvm/lib/Support/Windows/Program.inc @@ -151,7 +151,7 @@ static HANDLE RedirectIO(Optional Path, int fd, if (windows::UTF8ToUTF16(fname, fnameUnicode)) return INVALID_HANDLE_VALUE; } else { - if (path::widenPath(fname, fnameUnicode)) + if (sys::windows::widenPath(fname, fnameUnicode)) return INVALID_HANDLE_VALUE; } h = CreateFileW(fnameUnicode.data(), fd ? GENERIC_WRITE : GENERIC_READ, @@ -263,7 +263,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program, fflush(stderr); SmallVector ProgramUtf16; - if (std::error_code ec = path::widenPath(Program, ProgramUtf16)) { + if (std::error_code ec = sys::windows::widenPath(Program, ProgramUtf16)) { SetLastError(ec.value()); MakeErrMsg(ErrMsg, std::string("Unable to convert application name to UTF-16")); diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 050d9830e402d..4a0b1b4d8502b 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -412,6 +412,7 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( } LLVM_FALLTHROUGH; case AArch64::DestructiveBinary: + case AArch64::DestructiveBinaryImm: std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 2, 3); break; default: @@ -430,6 +431,9 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( DstReg != MI.getOperand(DOPIdx).getReg() || MI.getOperand(DOPIdx).getReg() != MI.getOperand(SrcIdx).getReg(); break; + case AArch64::DestructiveBinaryImm: + DOPRegIsUnique = true; + break; } assert (DOPRegIsUnique && "The destructive operand should be unique"); @@ -498,6 +502,7 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)); switch (DType) { + case AArch64::DestructiveBinaryImm: case AArch64::DestructiveBinaryComm: case AArch64::DestructiveBinaryCommWithRev: DOP.add(MI.getOperand(PredIdx)) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7b189dac4abab..fdbdc42a96894 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8978,6 +8978,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.align = Align(16); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; + case Intrinsic::aarch64_sve_ld1: case Intrinsic::aarch64_sve_ldnt1: { PointerType *PtrTy = cast(I.getArgOperand(1)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -8985,9 +8986,12 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(1); Info.offset = 0; Info.align = 
MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); - Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal; + Info.flags = MachineMemOperand::MOLoad; + if (Intrinsic == Intrinsic::aarch64_sve_ldnt1) + Info.flags |= MachineMemOperand::MONonTemporal; return true; } + case Intrinsic::aarch64_sve_st1: case Intrinsic::aarch64_sve_stnt1: { PointerType *PtrTy = cast(I.getArgOperand(2)->getType()); Info.opc = ISD::INTRINSIC_W_CHAIN; @@ -8995,7 +8999,9 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(2); Info.offset = 0; Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); - Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal; + Info.flags = MachineMemOperand::MOStore; + if (Intrinsic == Intrinsic::aarch64_sve_stnt1) + Info.flags |= MachineMemOperand::MONonTemporal; return true; } default: @@ -11514,7 +11520,7 @@ static MVT getSVEContainerType(EVT ContentTy) { } } -static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { +static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); EVT VT = N->getValueType(0); EVT PtrTy = N->getOperand(3).getValueType(); @@ -11539,7 +11545,7 @@ static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { return L; } -static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { +static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) { SDLoc DL(N); SDValue Data = N->getOperand(2); @@ -12995,9 +13001,9 @@ static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) { return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops); } -/// Combines a node carrying the intrinsic `aarch64_sve_gather_prf` into a -/// node that uses `aarch64_sve_gather_prf_scaled_uxtw` when the scalar -/// offset passed to `aarch64_sve_gather_prf` is not a valid immediate for +/// Combines a node carrying the intrinsic `aarch64_sve_prf_gather` into a +/// node that uses `aarch64_sve_prf_gather_scaled_uxtw` when the scalar +/// offset passed to `aarch64_sve_prf_gather` is not a valid immediate for /// the sve gather prefetch instruction with vector plus immediate addressing /// mode. static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, @@ -13011,8 +13017,8 @@ static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG, // ...otherwise swap the offset base with the offset... SmallVector Ops(N->op_begin(), N->op_end()); std::swap(Ops[ImmPos], Ops[OffsetPos]); - // ...and remap the intrinsic `aarch64_sve_gather_prf` to - // `aarch64_sve_gather_prf_scaled_uxtw`. + // ...and remap the intrinsic `aarch64_sve_prf_gather` to + // `aarch64_sve_prf_gather_scaled_uxtw`. 
SDLoc DL(N); Ops[1] = DAG.getConstant(NewIID, DL, MVT::i64); @@ -13083,30 +13089,30 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: switch (cast(N->getOperand(1))->getZExtValue()) { - case Intrinsic::aarch64_sve_gather_prfb: + case Intrinsic::aarch64_sve_prfb_gather: return combineSVEPrefetchVecBaseImmOff( - N, DAG, Intrinsic::aarch64_sve_gather_prfb_scaled_uxtw, + N, DAG, Intrinsic::aarch64_sve_prfb_gather_scaled_uxtw, 1 /*=ScalarSizeInBytes*/); - case Intrinsic::aarch64_sve_gather_prfh: + case Intrinsic::aarch64_sve_prfh_gather: return combineSVEPrefetchVecBaseImmOff( - N, DAG, Intrinsic::aarch64_sve_gather_prfh_scaled_uxtw, + N, DAG, Intrinsic::aarch64_sve_prfh_gather_scaled_uxtw, 2 /*=ScalarSizeInBytes*/); - case Intrinsic::aarch64_sve_gather_prfw: + case Intrinsic::aarch64_sve_prfw_gather: return combineSVEPrefetchVecBaseImmOff( - N, DAG, Intrinsic::aarch64_sve_gather_prfw_scaled_uxtw, + N, DAG, Intrinsic::aarch64_sve_prfw_gather_scaled_uxtw, 4 /*=ScalarSizeInBytes*/); - case Intrinsic::aarch64_sve_gather_prfd: + case Intrinsic::aarch64_sve_prfd_gather: return combineSVEPrefetchVecBaseImmOff( - N, DAG, Intrinsic::aarch64_sve_gather_prfd_scaled_uxtw, + N, DAG, Intrinsic::aarch64_sve_prfd_gather_scaled_uxtw, 8 /*=ScalarSizeInBytes*/); - case Intrinsic::aarch64_sve_gather_prfb_scaled_uxtw: - case Intrinsic::aarch64_sve_gather_prfb_scaled_sxtw: - case Intrinsic::aarch64_sve_gather_prfh_scaled_uxtw: - case Intrinsic::aarch64_sve_gather_prfh_scaled_sxtw: - case Intrinsic::aarch64_sve_gather_prfw_scaled_uxtw: - case Intrinsic::aarch64_sve_gather_prfw_scaled_sxtw: - case Intrinsic::aarch64_sve_gather_prfd_scaled_uxtw: - case Intrinsic::aarch64_sve_gather_prfd_scaled_sxtw: + case Intrinsic::aarch64_sve_prfb_gather_scaled_uxtw: + case Intrinsic::aarch64_sve_prfb_gather_scaled_sxtw: + case Intrinsic::aarch64_sve_prfh_gather_scaled_uxtw: + case Intrinsic::aarch64_sve_prfh_gather_scaled_sxtw: + case Intrinsic::aarch64_sve_prfw_gather_scaled_uxtw: + case Intrinsic::aarch64_sve_prfw_gather_scaled_sxtw: + case Intrinsic::aarch64_sve_prfd_gather_scaled_uxtw: + case Intrinsic::aarch64_sve_prfd_gather_scaled_sxtw: return legalizeSVEGatherPrefetchOffsVec(N, DAG); case Intrinsic::aarch64_neon_ld2: case Intrinsic::aarch64_neon_ld3: @@ -13130,8 +13136,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: return performNEONPostLDSTCombine(N, DCI, DAG); + case Intrinsic::aarch64_sve_ld1: case Intrinsic::aarch64_sve_ldnt1: - return performLDNT1Combine(N, DAG); + return performLD1Combine(N, DAG); case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset: return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1); case Intrinsic::aarch64_sve_ldnt1_gather: @@ -13144,8 +13151,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return performLDNF1Combine(N, DAG, AArch64ISD::LDNF1); case Intrinsic::aarch64_sve_ldff1: return performLDNF1Combine(N, DAG, AArch64ISD::LDFF1); + case Intrinsic::aarch64_sve_st1: case Intrinsic::aarch64_sve_stnt1: - return performSTNT1Combine(N, DAG); + return performST1Combine(N, DAG); case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset: return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1); case Intrinsic::aarch64_sve_stnt1_scatter_uxtw: @@ -13926,3 +13934,16 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const { bool AArch64TargetLowering::needsFixedCatchObjects() const { return false; } + 
+bool AArch64TargetLowering::shouldLocalize( + const MachineInstr &MI, const TargetTransformInfo *TTI) const { + if (MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE) { + // On Darwin, TLS global vars get selected into function calls, which + // we don't want localized, as they can get moved into the middle of a + // another call sequence. + const GlobalValue &GV = *MI.getOperand(1).getGlobal(); + if (GV.isThreadLocal() && Subtarget->isTargetMachO()) + return false; + } + return TargetLoweringBase::shouldLocalize(MI, TTI); +} diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 8394c90a11be3..0932fb062a79c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -859,6 +859,9 @@ class AArch64TargetLowering : public TargetLowering { bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override; void finalizeLowering(MachineFunction &MF) const override; + + bool shouldLocalize(const MachineInstr &MI, + const TargetTransformInfo *TTI) const override; }; namespace AArch64 { diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index d3a541d0246b5..7395f24f2118d 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -880,37 +880,37 @@ multiclass sve_prefetch; - defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16, int_aarch64_sve_gather_prfh_scaled_sxtw, int_aarch64_sve_gather_prfh_scaled_uxtw>; - defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32, int_aarch64_sve_gather_prfw_scaled_sxtw, int_aarch64_sve_gather_prfw_scaled_uxtw>; - defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64, int_aarch64_sve_gather_prfd_scaled_sxtw, int_aarch64_sve_gather_prfd_scaled_uxtw>; + defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, int_aarch64_sve_prfb_gather_scaled_sxtw, int_aarch64_sve_prfb_gather_scaled_uxtw>; + defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16, int_aarch64_sve_prfh_gather_scaled_sxtw, int_aarch64_sve_prfh_gather_scaled_uxtw>; + defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32, int_aarch64_sve_prfw_gather_scaled_sxtw, int_aarch64_sve_prfw_gather_scaled_uxtw>; + defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64, int_aarch64_sve_prfd_gather_scaled_sxtw, int_aarch64_sve_prfd_gather_scaled_uxtw>; // Gather prefetch using unpacked, scaled 32-bit offsets, e.g. 
// prfh pldl1keep, p0, [x0, z0.d, uxtw #1] - defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_gather_prfb_scaled_sxtw, int_aarch64_sve_gather_prfb_scaled_uxtw>; - defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16, int_aarch64_sve_gather_prfh_scaled_sxtw, int_aarch64_sve_gather_prfh_scaled_uxtw>; - defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32, int_aarch64_sve_gather_prfw_scaled_sxtw, int_aarch64_sve_gather_prfw_scaled_uxtw>; - defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64, int_aarch64_sve_gather_prfd_scaled_sxtw, int_aarch64_sve_gather_prfd_scaled_uxtw>; + defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_prfb_gather_scaled_sxtw, int_aarch64_sve_prfb_gather_scaled_uxtw>; + defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16, int_aarch64_sve_prfh_gather_scaled_sxtw, int_aarch64_sve_prfh_gather_scaled_uxtw>; + defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32, int_aarch64_sve_prfw_gather_scaled_sxtw, int_aarch64_sve_prfw_gather_scaled_uxtw>; + defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64, int_aarch64_sve_prfd_gather_scaled_sxtw, int_aarch64_sve_prfd_gather_scaled_uxtw>; // Gather prefetch using scaled 64-bit offsets, e.g. // prfh pldl1keep, p0, [x0, z0.d, lsl #1] - defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8, int_aarch64_sve_gather_prfb_scaled>; - defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_gather_prfh_scaled>; - defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_gather_prfw_scaled>; - defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_gather_prfd_scaled>; + defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8, int_aarch64_sve_prfb_gather_scaled>; + defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_prfh_gather_scaled>; + defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_prfw_gather_scaled>; + defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_prfd_gather_scaled>; // Gather prefetch using 32/64-bit pointers with offset, e.g. 
// prfh pldl1keep, p0, [z0.s, #16] // prfh pldl1keep, p0, [z0.d, #16] - defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_gather_prfb>; - defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_gather_prfh>; - defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_gather_prfw>; - defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_gather_prfd>; + defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather>; + defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather>; + defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather>; + defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather>; - defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_gather_prfb>; - defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_gather_prfh>; - defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_gather_prfw>; - defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_gather_prfd>; + defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather>; + defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather>; + defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather>; + defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather>; defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">; defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">; @@ -1131,17 +1131,22 @@ multiclass sve_prefetch; // Predicated shifts - defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr">; - defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr">; + defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr", "ASR_ZPZI">; + defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr", "LSR_ZPZI">; defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">; - defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", int_aarch64_sve_asrd>; + defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>; - defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", int_aarch64_sve_asr>; - defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", int_aarch64_sve_lsr>; - defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", int_aarch64_sve_lsl>; - defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", null_frag>; - defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", null_frag>; - defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", null_frag>; + defm ASR_ZPZZ : sve_int_bin_pred_zx; + defm LSR_ZPZZ : sve_int_bin_pred_zx; + defm LSL_ZPZZ : sve_int_bin_pred_zx; + defm ASRD_ZPZI : sve_int_bin_pred_shift_0_right_zx; + + defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", int_aarch64_sve_asr, "ASRR_ZPmZ", 1>; + defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", int_aarch64_sve_lsr, "LSRR_ZPmZ", 1>; + defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", int_aarch64_sve_lsl, "LSLR_ZPmZ", 1>; + defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", "ASRR_ZPZZ", null_frag, "ASR_ZPmZ", 0>; + defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", 0>; + defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", 0>; defm ASR_WIDE_ZPmZ : 
sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>; defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>; @@ -1777,10 +1782,10 @@ let Predicates = [HasSVE2] in { defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag>; // SVE2 predicated shifts - defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">; - defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">; - defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", int_aarch64_sve_srshr>; - defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", int_aarch64_sve_urshr>; + defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl", "SQSHL_ZPZI">; + defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl", "UQSHL_ZPZI">; + defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>; + defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>; defm SQSHLU_ZPmI : sve2_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", int_aarch64_sve_sqshlu>; // SVE2 integer add/subtract long diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index a5676d286ebe0..62ae04a078081 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -309,6 +309,9 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT, // AArch64 supports default outlining behaviour. setSupportsDefaultOutlining(true); + + // AArch64 supports the debug entry values. + setSupportsDebugEntryValues(true); } AArch64TargetMachine::~AArch64TargetMachine() = default; diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp index cf049ec274f2c..dfc66f0cb4c16 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -20,7 +20,6 @@ using namespace dwarf; void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); // AARCH64 ELF ABI does not define static relocation type for TLS offset // within a module. Do not generate AT_location for TLS variables. 
SupportDebugThreadLocalLocation = false; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 3937d6390c4da..6b4924b8f2259 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -375,6 +375,12 @@ class SVE_3_Op_Pat_SelZero : Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))), (inst $Op1, $Op2, $Op3)>; + +class SVE_3_Op_Pat_Shift_Imm_SelZero +: Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), (i32 (vt3:$Op3)))), + (inst $Op1, $Op2, vt3:$Op3)>; } // @@ -433,6 +439,13 @@ let hasNoSchedulingInfo = 1 in { Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2), []> { let FalseLanes = flags; } + + class PredTwoOpImmPseudo + : SVEPseudo2Instr, + Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> { + let FalseLanes = flags; + } } //===----------------------------------------------------------------------===// @@ -4692,19 +4705,23 @@ class sve_int_bin_pred_shift_imm tsz8_64, bits<4> opc, string asm, let Inst{4-0} = Zdn; let Constraints = "$Zdn = $_Zdn"; - let DestructiveInstType = DestructiveOther; + let DestructiveInstType = DestructiveBinaryImm; let ElementSize = zprty.ElementSize; } -multiclass sve_int_bin_pred_shift_imm_left opc, string asm> { - def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; - def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { +multiclass sve_int_bin_pred_shift_imm_left opc, string asm, string psName=""> { + def _B : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>; + def _H : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> { let Inst{8} = imm{3}; } - def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { + def _S : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> { let Inst{9-8} = imm{4-3}; } - def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { + def _D : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> { let Inst{22} = imm{5}; let Inst{9-8} = imm{4-3}; } @@ -4730,16 +4747,20 @@ multiclass sve2_int_bin_pred_shift_imm_left opc, string asm, def : SVE_3_Op_Imm_Pat(NAME # _D)>; } -multiclass sve_int_bin_pred_shift_imm_right opc, string asm, +multiclass sve_int_bin_pred_shift_imm_right opc, string asm, string Ps, SDPatternOperator op = null_frag> { - def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; - def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { + def _B : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>; + def _H : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> { let Inst{8} = imm{3}; } - def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { + def _S : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> { let Inst{9-8} = imm{4-3}; } - def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { + def _D : SVEPseudo2Instr, + sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> { let Inst{22} = imm{5}; let Inst{9-8} = imm{4-3}; } @@ -4750,6 +4771,18 @@ multiclass sve_int_bin_pred_shift_imm_right opc, string asm, def : SVE_3_Op_Imm_Pat(NAME # _D)>; } +multiclass sve_int_bin_pred_shift_0_right_zx { + def _ZERO_B : 
PredTwoOpImmPseudo; + def _ZERO_H : PredTwoOpImmPseudo; + def _ZERO_S : PredTwoOpImmPseudo; + def _ZERO_D : PredTwoOpImmPseudo; + + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_Shift_Imm_SelZero(NAME # _ZERO_D)>; +} + class sve_int_bin_pred_shift sz8_64, bit wide, bits<3> opc, string asm, ZPRRegOp zprty, ZPRRegOp zprty2> : I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, zprty2:$Zm), @@ -4774,19 +4807,36 @@ class sve_int_bin_pred_shift sz8_64, bit wide, bits<3> opc, let ElementSize = zprty.ElementSize; } -multiclass sve_int_bin_pred_shift opc, string asm, - SDPatternOperator op> { - def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>; - def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>; - def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>; - def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>; - +multiclass sve_int_bin_pred_shift opc, string asm, string Ps, + SDPatternOperator op, string revname, bit isOrig> { + let DestructiveInstType = DestructiveBinaryCommWithRev in { + def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>, + SVEPseudo2Instr, SVEInstr2Rev; + def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>, + SVEPseudo2Instr, SVEInstr2Rev; + def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>, + SVEPseudo2Instr, SVEInstr2Rev; + def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>, + SVEPseudo2Instr, SVEInstr2Rev; + } def : SVE_3_Op_Pat(NAME # _B)>; def : SVE_3_Op_Pat(NAME # _H)>; def : SVE_3_Op_Pat(NAME # _S)>; def : SVE_3_Op_Pat(NAME # _D)>; } +multiclass sve_int_bin_pred_zx { + def _ZERO_B : PredTwoOpPseudo; + def _ZERO_H : PredTwoOpPseudo; + def _ZERO_S : PredTwoOpPseudo; + def _ZERO_D : PredTwoOpPseudo; + + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_B)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_H)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_S)>; + def : SVE_3_Op_Pat_SelZero(NAME # _ZERO_D)>; +} + multiclass sve_int_bin_pred_shift_wide opc, string asm, SDPatternOperator op> { def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index e7e5e132ecbd3..86586b3009ad0 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -800,8 +800,6 @@ bool AMDGPUCallLowering::lowerFormalArguments( TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); - CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); - CCInfo.AllocateReg(Info->getFrameOffsetReg()); TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 4265e9992a500..269434d31e21d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1017,8 +1017,14 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) { SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue); - unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; - unsigned CarryOpc = IsAdd ? 
AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; + static const unsigned OpcMap[2][2][2] = { + {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32}, + {AMDGPU::V_SUB_I32_e32, AMDGPU::V_ADD_I32_e32}}, + {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32}, + {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}}; + + unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd]; + unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd]; SDNode *AddLo; if (!ConsumeCarry) { @@ -1474,6 +1480,7 @@ static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { } std::pair AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const { + SDLoc DL(N); const MachineFunction &MF = CurDAG->getMachineFunction(); const SIMachineFunctionInfo *Info = MF.getInfo(); @@ -1488,9 +1495,8 @@ std::pair AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const } // If we don't know this private access is a local stack object, it needs to - // be relative to the entry point's scratch wave offset register. - return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(), - MVT::i32)); + // be relative to the entry point's scratch wave offset. + return std::make_pair(N, CurDAG->getTargetConstant(0, DL, MVT::i32)); } bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, @@ -1515,10 +1521,10 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent, // In a call sequence, stores to the argument stack area are relative to the // stack pointer. const MachinePointerInfo &PtrInfo = cast(Parent)->getPointerInfo(); - unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ? - Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg(); - SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32); + SOffset = isStackPtrRelative(PtrInfo) + ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32) + : CurDAG->getTargetConstant(0, DL, MVT::i32); ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16); return true; } @@ -1576,12 +1582,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32); const MachinePointerInfo &PtrInfo = cast(Parent)->getPointerInfo(); - unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ? - Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg(); // FIXME: Get from MachinePointerInfo? We should only be using the frame // offset if we know this is in a call sequence. - SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32); + SOffset = isStackPtrRelative(PtrInfo) + ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32) + : CurDAG->getTargetConstant(0, DL, MVT::i32); Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16); return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index bee0605bd556d..bdf5c3b036635 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -4587,6 +4587,29 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode( } } +unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr( + GISelKnownBits &Analysis, Register R, + const APInt &DemandedElts, const MachineRegisterInfo &MRI, + unsigned Depth) const { + const MachineInstr *MI = MRI.getVRegDef(R); + if (!MI) + return 1; + + // TODO: Check range metadata on MMO. 
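The OpcMap table above folds the uniform/divergent and add/subtract choices into one constant lookup instead of nested conditionals: the first index selects the low-half versus carry-consuming opcode, the second the scalar versus vector form, the third subtract versus add. A minimal standalone sketch of the same pattern, using placeholder enum values rather than the real AMDGPU opcode numbers:

// Sketch only: placeholder opcodes standing in for the S_*/V_* instructions.
enum Opcode {
  S_SUB_U32, S_ADD_U32, V_SUB_I32, V_ADD_I32,
  S_SUBB_U32, S_ADDC_U32, V_SUBB_U32, V_ADDC_U32
};

// [carry-stage][divergent][isAdd] -> opcode, same shape as OpcMap above.
static const Opcode OpcTable[2][2][2] = {
    {{S_SUB_U32, S_ADD_U32}, {V_SUB_I32, V_ADD_I32}},
    {{S_SUBB_U32, S_ADDC_U32}, {V_SUBB_U32, V_ADDC_U32}}};

int main() {
  bool IsAdd = true, IsDivergent = false;
  Opcode Lo = OpcTable[0][IsDivergent][IsAdd];    // selects S_ADD_U32
  Opcode Carry = OpcTable[1][IsDivergent][IsAdd]; // selects S_ADDC_U32
  return (Lo == S_ADD_U32 && Carry == S_ADDC_U32) ? 0 : 1;
}

The constants in the computeNumSignBitsForTargetInstr switch that follows are likewise simple arithmetic: an N-bit sign-extending load widened to 32 bits has at least 32 - N + 1 leading bits equal to the sign bit, and an N-bit zero-extending load has at least 32 - N leading zeros. A compile-time check of those values:

// 25/17/24/16 below come from the extension width of each buffer load.
constexpr unsigned signBitsOfSExtTo32(unsigned SrcBits) { return 32 - SrcBits + 1; }
constexpr unsigned signBitsOfZExtTo32(unsigned SrcBits) { return 32 - SrcBits; }
static_assert(signBitsOfSExtTo32(8)  == 25, "BUFFER_LOAD_SBYTE");
static_assert(signBitsOfSExtTo32(16) == 17, "BUFFER_LOAD_SSHORT");
static_assert(signBitsOfZExtTo32(8)  == 24, "BUFFER_LOAD_UBYTE");
static_assert(signBitsOfZExtTo32(16) == 16, "BUFFER_LOAD_USHORT");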
+ switch (MI->getOpcode()) { + case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE: + return 25; + case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT: + return 17; + case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE: + return 24; + case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT: + return 16; + default: + return 1; + } +} + bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index 54747b57f6f4d..7d0b17f7e8164 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -269,6 +269,12 @@ class AMDGPUTargetLowering : public TargetLowering { const SelectionDAG &DAG, unsigned Depth = 0) const override; + unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis, + Register R, + const APInt &DemandedElts, + const MachineRegisterInfo &MRI, + unsigned Depth = 0) const override; + bool isKnownNeverNaNForTargetNode(SDValue Op, const SelectionDAG &DAG, bool SNaN = false, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index b79c44604dabf..f3cfbc1061ff7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2688,10 +2688,10 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { const MachineMemOperand *MMO = *MI->memoperands_begin(); const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); - Register SOffsetReg = isStackPtrRelative(PtrInfo) - ? Info->getStackPtrOffsetReg() - : Info->getScratchWaveOffsetReg(); - MIB.addReg(SOffsetReg); + if (isStackPtrRelative(PtrInfo)) + MIB.addReg(Info->getStackPtrOffsetReg()); + else + MIB.addImm(0); }, [=](MachineInstrBuilder &MIB) { // offset MIB.addImm(Offset & 4095); @@ -2728,13 +2728,6 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { } } - // If we don't know this private access is a local stack object, it needs to - // be relative to the entry point's scratch wave offset register. - // TODO: Should split large offsets that don't fit like above. - // TODO: Don't use scratch wave offset just because the offset didn't fit. - Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg() - : Info->getScratchWaveOffsetReg(); - return {{[=](MachineInstrBuilder &MIB) { // rsrc MIB.addReg(Info->getScratchRSrcReg()); }, @@ -2745,7 +2738,15 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const { MIB.addReg(VAddr); }, [=](MachineInstrBuilder &MIB) { // soffset - MIB.addReg(SOffset); + // If we don't know this private access is a local stack object, it + // needs to be relative to the entry point's scratch wave offset. + // TODO: Should split large offsets that don't fit like above. + // TODO: Don't use scratch wave offset just because the offset + // didn't fit. + if (!Info->isEntryFunction() && FI.hasValue()) + MIB.addReg(Info->getStackPtrOffsetReg()); + else + MIB.addImm(0); }, [=](MachineInstrBuilder &MIB) { // offset MIB.addImm(Offset); @@ -2783,15 +2784,17 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset( const MachineMemOperand *MMO = *MI->memoperands_begin(); const MachinePointerInfo &PtrInfo = MMO->getPointerInfo(); - Register SOffsetReg = isStackPtrRelative(PtrInfo) - ? 
Info->getStackPtrOffsetReg() - : Info->getScratchWaveOffsetReg(); return {{ - [=](MachineInstrBuilder &MIB) { + [=](MachineInstrBuilder &MIB) { // rsrc MIB.addReg(Info->getScratchRSrcReg()); - }, // rsrc - [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset - [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset + }, + [=](MachineInstrBuilder &MIB) { // soffset + if (isStackPtrRelative(PtrInfo)) + MIB.addReg(Info->getStackPtrOffsetReg()); + else + MIB.addImm(0); + }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset }}; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 47866501776d8..01fd8483e5c14 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -3548,12 +3548,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( // Rewrite the addressing register layout before doing anything else. if (IsA16) { -#if 0 // FIXME: this feature is missing from gfx10. When that is fixed, this check // should be introduced. - if (!ST.hasFeature(AMDGPU::FeatureR128A16)) + if (!ST.hasR128A16() && !ST.hasGFX10A16()) return false; -#endif if (NumVAddrs > 1) { SmallVector PackedRegs; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h index 13fa9154fb8b3..5fdcd71537caf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -16,6 +16,7 @@ #include "AMDGPU.h" #include "AMDGPUCallLowering.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "R600FrameLowering.h" #include "R600ISelLowering.h" #include "R600InstrInfo.h" @@ -246,6 +247,13 @@ class AMDGPUSubtarget { uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const; unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const; + /// \returns Corresponsing DWARF register number mapping flavour for the + /// \p WavefrontSize. + AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const { + return WavefrontSize == 32 ? 
AMDGPUDwarfFlavour::Wave32 + : AMDGPUDwarfFlavour::Wave64; + } + virtual ~AMDGPUSubtarget() {} }; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 7553616a86dcb..a1c20220ae264 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -23,6 +23,7 @@ #include "AMDGPUTargetTransformInfo.h" #include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "R600MachineScheduler.h" #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" @@ -375,6 +376,12 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT, getEffectiveCodeModel(CM, CodeModel::Small), OptLevel), TLOF(createTLOF(getTargetTriple())) { initAsmInfo(); + if (TT.getArch() == Triple::amdgcn) { + if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64")) + MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64)); + else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32")) + MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32)); + } } bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false; @@ -1051,11 +1058,14 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->initializeBaseYamlFields(YamlMFI); - auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) { - if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) { + auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) { + // FIXME: Update parseNamedRegsiterReference to take a Register. + unsigned TempReg; + if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) { SourceRange = RegName.SourceRange; return true; } + RegVal = TempReg; return false; }; @@ -1073,7 +1083,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo( }; if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) || - parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) || parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) || parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg)) return true; @@ -1083,11 +1092,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo( return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg); } - if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG && - !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) { - return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg); - } - if (MFI->FrameOffsetReg != AMDGPU::FP_REG && !AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) { return diagnoseRegisterClass(YamlMFI.FrameOffsetReg); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index caa3a4aa31f4f..318e536da6aa4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -476,11 +476,24 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty, Opd1PropInfo, Opd2PropInfo); } +// Return true if there's a potential benefit from using v2f16 instructions for +// an intrinsic, even if it requires nontrivial legalization. +static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::fma: // TODO: fmuladd + // There's a small benefit to using vector ops in the legalized code. 
+ case Intrinsic::round: + return true; + default: + return false; + } +} + template int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef Args, FastMathFlags FMF, unsigned VF, const Instruction *I) { - if (ID != Intrinsic::fma) + if (!intrinsicHasPackedVectorBenefit(ID)) return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF, I); EVT OrigTy = TLI->getValueType(DL, RetTy); @@ -502,8 +515,14 @@ int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, if (ST->has16BitInsts() && SLT == MVT::f16) NElts = (NElts + 1) / 2; - return LT.first * NElts * (ST->hasFastFMAF32() ? getHalfRateInstrCost() - : getQuarterRateInstrCost()); + // TODO: Get more refined intrinsic costs? + unsigned InstRate = getQuarterRateInstrCost(); + if (ID == Intrinsic::fma) { + InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost() + : getQuarterRateInstrCost(); + } + + return LT.first * NElts * InstRate; } int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index df843f4034efc..0f7db76babb0c 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -308,7 +308,6 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O, switch (RegNo) { case AMDGPU::FP_REG: case AMDGPU::SP_REG: - case AMDGPU::SCRATCH_WAVE_OFFSET_REG: case AMDGPU::PRIVATE_RSRC_REG: llvm_unreachable("pseudo-register should not ever be emitted"); case AMDGPU::SCC: diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 671c44e2e36cf..7d3235efc59e6 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -65,6 +65,12 @@ static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) { return X; } +MCRegisterInfo *llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitAMDGPUMCRegisterInfo(X, AMDGPU::PC_REG, DwarfFlavour); + return X; +} + static MCSubtargetInfo * createAMDGPUMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) { if (TT.getArch() == Triple::r600) diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 9754d31fee600..b9cdbc6502e57 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -33,6 +33,10 @@ class Target; class Triple; class raw_pwrite_stream; +enum AMDGPUDwarfFlavour { Wave64 = 0, Wave32 = 1 }; + +MCRegisterInfo *createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour); + MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, MCContext &Ctx); diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index e9344dbcdf75f..73b7e7caaeae9 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -612,19 +612,26 @@ void SIFoldOperands::foldOperand( if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) { // Sanity check that this is a stack access. // FIXME: Should probably use stack pseudos before frame lowering. 
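The reworked getIntrinsicInstrCost above boils down to cost = LT.first * NElts * InstRate, where NElts is halved (rounding up) for f16 elements on subtargets with 16-bit instructions, since two half-precision lanes fit in one packed v2f16 operation, and InstRate is the half-rate cost for fma on fast-FMAF32 subtargets and the quarter-rate cost otherwise. A sketch of that formula with assumed rate constants (the real getHalfRateInstrCost/getQuarterRateInstrCost values are not shown in this patch):

// Assumed rates for illustration only.
unsigned packedIntrinsicCost(bool IsFMA, bool FastFMAF32, bool PackedF16,
                             unsigned NumElts, unsigned LegalizationFactor) {
  const unsigned HalfRate = 2, QuarterRate = 4;  // placeholder costs
  unsigned NElts = NumElts;
  if (PackedF16)                  // two f16 lanes per packed instruction
    NElts = (NElts + 1) / 2;
  unsigned InstRate = QuarterRate;
  if (IsFMA && FastFMAF32)
    InstRate = HalfRate;
  return LegalizationFactor * NElts * InstRate;
}
// e.g. a <4 x half> fma on a packed-f16, fast-FMAF32 subtarget:
//   packedIntrinsicCost(true, true, true, 4, 1) == 1 * 2 * 2 == 4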
- MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset); - if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() && - SOff->getReg() != MFI->getStackPtrOffsetReg())) - return; if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() != MFI->getScratchRSrcReg()) return; + // Ensure this is either relative to the current frame or the current wave. + MachineOperand &SOff = + *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset); + if ((!SOff.isReg() || SOff.getReg() != MFI->getStackPtrOffsetReg()) && + (!SOff.isImm() || SOff.getImm() != 0)) + return; + // A frame index will resolve to a positive constant, so it should always be // safe to fold the addressing mode, even pre-GFX9. UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex()); - SOff->setReg(MFI->getStackPtrOffsetReg()); + + // If this is relative to the current wave, update it to be relative to the + // current frame. + if (SOff.isImm()) + SOff.ChangeToRegister(MFI->getStackPtrOffsetReg(), false); return; } diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 60b35981de9c9..1f2517bec6db8 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -30,12 +30,6 @@ static ArrayRef getAllSGPR128(const GCNSubtarget &ST, ST.getMaxNumSGPRs(MF) / 4); } -static ArrayRef getAllSGPRs(const GCNSubtarget &ST, - const MachineFunction &MF) { - return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), - ST.getMaxNumSGPRs(MF)); -} - // Find a scratch register that we can use at the start of the prologue to // re-align the stack pointer. We avoid using callee-save registers since they // may appear to be free when this is called from canUseAsPrologue (during @@ -184,11 +178,13 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addMemOperand(MMO); } -void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, - MachineFunction &MF, - MachineBasicBlock &MBB) const { +// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()` +void SIFrameLowering::emitEntryFunctionFlatScratchInit( + MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + const DebugLoc &DL, Register ScratchWaveOffsetReg) const { + const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo* TRI = &TII->getRegisterInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); const SIMachineFunctionInfo *MFI = MF.getInfo(); // We don't need this if we only have spills since there is no user facing @@ -201,11 +197,6 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, // pointer. Because we only detect if flat instructions are used at all, // this will be used more often than necessary on VI. - // Debug location must be unknown since the first debug location is used to - // determine the end of the prologue. - DebugLoc DL; - MachineBasicBlock::iterator I = MBB.begin(); - Register FlatScratchInitReg = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT); @@ -216,8 +207,6 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0); Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1); - unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); - // Do a 64-bit pointer add. 
if (ST.flatScratchIsPointer()) { if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) { @@ -266,16 +255,20 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST, .addImm(8); } -unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( - const GCNSubtarget &ST, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, - MachineFunction &MF) const { +// Shift down registers reserved for the scratch RSRC. +Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg( + MachineFunction &MF, Register ScratchWaveOffsetReg) const { + + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *MFI = MF.getInfo(); + + assert(MFI->isEntryFunction()); + + Register ScratchRsrcReg = MFI->getScratchRSrcReg(); - // We need to insert initialization of the scratch resource descriptor. - unsigned ScratchRsrcReg = MFI->getScratchRSrcReg(); if (ScratchRsrcReg == AMDGPU::NoRegister || !MRI.isPhysRegUsed(ScratchRsrcReg)) return AMDGPU::NoRegister; @@ -293,8 +286,6 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( // cannot do this for the resources required for scratch access. For now we // skip over user SGPRs and may leave unused holes. - // We find the resource first because it has an alignment requirement. - unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4; ArrayRef AllSGPR128s = getAllSGPR128(ST, MF); AllSGPR128s = AllSGPR128s.slice(std::min(static_cast(AllSGPR128s.size()), NumPreloaded)); @@ -304,7 +295,14 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( for (MCPhysReg Reg : AllSGPR128s) { // Pick the first unallocated one. Make sure we don't clobber the other // reserved input we needed. - if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) { + // + // FIXME: The preloaded SGPR count is not accurate for shaders as the + // scratch wave offset may be in a fixed SGPR or + // SITargetLowering::allocateSystemSGPRs may choose some free SGPR for the + // scratch wave offset. We explicitly avoid the scratch wave offset to + // account for this. + if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) && + !TRI->isSubRegisterEq(Reg, ScratchWaveOffsetReg)) { MRI.replaceRegWith(ScratchRsrcReg, Reg); MFI->setScratchRSrcReg(Reg); return Reg; @@ -314,231 +312,113 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg( return ScratchRsrcReg; } -// Shift down registers reserved for the scratch wave offset. -std::pair -SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( - const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, MachineFunction &MF) const { - MachineRegisterInfo &MRI = MF.getRegInfo(); - unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); - - assert(MFI->isEntryFunction()); - - // No replacement necessary. 
- if (ScratchWaveOffsetReg == AMDGPU::NoRegister || - (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) { - return std::make_pair(AMDGPU::NoRegister, false); - } - - if (ST.hasSGPRInitBug()) - return std::make_pair(ScratchWaveOffsetReg, false); - - unsigned NumPreloaded = MFI->getNumPreloadedSGPRs(); - - ArrayRef AllSGPRs = getAllSGPRs(ST, MF); - if (NumPreloaded > AllSGPRs.size()) - return std::make_pair(ScratchWaveOffsetReg, false); - - AllSGPRs = AllSGPRs.slice(NumPreloaded); - - // We need to drop register from the end of the list that we cannot use - // for the scratch wave offset. - // + 2 s102 and s103 do not exist on VI. - // + 2 for vcc - // + 2 for xnack_mask - // + 2 for flat_scratch - // + 4 for registers reserved for scratch resource register - // + 1 for register reserved for scratch wave offset. (By exluding this - // register from the list to consider, it means that when this - // register is being used for the scratch wave offset and there - // are no other free SGPRs, then the value will stay in this register. - // + 1 if stack pointer is used. - // ---- - // 13 (+1) - unsigned ReservedRegCount = 13; - - if (AllSGPRs.size() < ReservedRegCount) - return std::make_pair(ScratchWaveOffsetReg, false); - - bool HandledScratchWaveOffsetReg = - ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF); - bool FPAdjusted = false; - - for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) { - // Pick the first unallocated SGPR. Be careful not to pick an alias of the - // scratch descriptor, since we haven’t added its uses yet. - if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) { - if (!HandledScratchWaveOffsetReg) { - HandledScratchWaveOffsetReg = true; - - MRI.replaceRegWith(ScratchWaveOffsetReg, Reg); - if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) { - assert(!hasFP(MF)); - MFI->setStackPtrOffsetReg(Reg); - } - - MFI->setScratchWaveOffsetReg(Reg); - MFI->setFrameOffsetReg(Reg); - ScratchWaveOffsetReg = Reg; - FPAdjusted = true; - break; - } - } - } - - return std::make_pair(ScratchWaveOffsetReg, FPAdjusted); -} - void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported"); - SIMachineFunctionInfo *MFI = MF.getInfo(); - - // If we only have SGPR spills, we won't actually be using scratch memory - // since these spill to VGPRs. - // - // FIXME: We should be cleaning up these unused SGPR spill frame indices - // somewhere. - - const GCNSubtarget &ST = MF.getSubtarget(); - const SIInstrInfo *TII = ST.getInstrInfo(); - const SIRegisterInfo *TRI = &TII->getRegisterInfo(); - MachineRegisterInfo &MRI = MF.getRegInfo(); - const Function &F = MF.getFunction(); - - // We need to do the replacement of the private segment buffer and wave offset - // register even if there are no stack objects. There could be stores to undef - // or a constant without an associated object. + // FIXME: If we only have SGPR spills, we won't actually be using scratch + // memory since these spill to VGPRs. We should be cleaning up these unused + // SGPR spill frame indices somewhere. // FIXME: We still have implicit uses on SGPR spill instructions in case they // need to spill to vector memory. It's likely that will not happen, but at // this point it appears we need the setup. This part of the prolog should be // emitted after frame indices are eliminated. 
- if (MFI->hasFlatScratchInit()) - emitFlatScratchInit(ST, MF, MBB); + // FIXME: Remove all of the isPhysRegUsed checks - unsigned ScratchRsrcReg - = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF); + SIMachineFunctionInfo *MFI = MF.getInfo(); + const GCNSubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function &F = MF.getFunction(); - unsigned ScratchWaveOffsetReg; - bool FPAdjusted; - std::tie(ScratchWaveOffsetReg, FPAdjusted) = - getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF); + assert(MFI->isEntryFunction()); - // We need to insert initialization of the scratch resource descriptor. - Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg( + Register ScratchWaveOffsetReg = MFI->getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - - unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister; - if (ST.isAmdHsaOrMesa(F)) { - PreloadedPrivateBufferReg = MFI->getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); - } - - bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister && - MRI.isPhysRegUsed(ScratchWaveOffsetReg); - bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister && - MRI.isPhysRegUsed(ScratchRsrcReg); - // FIXME: Hack to not crash in situations which emitted an error. - if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister) + if (ScratchWaveOffsetReg == AMDGPU::NoRegister) return; - // We added live-ins during argument lowering, but since they were not used - // they were deleted. We're adding the uses now, so add them back. - MRI.addLiveIn(PreloadedScratchWaveOffsetReg); - MBB.addLiveIn(PreloadedScratchWaveOffsetReg); - - if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) { - assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F)); - MRI.addLiveIn(PreloadedPrivateBufferReg); - MBB.addLiveIn(PreloadedPrivateBufferReg); + // We need to do the replacement of the private segment buffer register even + // if there are no stack objects. There could be stores to undef or a + // constant without an associated object. + // + // This will return `AMDGPU::NoRegister` in cases where there are no actual + // uses of the SRSRC. + Register ScratchRsrcReg = + getEntryFunctionReservedScratchRsrcReg(MF, ScratchWaveOffsetReg); + + // Make the selected register live throughout the function. + if (ScratchRsrcReg != AMDGPU::NoRegister) { + for (MachineBasicBlock &OtherBB : MF) { + if (&OtherBB != &MBB) { + OtherBB.addLiveIn(ScratchRsrcReg); + } + } } - // Make the register selected live throughout the function. - for (MachineBasicBlock &OtherBB : MF) { - if (&OtherBB == &MBB) - continue; - - if (OffsetRegUsed || FPAdjusted) - OtherBB.addLiveIn(ScratchWaveOffsetReg); - - if (ResourceRegUsed) - OtherBB.addLiveIn(ScratchRsrcReg); + // Now that we have fixed the reserved SRSRC we need to locate the + // (potentially) preloaded SRSRC. + Register PreloadedScratchRsrcReg = AMDGPU::NoRegister; + if (ST.isAmdHsaOrMesa(F)) { + PreloadedScratchRsrcReg = + MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); + if (ScratchRsrcReg != AMDGPU::NoRegister && + PreloadedScratchRsrcReg != AMDGPU::NoRegister) { + // We added live-ins during argument lowering, but since they were not + // used they were deleted. We're adding the uses now, so add them back. 
+ MRI.addLiveIn(PreloadedScratchRsrcReg); + MBB.addLiveIn(PreloadedScratchRsrcReg); + } } + // Debug location must be unknown since the first debug location is used to + // determine the end of the prologue. DebugLoc DL; MachineBasicBlock::iterator I = MBB.begin(); - // If we reserved the original input registers, we don't need to copy to the - // reserved registers. - - bool CopyBuffer = ResourceRegUsed && - PreloadedPrivateBufferReg != AMDGPU::NoRegister && - ST.isAmdHsaOrMesa(F) && - ScratchRsrcReg != PreloadedPrivateBufferReg; - - // This needs to be careful of the copying order to avoid overwriting one of - // the input registers before it's been copied to it's final - // destination. Usually the offset should be copied first. - bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg, - ScratchWaveOffsetReg); - if (CopyBuffer && CopyBufferFirst) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) - .addReg(PreloadedPrivateBufferReg, RegState::Kill); + if (MF.getFrameInfo().hasCalls()) { + Register SPReg = MFI->getStackPtrOffsetReg(); + assert(SPReg != AMDGPU::SP_REG); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg) + .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize()); } - unsigned SPReg = MFI->getStackPtrOffsetReg(); - assert(SPReg != AMDGPU::SP_REG); - - // FIXME: Remove the isPhysRegUsed checks - const bool HasFP = hasFP(MF); - - if (HasFP || OffsetRegUsed) { - assert(ScratchWaveOffsetReg); - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg) - .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0); + if (hasFP(MF)) { + Register FPReg = MFI->getFrameOffsetReg(); + assert(FPReg != AMDGPU::FP_REG); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); } - if (CopyBuffer && !CopyBufferFirst) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) - .addReg(PreloadedPrivateBufferReg, RegState::Kill); + if (MFI->hasFlatScratchInit() || ScratchRsrcReg != AMDGPU::NoRegister) { + MRI.addLiveIn(ScratchWaveOffsetReg); + MBB.addLiveIn(ScratchWaveOffsetReg); } - if (ResourceRegUsed) { - emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I, - PreloadedPrivateBufferReg, ScratchRsrcReg); + if (MFI->hasFlatScratchInit()) { + emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg); } - if (HasFP) { - DebugLoc DL; - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - int64_t StackSize = FrameInfo.getStackSize(); - - // On kernel entry, the private scratch wave offset is the SP value. - if (StackSize == 0) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg) - .addReg(MFI->getScratchWaveOffsetReg()); - } else { - BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg) - .addReg(MFI->getScratchWaveOffsetReg()) - .addImm(StackSize * ST.getWavefrontSize()); - } + if (ScratchRsrcReg != AMDGPU::NoRegister) { + emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL, + PreloadedScratchRsrcReg, + ScratchRsrcReg, ScratchWaveOffsetReg); } } -// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. 
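In the new entry-function prologue above, the stack pointer is seeded with MF.getFrameInfo().getStackSize() * ST.getWavefrontSize(): the frame size is a per-lane byte count, while the SP here is an unswizzled byte offset from the wave's scratch base, so the per-lane size has to be scaled by the number of lanes in a wave. A quick arithmetic sketch with assumed figures:

// Illustrative figures only.
unsigned PerLaneStackSize = 272;  // bytes of stack per work-item
unsigned WavefrontSize    = 64;   // wave64
unsigned InitialSP = PerLaneStackSize * WavefrontSize;  // 272 * 64 = 17408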
-void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, - MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, - MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, - unsigned ScratchRsrcReg) const { +// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg` +void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup( + MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + const DebugLoc &DL, Register PreloadedScratchRsrcReg, + Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const { + const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); const Function &Fn = MF.getFunction(); - DebugLoc DL; if (ST.isAmdPalOS()) { // The pointer to the GIT is formed from the offset passed in and either @@ -595,10 +475,8 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, .addImm(0) // dlc .addReg(ScratchRsrcReg, RegState::ImplicitDefine) .addMemOperand(MMO); - return; - } - if (ST.isMesaGfxShader(Fn) - || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) { + } else if (ST.isMesaGfxShader(Fn) || + (PreloadedScratchRsrcReg == AMDGPU::NoRegister)) { assert(!ST.isAmdHsaOrMesa(Fn)); const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32); @@ -658,7 +536,37 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST, BuildMI(MBB, I, DL, SMovB32, Rsrc3) .addImm(Rsrc23 >> 32) .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + } else if (ST.isAmdHsaOrMesa(Fn)) { + assert(PreloadedScratchRsrcReg != AMDGPU::NoRegister); + + if (ScratchRsrcReg != PreloadedScratchRsrcReg) { + BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg) + .addReg(PreloadedScratchRsrcReg, RegState::Kill); + } } + + // Add the scratch wave offset into the scratch RSRC. + // + // We only want to update the first 48 bits, which is the base address + // pointer, without touching the adjacent 16 bits of flags. We know this add + // cannot carry-out from bit 47, otherwise the scratch allocation would be + // impossible to fit in the 48-bit global address space. + // + // TODO: Evaluate if it is better to just construct an SRD using the flat + // scratch init and some constants rather than update the one we are passed. + Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0); + Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1); + + // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in + // the kernel body via inreg arguments. + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0) + .addReg(ScratchRsrcSub0) + .addReg(ScratchWaveOffsetReg) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); + BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1) + .addReg(ScratchRsrcSub1) + .addImm(0) + .addReg(ScratchRsrcReg, RegState::ImplicitDefine); } bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { @@ -1125,19 +1033,17 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr( bool SIFrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.hasCalls()) { + + // For entry functions we can use an immediate offset in most cases, so the + // presence of calls doesn't imply we need a distinct frame pointer. 
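The S_ADD_U32/S_ADDC_U32 pair that folds the scratch wave offset into the SRSRC above is effectively a 48-bit add performed on two 32-bit halves: the low word takes the offset, the high word absorbs the carry, and because the base can never overflow bit 47 the 16 flag bits in bits 48-63 stay untouched. A standalone model of that operation (the descriptor value and offset here are made up purely for the check):

#include <cstdint>
#include <cassert>

// Add a 32-bit offset into the 48-bit base of a 64-bit word whose top
// 16 bits hold flags, mirroring the S_ADD_U32/S_ADDC_U32 sequence.
uint64_t addToBase48(uint64_t Desc, uint32_t Offset) {
  uint32_t Lo = uint32_t(Desc);
  uint32_t Hi = uint32_t(Desc >> 32);
  uint32_t NewLo = Lo + Offset;
  uint32_t Carry = NewLo < Lo ? 1 : 0;  // carry out of the low 32 bits
  uint32_t NewHi = Hi + Carry;          // the S_ADDC_U32 with immediate 0
  return (uint64_t(NewHi) << 32) | NewLo;
}

int main() {
  uint64_t Desc = 0x80AB000000001000ull;        // flags 0x80AB in bits 48-63
  uint64_t R = addToBase48(Desc, 0xFFFFF000u);  // forces a carry into the high word
  assert(R == 0x80AB000100000000ull);           // flag bits unchanged
  return 0;
}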
+ if (MFI.hasCalls() && + !MF.getInfo()->isEntryFunction()) { // All offsets are unsigned, so need to be addressed in the same direction // as stack growth. // FIXME: This function is pretty broken, since it can be called before the // frame layout is determined or CSR spills are inserted. - if (MFI.getStackSize() != 0) - return true; - - // For the entry point, the input wave scratch offset must be copied to the - // API SP if there are calls. - if (MF.getInfo()->isEntryFunction()) - return true; + return MFI.getStackSize() != 0; } return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() || diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index 8dd09726f6d2d..7314057d5ac1b 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -55,26 +55,21 @@ class SIFrameLowering final : public AMDGPUFrameLowering { MachineBasicBlock::iterator MI) const override; private: - void emitFlatScratchInit(const GCNSubtarget &ST, - MachineFunction &MF, - MachineBasicBlock &MBB) const; - - unsigned getReservedPrivateSegmentBufferReg( - const GCNSubtarget &ST, - const SIInstrInfo *TII, - const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, - MachineFunction &MF) const; - - std::pair getReservedPrivateSegmentWaveByteOffsetReg( - const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI, - SIMachineFunctionInfo *MFI, MachineFunction &MF) const; - - // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set. - void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF, - MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI, - MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg, - unsigned ScratchRsrcReg) const; + void emitEntryFunctionFlatScratchInit(MachineFunction &MF, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + Register ScratchWaveOffsetReg) const; + + Register + getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF, + Register ScratchWaveOffsetReg) const; + + void emitEntryFunctionScratchRsrcRegSetup( + MachineFunction &MF, MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + Register PreloadedPrivateBufferReg, Register ScratchRsrcReg, + Register ScratchWaveOffsetReg) const; public: bool hasFP(const MachineFunction &MF) const override; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 34fe290ade617..f551c69ca840b 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1912,67 +1912,45 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, Info.setScratchRSrcReg(ReservedBufferReg); } - // hasFP should be accurate for kernels even before the frame is finalized. - if (ST.getFrameLowering()->hasFP(MF)) { - MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); - // Try to use s32 as the SP, but move it if it would interfere with input - // arguments. This won't work with calls though. - // - // FIXME: Move SP to avoid any possible inputs, or find a way to spill input - // registers. - if (!MRI.isLiveIn(AMDGPU::SGPR32)) { - Info.setStackPtrOffsetReg(AMDGPU::SGPR32); - } else { - assert(AMDGPU::isShader(MF.getFunction().getCallingConv())); + // For entry functions we have to set up the stack pointer if we use it, + // whereas non-entry functions get this "for free". 
This means there is no + // intrinsic advantage to using S32 over S34 in cases where we do not have + // calls but do need a frame pointer (i.e. if we are requested to have one + // because frame pointer elimination is disabled). To keep things simple we + // only ever use S32 as the call ABI stack pointer, and so using it does not + // imply we need a separate frame pointer. + // + // Try to use s32 as the SP, but move it if it would interfere with input + // arguments. This won't work with calls though. + // + // FIXME: Move SP to avoid any possible inputs, or find a way to spill input + // registers. + if (!MRI.isLiveIn(AMDGPU::SGPR32)) { + Info.setStackPtrOffsetReg(AMDGPU::SGPR32); + } else { + assert(AMDGPU::isShader(MF.getFunction().getCallingConv())); - if (MFI.hasCalls()) - report_fatal_error("call in graphics shader with too many input SGPRs"); + if (MFI.hasCalls()) + report_fatal_error("call in graphics shader with too many input SGPRs"); - for (unsigned Reg : AMDGPU::SGPR_32RegClass) { - if (!MRI.isLiveIn(Reg)) { - Info.setStackPtrOffsetReg(Reg); - break; - } + for (unsigned Reg : AMDGPU::SGPR_32RegClass) { + if (!MRI.isLiveIn(Reg)) { + Info.setStackPtrOffsetReg(Reg); + break; } - - if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG) - report_fatal_error("failed to find register for SP"); } - if (MFI.hasCalls()) { - Info.setScratchWaveOffsetReg(AMDGPU::SGPR33); - Info.setFrameOffsetReg(AMDGPU::SGPR33); - } else { - unsigned ReservedOffsetReg = - TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); - Info.setScratchWaveOffsetReg(ReservedOffsetReg); - Info.setFrameOffsetReg(ReservedOffsetReg); - } - } else if (RequiresStackAccess) { - assert(!MFI.hasCalls()); - // We know there are accesses and they will be done relative to SP, so just - // pin it to the input. - // - // FIXME: Should not do this if inline asm is reading/writing these - // registers. - Register PreloadedSP = Info.getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - - Info.setStackPtrOffsetReg(PreloadedSP); - Info.setScratchWaveOffsetReg(PreloadedSP); - Info.setFrameOffsetReg(PreloadedSP); - } else { - assert(!MFI.hasCalls()); + if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG) + report_fatal_error("failed to find register for SP"); + } - // There may not be stack access at all. There may still be spills, or - // access of a constant pointer (in which cases an extra copy will be - // emitted in the prolog). - unsigned ReservedOffsetReg - = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); - Info.setStackPtrOffsetReg(ReservedOffsetReg); - Info.setScratchWaveOffsetReg(ReservedOffsetReg); - Info.setFrameOffsetReg(ReservedOffsetReg); + // hasFP should be accurate for entry functions even before the frame is + // finalized, because it does not rely on the known stack size, only + // properties like whether variable sized objects are present. 
+ if (ST.getFrameLowering()->hasFP(MF)) { + Info.setFrameOffsetReg(AMDGPU::SGPR33); } } @@ -2231,8 +2209,6 @@ SDValue SITargetLowering::LowerFormalArguments( allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); - CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); - CCInfo.AllocateReg(Info->getFrameOffsetReg()); allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } @@ -6042,6 +6018,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(1, SL, MVT::i32)); return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ); } + case Intrinsic::amdgcn_alignbit: + return DAG.getNode(ISD::FSHR, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -10653,11 +10632,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { if (Info->getFrameOffsetReg() != AMDGPU::FP_REG) MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); - if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) { - MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, - Info->getScratchWaveOffsetReg()); - } - Info->limitOccupancy(MF); if (ST.isWave32() && !MF.empty()) { diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 49849576dc26c..e80061bb40cd3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1489,7 +1489,6 @@ def : UMad24Pat; // FIXME: This should only be done for VALU inputs defm : BFIPatterns ; -def : FSHRPattern ; def : ROTRPattern ; def : GCNPat<(i32 (trunc (srl i64:$src0, (and i32:$src1, (i32 31))))), diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 79b74c5ede2db..f17608ad972a7 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -79,16 +79,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) // Non-entry functions have no special inputs for now, other registers // required for scratch access. ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3; - ScratchWaveOffsetReg = AMDGPU::SGPR33; // TODO: Pick a high register, and shift down, similar to a kernel. 
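With the change above, llvm.amdgcn.alignbit is lowered to the generic ISD::FSHR node, and the standalone FSHRPattern for V_ALIGNBIT_B32 in SIInstructions.td is dropped in the same change. A 32-bit funnel shift right takes the low 32 bits of the 64-bit concatenation of its first two operands shifted right by the amount modulo 32; a small reference model:

#include <cstdint>
#include <cassert>

// Reference semantics of a 32-bit funnel shift right, i.e. what
// ISD::FSHR (and v_alignbit_b32) computes.
uint32_t fshr32(uint32_t Hi, uint32_t Lo, uint32_t Amt) {
  uint64_t Concat = (uint64_t(Hi) << 32) | Lo;
  return uint32_t(Concat >> (Amt & 31));
}

int main() {
  assert(fshr32(0x12345678u, 0x9ABCDEF0u, 8) == 0x789ABCDEu);
  assert(fshr32(0xAAAAAAAAu, 0x55555555u, 0) == 0x55555555u);
  return 0;
}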
- FrameOffsetReg = AMDGPU::SGPR34; + FrameOffsetReg = AMDGPU::SGPR33; StackPtrOffsetReg = AMDGPU::SGPR32; ArgInfo.PrivateSegmentBuffer = ArgDescriptor::createRegister(ScratchRSrcReg); - ArgInfo.PrivateSegmentWaveByteOffset = - ArgDescriptor::createRegister(ScratchWaveOffsetReg); if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) ImplicitArgPtr = true; @@ -212,7 +209,7 @@ void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) { MF.getFunction())); } -unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( +Register SIMachineFunctionInfo::addPrivateSegmentBuffer( const SIRegisterInfo &TRI) { ArgInfo.PrivateSegmentBuffer = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( @@ -221,21 +218,21 @@ unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( return ArgInfo.PrivateSegmentBuffer.getRegister(); } -unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { +Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) { ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; return ArgInfo.DispatchPtr.getRegister(); } -unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { +Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) { ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; return ArgInfo.QueuePtr.getRegister(); } -unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { +Register SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) { ArgInfo.KernargSegmentPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); @@ -243,21 +240,21 @@ unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) return ArgInfo.KernargSegmentPtr.getRegister(); } -unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) { +Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) { ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; return ArgInfo.DispatchID.getRegister(); } -unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { +Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) { ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; return ArgInfo.FlatScratchInit.getRegister(); } -unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { +Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) { ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg( getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass)); NumUserSGPRs += 2; @@ -310,7 +307,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF, // Make sure to handle the case where a wide SGPR spill may span between two // VGPRs. 
for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) { - unsigned LaneVGPR; + Register LaneVGPR; unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize); if (VGPRIndex == 0) { @@ -442,7 +439,7 @@ MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const { return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs; } -static yaml::StringValue regToString(unsigned Reg, +static yaml::StringValue regToString(Register Reg, const TargetRegisterInfo &TRI) { yaml::StringValue Dest; { @@ -515,7 +512,6 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo( WaveLimiter(MFI.needsWaveLimiter()), HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()), ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)), - ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)), FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)), StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)), ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 885e83aeb5a3c..c6ccad800ccf3 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -284,7 +284,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo { uint32_t HighBitsOf32BitAddress = 0; StringValue ScratchRSrcReg = "$private_rsrc_reg"; - StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg"; StringValue FrameOffsetReg = "$fp_reg"; StringValue StackPtrOffsetReg = "$sp_reg"; @@ -311,8 +310,6 @@ template <> struct MappingTraits { YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false); YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg, StringValue("$private_rsrc_reg")); - YamlIO.mapOptional("scratchWaveOffsetReg", MFI.ScratchWaveOffsetReg, - StringValue("$scratch_wave_offset_reg")); YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg, StringValue("$fp_reg")); YamlIO.mapOptional("stackPtrOffsetReg", MFI.StackPtrOffsetReg, @@ -331,20 +328,20 @@ template <> struct MappingTraits { class SIMachineFunctionInfo final : public AMDGPUMachineFunction { friend class GCNTargetMachine; - unsigned TIDReg = AMDGPU::NoRegister; + Register TIDReg = AMDGPU::NoRegister; // Registers that may be reserved for spilling purposes. These may be the same // as the input registers. - unsigned ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG; - unsigned ScratchWaveOffsetReg = AMDGPU::SCRATCH_WAVE_OFFSET_REG; + Register ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG; - // This is the current function's incremented size from the kernel's scratch - // wave offset register. For an entry function, this is exactly the same as - // the ScratchWaveOffsetReg. - unsigned FrameOffsetReg = AMDGPU::FP_REG; + // This is the the unswizzled offset from the current dispatch's scratch wave + // base to the beginning of the current function's frame. + Register FrameOffsetReg = AMDGPU::FP_REG; - // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg. - unsigned StackPtrOffsetReg = AMDGPU::SP_REG; + // This is an ABI register used in the non-entry calling convention to + // communicate the unswizzled offset from the current dispatch's scratch wave + // base to the beginning of the new function's frame. 
+ Register StackPtrOffsetReg = AMDGPU::SP_REG; AMDGPUFunctionArgInfo ArgInfo; @@ -437,11 +434,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { public: struct SpilledReg { - unsigned VGPR = 0; + Register VGPR; int Lane = -1; SpilledReg() = default; - SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) {} + SpilledReg(Register R, int L) : VGPR (R), Lane (L) {} bool hasLane() { return Lane != -1;} bool hasReg() { return VGPR != 0;} @@ -449,13 +446,13 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { struct SGPRSpillVGPRCSR { // VGPR used for SGPR spills - unsigned VGPR; + Register VGPR; // If the VGPR is a CSR, the stack slot used to save/restore it in the // prolog/epilog. Optional FI; - SGPRSpillVGPRCSR(unsigned V, Optional F) : VGPR(V), FI(F) {} + SGPRSpillVGPRCSR(Register V, Optional F) : VGPR(V), FI(F) {} }; struct VGPRSpillToAGPR { @@ -465,12 +462,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { SparseBitVector<> WWMReservedRegs; - void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); } + void ReserveWWMRegister(Register Reg) { WWMReservedRegs.set(Reg); } private: - // SGPR->VGPR spilling support. - using SpillRegMask = std::pair; - // Track VGPR + wave index for each subregister of the SGPR spilled to // frameindex key. DenseMap> SGPRToVGPRSpills; @@ -488,7 +482,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { public: // FIXME /// If this is set, an SGPR used for save/restore of the register used for the /// frame pointer. - unsigned SGPRForFPSaveRestoreCopy = 0; + Register SGPRForFPSaveRestoreCopy; Optional FramePointerSaveIndex; public: @@ -527,8 +521,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { void removeDeadFrameIndices(MachineFrameInfo &MFI); bool hasCalculatedTID() const { return TIDReg != 0; }; - unsigned getTIDReg() const { return TIDReg; }; - void setTIDReg(unsigned Reg) { TIDReg = Reg; } + Register getTIDReg() const { return TIDReg; }; + void setTIDReg(Register Reg) { TIDReg = Reg; } unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; @@ -539,34 +533,34 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { } // Add user SGPRs. - unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI); - unsigned addDispatchPtr(const SIRegisterInfo &TRI); - unsigned addQueuePtr(const SIRegisterInfo &TRI); - unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI); - unsigned addDispatchID(const SIRegisterInfo &TRI); - unsigned addFlatScratchInit(const SIRegisterInfo &TRI); - unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI); + Register addPrivateSegmentBuffer(const SIRegisterInfo &TRI); + Register addDispatchPtr(const SIRegisterInfo &TRI); + Register addQueuePtr(const SIRegisterInfo &TRI); + Register addKernargSegmentPtr(const SIRegisterInfo &TRI); + Register addDispatchID(const SIRegisterInfo &TRI); + Register addFlatScratchInit(const SIRegisterInfo &TRI); + Register addImplicitBufferPtr(const SIRegisterInfo &TRI); // Add system SGPRs. 
- unsigned addWorkGroupIDX() { + Register addWorkGroupIDX() { ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDX.getRegister(); } - unsigned addWorkGroupIDY() { + Register addWorkGroupIDY() { ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDY.getRegister(); } - unsigned addWorkGroupIDZ() { + Register addWorkGroupIDZ() { ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDZ.getRegister(); } - unsigned addWorkGroupInfo() { + Register addWorkGroupInfo() { ArgInfo.WorkGroupInfo = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; return ArgInfo.WorkGroupInfo.getRegister(); @@ -585,14 +579,14 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { ArgInfo.WorkItemIDZ = Arg; } - unsigned addPrivateSegmentWaveByteOffset() { + Register addPrivateSegmentWaveByteOffset() { ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(getNextSystemSGPR()); NumSystemSGPRs += 1; return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } - void setPrivateSegmentWaveByteOffset(unsigned Reg) { + void setPrivateSegmentWaveByteOffset(Register Reg) { ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg); } @@ -698,35 +692,31 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { return NumUserSGPRs + NumSystemSGPRs; } - unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const { + Register getPrivateSegmentWaveByteOffsetSystemSGPR() const { return ArgInfo.PrivateSegmentWaveByteOffset.getRegister(); } /// Returns the physical register reserved for use as the resource /// descriptor for scratch accesses. - unsigned getScratchRSrcReg() const { + Register getScratchRSrcReg() const { return ScratchRSrcReg; } - void setScratchRSrcReg(unsigned Reg) { + void setScratchRSrcReg(Register Reg) { assert(Reg != 0 && "Should never be unset"); ScratchRSrcReg = Reg; } - unsigned getScratchWaveOffsetReg() const { - return ScratchWaveOffsetReg; - } - - unsigned getFrameOffsetReg() const { + Register getFrameOffsetReg() const { return FrameOffsetReg; } - void setFrameOffsetReg(unsigned Reg) { + void setFrameOffsetReg(Register Reg) { assert(Reg != 0 && "Should never be unset"); FrameOffsetReg = Reg; } - void setStackPtrOffsetReg(unsigned Reg) { + void setStackPtrOffsetReg(Register Reg) { assert(Reg != 0 && "Should never be unset"); StackPtrOffsetReg = Reg; } @@ -735,20 +725,15 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { // NoRegister. This is mostly a workaround for MIR tests where state that // can't be directly computed from the function is not preserved in serialized // MIR. - unsigned getStackPtrOffsetReg() const { + Register getStackPtrOffsetReg() const { return StackPtrOffsetReg; } - void setScratchWaveOffsetReg(unsigned Reg) { - assert(Reg != 0 && "Should never be unset"); - ScratchWaveOffsetReg = Reg; - } - - unsigned getQueuePtrUserSGPR() const { + Register getQueuePtrUserSGPR() const { return ArgInfo.QueuePtr.getRegister(); } - unsigned getImplicitBufferPtrUserSGPR() const { + Register getImplicitBufferPtrUserSGPR() const { return ArgInfo.ImplicitBufferPtr.getRegister(); } @@ -861,7 +846,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { } /// \returns SGPR used for \p Dim's work group ID. 
- unsigned getWorkGroupIDSGPR(unsigned Dim) const { + Register getWorkGroupIDSGPR(unsigned Dim) const { switch (Dim) { case 0: assert(hasWorkGroupIDX()); diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 0b12e649e3e2b..93fde905beee9 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -38,12 +38,9 @@ static cl::opt EnableSpillSGPRToVGPR( cl::ReallyHidden, cl::init(true)); -SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) : - AMDGPUGenRegisterInfo(0), - ST(ST), - SpillSGPRToVGPR(EnableSpillSGPRToVGPR), - isWave32(ST.isWave32()) { -} +SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) + : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST), + SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {} void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const { @@ -91,6 +88,13 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const { const SIFrameLowering *TFI = MF.getSubtarget().getFrameLowering(); const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); + // During ISel lowering we always reserve the stack pointer in entry + // functions, but never actually want to reference it when accessing our own + // frame. If we need a frame pointer we use it, but otherwise we can just use + // an immediate "0" which we represent by returning NoRegister. + if (FuncInfo->isEntryFunction()) { + return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register(); + } return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : FuncInfo->getStackPtrOffsetReg(); } @@ -177,29 +181,6 @@ unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg( return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass); } -static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) { - unsigned Reg; - - // Try to place it in a hole after PrivateSegmentBufferReg. - if (RegCount & 3) { - // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to - // alignment constraints, so we have a hole where can put the wave offset. - Reg = RegCount - 1; - } else { - // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the - // wave offset before it. - Reg = RegCount - 5; - } - - return Reg; -} - -unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg( - const MachineFunction &MF) const { - unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF)); - return AMDGPU::SGPR_32RegClass.getRegister(Reg); -} - BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); @@ -279,19 +260,12 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { const SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg(); - if (ScratchWaveOffsetReg != AMDGPU::NoRegister) { - // Reserve 1 SGPR for scratch wave offset in case we need to spill. - reserveRegisterTuples(Reserved, ScratchWaveOffsetReg); - } - unsigned ScratchRSrcReg = MFI->getScratchRSrcReg(); if (ScratchRSrcReg != AMDGPU::NoRegister) { // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need // to spill. // TODO: May need to reserve a VGPR if doing LDS spilling. 
reserveRegisterTuples(Reserved, ScratchRSrcReg); - assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } // We have to assume the SP is needed in case there are calls in the function, @@ -722,6 +696,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false); if (SOffset == AMDGPU::NoRegister) { + if (ScratchOffsetReg == AMDGPU::NoRegister) { + report_fatal_error("could not scavenge SGPR to spill in entry function"); + } // There are no free SGPRs, and since we are in the process of spilling // VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true // on SI/CI and on VI it is true until we implement spilling using scalar @@ -735,9 +712,14 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, Scavenged = true; } - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) - .addReg(ScratchOffsetReg) - .addImm(Offset); + if (ScratchOffsetReg == AMDGPU::NoRegister) { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset) + .addImm(Offset); + } else { + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset) + .addReg(ScratchOffsetReg) + .addImm(Offset); + } Offset = 0; } @@ -772,16 +754,21 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, EltSize, MinAlign(Align, EltSize * i)); MIB = BuildMI(*MBB, MI, DL, Desc) - .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill)) - .addReg(ScratchRsrcReg) - .addReg(SOffset, SOffsetRegState) - .addImm(Offset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .addImm(0) // dlc - .addImm(0) // swz - .addMemOperand(NewMMO); + .addReg(SubReg, + getDefRegState(!IsStore) | getKillRegState(IsKill)) + .addReg(ScratchRsrcReg); + if (SOffset == AMDGPU::NoRegister) { + MIB.addImm(0); + } else { + MIB.addReg(SOffset, SOffsetRegState); + } + MIB.addImm(Offset) + .addImm(0) // glc + .addImm(0) // slc + .addImm(0) // tfe + .addImm(0) // dlc + .addImm(0) // swz + .addMemOperand(NewMMO); if (!IsStore && TmpReg != AMDGPU::NoRegister) MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), @@ -825,8 +812,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, MachineFrameInfo &FrameInfo = MF->getFrameInfo(); assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() && - SuperReg != MFI->getFrameOffsetReg() && - SuperReg != MFI->getScratchWaveOffsetReg())); + SuperReg != MFI->getFrameOffsetReg())); assert(SuperReg != AMDGPU::M0 && "m0 should never spill"); @@ -1135,42 +1121,30 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, bool IsMUBUF = TII->isMUBUF(*MI); if (!IsMUBUF && !MFI->isEntryFunction()) { - // Convert to an absolute stack address by finding the offset from the - // scratch wave base and scaling by the wave size. + // Convert to a swizzled stack address by scaling by the wave size. // - // In an entry function/kernel the offset is already the absolute - // address relative to the frame register. - - Register TmpDiffReg = - RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); - - // If there's no free SGPR, in-place modify the FP - Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg; + // In an entry function/kernel the offset is already swizzled. bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32; - Register ResultReg = IsCopy ? 
- MI->getOperand(0).getReg() : - RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); - - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg) - .addReg(FrameReg) - .addReg(MFI->getScratchWaveOffsetReg()); + Register ResultReg = + IsCopy ? MI->getOperand(0).getReg() + : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); int64_t Offset = FrameInfo.getObjectOffset(Index); if (Offset == 0) { // XXX - This never happens because of emergency scavenging slot at 0? BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg) .addImm(ST.getWavefrontSizeLog2()) - .addReg(DiffReg); + .addReg(FrameReg); } else { if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) { - Register ScaledReg = - RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0); + // Reuse ResultReg in intermediate step. + Register ScaledReg = ResultReg; BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg) .addImm(ST.getWavefrontSizeLog2()) - .addReg(DiffReg, RegState::Kill); + .addReg(FrameReg); const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32; @@ -1207,10 +1181,10 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, // unavailable. Only one additional mov is needed. Register TmpScaledReg = RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false); - Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg; + Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg; BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg) - .addReg(DiffReg, RegState::Kill) + .addReg(FrameReg) .addImm(ST.getWavefrontSizeLog2()); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg) .addReg(ScaledReg, RegState::Kill) @@ -1224,19 +1198,12 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, .addReg(ScaledReg, RegState::Kill) .addImm(Offset); BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg) - .addReg(DiffReg, RegState::Kill) + .addReg(FrameReg) .addImm(ST.getWavefrontSizeLog2()); } } } - if (!TmpDiffReg.isValid()) { - // Restore the FP. - BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg) - .addReg(FrameReg) - .addReg(MFI->getScratchWaveOffsetReg()); - } - // Don't introduce an extra copy if we're just materializing in a mov. if (IsCopy) MI->eraseFromParent(); @@ -1251,10 +1218,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr)); - assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() == - MFI->getStackPtrOffsetReg()); - - TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg); + auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset); + assert((SOffset.isReg() && + SOffset.getReg() == MFI->getStackPtrOffsetReg()) || + (SOffset.isImm() && SOffset.getImm() == 0)); + if (SOffset.isReg()) { + if (FrameReg == AMDGPU::NoRegister) { + SOffset.ChangeToImmediate(0); + } else { + SOffset.setReg(FrameReg); + } + } int64_t Offset = FrameInfo.getObjectOffset(Index); int64_t OldImm diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 84261bef6b8ee..a13f6dc4c0e7f 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -50,11 +50,6 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo { /// spilling is needed. 
unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const; - /// Return the end register initially reserved for the scratch wave offset in - /// case spilling is needed. - unsigned reservedPrivateSegmentWaveByteOffsetReg( - const MachineFunction &MF) const; - BitVector getReservedRegs(const MachineFunction &MF) const override; const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index b39b2b4542d08..987a93040ec32 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -101,10 +101,9 @@ def VCC_HI : SIReg<"vcc_hi", 107>; def PRIVATE_RSRC_REG : SIReg<"private_rsrc", 0>; def FP_REG : SIReg<"fp", 0>; def SP_REG : SIReg<"sp", 0>; -def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>; // Pseudo-register to represent the program-counter DWARF register. -def PC_REG : SIReg<"pc", 0>, DwarfRegNum<[16]> { +def PC_REG : SIReg<"pc", 0>, DwarfRegNum<[16, 0]> { // There is no physical register corresponding to a "program counter", but // we need to encode the concept in debug information in order to represent // things like the return value in unwind information. @@ -118,10 +117,10 @@ def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> { let HWEncoding = 106; } -def EXEC_LO : SIReg<"exec_lo", 126>, DwarfRegNum<[1]>; +def EXEC_LO : SIReg<"exec_lo", 126>, DwarfRegNum<[1, 1]>; def EXEC_HI : SIReg<"exec_hi", 127>; -def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, DwarfRegNum<[17]> { +def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, DwarfRegNum<[17, 1]> { let Namespace = "AMDGPU"; let SubRegIndices = [sub0, sub1]; let HWEncoding = 126; @@ -212,23 +211,29 @@ def FLAT_SCR : FlatReg; foreach Index = 0-105 in { def SGPR#Index : SIReg <"s"#Index, Index>, - DwarfRegNum<[!if(!le(Index, 63), !add(Index, 32), !add(Index, 1024))]>; + DwarfRegNum<[!if(!le(Index, 63), !add(Index, 32), !add(Index, 1024)), + !if(!le(Index, 63), !add(Index, 32), !add(Index, 1024))]>; } // VGPR registers foreach Index = 0-255 in { + // Set a cost value for vgprs other than the argument registers (v0-v31). + // The ratio of index/allocation_granularity is taken as the cost value. + // Considered the allocation granularity as 4 here. 
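The cost formula in the let CostPerUse line that follows can be restated in plain C++. A small sketch, illustrative only and assuming the granularity of 4 mentioned in the comment:

// Cost implied by !if(!gt(Index, 31), !srl(Index, 2), 0): zero for the
// argument registers v0-v31, Index divided by the granularity (4) above that.
static unsigned vgprCostPerUse(unsigned Index) {
  return Index > 31 ? Index >> 2 : 0;
}
// vgprCostPerUse(31) == 0, vgprCostPerUse(32) == 8, vgprCostPerUse(255) == 63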
+ let CostPerUse=!if(!gt(Index, 31), !srl(Index, 2), 0) in { def VGPR#Index : SIReg <"v"#Index, Index>, - DwarfRegNum<[!add(Index, 2560)]> { + DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]> { let HWEncoding{8} = 1; } + } } // AccVGPR registers foreach Index = 0-255 in { def AGPR#Index : SIReg <"a"#Index, Index>, - DwarfRegNum<[!add(Index, 3072)]> { + DwarfRegNum<[!add(Index, 3072), !add(Index, 2048)]> { let HWEncoding{8} = 1; } } @@ -435,7 +440,7 @@ def AGPR_1024 : SIRegisterTuples.ret, AGPR_32, 255, 1, 32, "a">; //===----------------------------------------------------------------------===// def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, - (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> { + (add FP_REG, SP_REG)> { let isAllocatable = 0; let CopyCost = -1; } @@ -573,7 +578,7 @@ def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32, let AllocationPriority = 16; } -def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> { +def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64], 32, (add SGPR_256Regs)> { let AllocationPriority = 17; } @@ -581,7 +586,7 @@ def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> { let isAllocatable = 0; } -def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, +def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64], 32, (add SGPR_256, TTMP_256)> { // Requires 4 s_mov_b64 to copy let CopyCost = 4; diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 0e8e3f944f3dd..ff778b6e5adc2 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -808,7 +808,9 @@ foreach vt = SReg_128.RegTypes in { defm : SMRD_Pattern <"S_LOAD_DWORDX4", vt>; } -defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>; +foreach vt = SReg_256.RegTypes in { +defm : SMRD_Pattern <"S_LOAD_DWORDX8", vt>; +} defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>; defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 30345f82fae6e..ba540ea848fc0 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -343,7 +343,7 @@ def V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile, def V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile, AMDGPUbfe_u32>; def V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile, AMDGPUbfe_i32>; def V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile, AMDGPUbfi>; -def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile, int_amdgcn_alignbit>; +def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile, fshr>; def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile, int_amdgcn_alignbyte>; def V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile, AMDGPUfmin3>; def V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile, AMDGPUsmin3>; diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp index 6be6f5da6379a..b334f4156559a 100644 --- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -277,6 +277,15 @@ class ARMDAGToDAGISel : public SelectionDAGISel { void SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes, bool Wrapping, bool Predicated); + /// Select SelectCDE_CXxD - Select CDE dual-GPR instruction (one of CX1D, + /// CX1DA, CX2D, CX2DA, CX3, CX3DA). 
+ /// \arg \c NumExtraOps number of extra operands besides the coprocessor, + /// the accumulator and the immediate operand, i.e. 0 + /// for CX1*, 1 for CX2*, 2 for CX3* + /// \arg \c HasAccum whether the instruction has an accumulator operand + void SelectCDE_CXxD(SDNode *N, uint16_t Opcode, size_t NumExtraOps, + bool HasAccum); + /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs /// should be 1, 2, 3 or 4. The opcode array specifies the instructions used /// for loading D registers. @@ -2809,6 +2818,69 @@ void ARMDAGToDAGISel::SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes, CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops)); } +void ARMDAGToDAGISel::SelectCDE_CXxD(SDNode *N, uint16_t Opcode, + size_t NumExtraOps, bool HasAccum) { + bool IsBigEndian = CurDAG->getDataLayout().isBigEndian(); + SDLoc Loc(N); + SmallVector Ops; + + unsigned OpIdx = 1; + + // Convert and append the immediate operand designating the coprocessor. + SDValue ImmCoproc = N->getOperand(OpIdx++); + uint32_t ImmCoprocVal = cast(ImmCoproc)->getZExtValue(); + Ops.push_back(getI32Imm(ImmCoprocVal, Loc)); + + // For accumulating variants copy the low and high order parts of the + // accumulator into a register pair and add it to the operand vector. + if (HasAccum) { + SDValue AccLo = N->getOperand(OpIdx++); + SDValue AccHi = N->getOperand(OpIdx++); + if (IsBigEndian) + std::swap(AccLo, AccHi); + Ops.push_back(SDValue(createGPRPairNode(MVT::Untyped, AccLo, AccHi), 0)); + } + + // Copy extra operands as-is. + for (size_t I = 0; I < NumExtraOps; I++) + Ops.push_back(N->getOperand(OpIdx++)); + + // Convert and append the immediate operand + SDValue Imm = N->getOperand(OpIdx); + uint32_t ImmVal = cast(Imm)->getZExtValue(); + Ops.push_back(getI32Imm(ImmVal, Loc)); + + // Accumulating variants are IT-predicable, add predicate operands. + if (HasAccum) { + SDValue Pred = getAL(CurDAG, Loc); + SDValue PredReg = CurDAG->getRegister(0, MVT::i32); + Ops.push_back(Pred); + Ops.push_back(PredReg); + } + + // Create the CDE instruction + SDNode *InstrNode = CurDAG->getMachineNode(Opcode, Loc, MVT::Untyped, Ops); + SDValue ResultPair = SDValue(InstrNode, 0); + + // The original intrinsic had two outputs, and the output of the dual-register + // CDE instruction is a register pair. We need to extract the two subregisters + // and replace all uses of the original outputs with the extracted + // subregisters.
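As a rough illustration of the pair splitting described above (not part of the patch, and simplified to plain integers): the machine instruction defines one untyped register pair, and the two i32 results of the intrinsic come from its gsub_0/gsub_1 subregisters, swapped when the target is big-endian.

#include <cstdint>
#include <utility>

// Illustrative only: model the register pair as a 64-bit value and split it
// into the two 32-bit results, swapping halves for big-endian targets.
static std::pair<uint32_t, uint32_t> splitDualResult(uint64_t Pair,
                                                     bool IsBigEndian) {
  uint32_t Sub0 = static_cast<uint32_t>(Pair);       // gsub_0
  uint32_t Sub1 = static_cast<uint32_t>(Pair >> 32); // gsub_1
  if (IsBigEndian)
    std::swap(Sub0, Sub1);
  return {Sub0, Sub1}; // {result 0, result 1} of the original intrinsic
}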
+ uint16_t SubRegs[2] = {ARM::gsub_0, ARM::gsub_1}; + if (IsBigEndian) + std::swap(SubRegs[0], SubRegs[1]); + + for (size_t ResIdx = 0; ResIdx < 2; ResIdx++) { + if (SDValue(N, ResIdx).use_empty()) + continue; + SDValue SubReg = CurDAG->getTargetExtractSubreg(SubRegs[ResIdx], Loc, + MVT::i32, ResultPair); + ReplaceUses(SDValue(N, ResIdx), SubReg); + } + + CurDAG->RemoveDeadNode(N); +} + void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic, bool isUpdating, unsigned NumVecs, const uint16_t *DOpcodes, @@ -4773,6 +4845,40 @@ void ARMDAGToDAGISel::Select(SDNode *N) { IntNo == Intrinsic::arm_mve_vdwdup_predicated); return; } + + case Intrinsic::arm_cde_cx1d: + case Intrinsic::arm_cde_cx1da: + case Intrinsic::arm_cde_cx2d: + case Intrinsic::arm_cde_cx2da: + case Intrinsic::arm_cde_cx3d: + case Intrinsic::arm_cde_cx3da: { + bool HasAccum = IntNo == Intrinsic::arm_cde_cx1da || + IntNo == Intrinsic::arm_cde_cx2da || + IntNo == Intrinsic::arm_cde_cx3da; + size_t NumExtraOps; + uint16_t Opcode; + switch (IntNo) { + case Intrinsic::arm_cde_cx1d: + case Intrinsic::arm_cde_cx1da: + NumExtraOps = 0; + Opcode = HasAccum ? ARM::CDE_CX1DA : ARM::CDE_CX1D; + break; + case Intrinsic::arm_cde_cx2d: + case Intrinsic::arm_cde_cx2da: + NumExtraOps = 1; + Opcode = HasAccum ? ARM::CDE_CX2DA : ARM::CDE_CX2D; + break; + case Intrinsic::arm_cde_cx3d: + case Intrinsic::arm_cde_cx3da: + NumExtraOps = 2; + Opcode = HasAccum ? ARM::CDE_CX3DA : ARM::CDE_CX3D; + break; + default: + llvm_unreachable("Unexpected opcode"); + } + SelectCDE_CXxD(N, Opcode, NumExtraOps, HasAccum); + return; + } } break; } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 2e6bcb550999d..03cff21104791 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -1669,6 +1669,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VADDLVu: return "ARMISD::VADDLVu"; case ARMISD::VADDLVAs: return "ARMISD::VADDLVAs"; case ARMISD::VADDLVAu: return "ARMISD::VADDLVAu"; + case ARMISD::VADDLVps: return "ARMISD::VADDLVps"; + case ARMISD::VADDLVpu: return "ARMISD::VADDLVpu"; + case ARMISD::VADDLVAps: return "ARMISD::VADDLVAps"; + case ARMISD::VADDLVApu: return "ARMISD::VADDLVApu"; case ARMISD::VMLAVs: return "ARMISD::VMLAVs"; case ARMISD::VMLAVu: return "ARMISD::VMLAVu"; case ARMISD::VMLALVs: return "ARMISD::VMLALVs"; @@ -11816,18 +11820,15 @@ static SDValue PerformADDVecReduce(SDNode *N, return SDValue(); SDLoc dl(N); - SDValue Lo = DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, - DCI.DAG.getConstant(0, dl, MVT::i32)); - SDValue Hi = DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, - DCI.DAG.getConstant(1, dl, MVT::i32)); - SDValue Red = - VecRed->getNumOperands() == 1 - ? 
DCI.DAG.getNode(OpcodeA, dl, - DCI.DAG.getVTList({MVT::i32, MVT::i32}), Lo, Hi, - VecRed->getOperand(0)) - : DCI.DAG.getNode(OpcodeA, dl, - DCI.DAG.getVTList({MVT::i32, MVT::i32}), Lo, Hi, - VecRed->getOperand(0), VecRed->getOperand(1)); + SmallVector Ops; + Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, + DCI.DAG.getConstant(0, dl, MVT::i32))); + Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA, + DCI.DAG.getConstant(1, dl, MVT::i32))); + for (unsigned i = 0, e = VecRed.getNumOperands(); i < e; i++) + Ops.push_back(VecRed->getOperand(i)); + SDValue Red = DCI.DAG.getNode(OpcodeA, dl, + DCI.DAG.getVTList({MVT::i32, MVT::i32}), Ops); return DCI.DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red, SDValue(Red.getNode(), 1)); }; @@ -11840,6 +11841,14 @@ static SDValue PerformADDVecReduce(SDNode *N, return M; if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0)) return M; + if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0)) + return M; + if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0)) + return M; if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1)) return M; if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1)) @@ -12945,6 +12954,26 @@ static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo & return NewCopy; } +static SDValue PerformVMOVrhCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + + // fold (VMOVrh (load x)) -> (zextload (i16*)x) + if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) { + LoadSDNode *LN0 = cast(N0); + + SDValue Load = + DCI.DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(), + LN0->getBasePtr(), MVT::i16, LN0->getMemOperand()); + DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0)); + DCI.DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1)); + return Load; + } + + return SDValue(); +} + /// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node /// are normal, non-volatile loads. If so, it is profitable to bitcast an /// i64 vector to have f64 elements, since the value can then be loaded @@ -13705,6 +13734,18 @@ static SDValue PerformVDUPCombine(SDNode *N, const ARMSubtarget *Subtarget) { SelectionDAG &DAG = DCI.DAG; SDValue Op = N->getOperand(0); + SDLoc dl(N); + + if (Subtarget->hasMVEIntegerOps()) { + // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will + // need to come from a GPR. + if (Op.getValueType() == MVT::f32) + return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), + DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op)); + else if (Op.getValueType() == MVT::f16) + return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), + DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op)); + } if (!Subtarget->hasNEON()) return SDValue(); @@ -13852,6 +13893,33 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, FromVT.getVectorNumElements() % NumElements != 0) return SDValue(); + // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so + // use the VMOVN over splitting the store. We are looking for patterns of: + // !rev: 0 N 1 N+1 2 N+2 ... + // rev: N 0 N+1 1 N+2 2 ... 
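The two mask shapes listed above can be restated as a standalone predicate. A minimal sketch, illustrative only and not the patch's code, where -1 entries stand for undef lanes:

#include <vector>

// Illustrative only: true if Mask interleaves the two source halves the way a
// VMOVN would, i.e. {0, N, 1, N+1, ...} for !rev or {N, 0, N+1, 1, ...} for rev.
static bool isVMOVNInterleaveMask(const std::vector<int> &Mask, bool Rev) {
  const unsigned NumElts = Mask.size();
  const unsigned Off0 = Rev ? NumElts : 0;
  const unsigned Off1 = Rev ? 0 : NumElts;
  for (unsigned I = 0; I + 1 < NumElts; I += 2) {
    if (Mask[I] >= 0 && Mask[I] != static_cast<int>(Off0 + I / 2))
      return false;
    if (Mask[I + 1] >= 0 && Mask[I + 1] != static_cast<int>(Off1 + I / 2))
      return false;
  }
  return true;
}
// e.g. isVMOVNInterleaveMask({0, 8, 1, 9, 2, 10, 3, 11}, /*Rev=*/false) is true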
+ auto isVMOVNOriginalMask = [&](ArrayRef M, bool rev) { + unsigned NumElts = ToVT.getVectorNumElements(); + if (NumElts != M.size() || (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)) + return false; + + unsigned Off0 = rev ? NumElts : 0; + unsigned Off1 = rev ? 0 : NumElts; + + for (unsigned i = 0; i < NumElts; i += 2) { + if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2)) + return false; + if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2)) + return false; + } + + return true; + }; + + if (auto *Shuffle = dyn_cast(Trunc->getOperand(0))) + if (isVMOVNOriginalMask(Shuffle->getMask(), false) || + isVMOVNOriginalMask(Shuffle->getMask(), true)) + return SDValue(); + SDLoc DL(St); // Details about the old store SDValue Ch = St->getChain(); @@ -14344,6 +14412,51 @@ SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N, return SDValue(); break; } + + case Intrinsic::arm_mve_minv: + case Intrinsic::arm_mve_maxv: + case Intrinsic::arm_mve_minav: + case Intrinsic::arm_mve_maxav: + case Intrinsic::arm_mve_minv_predicated: + case Intrinsic::arm_mve_maxv_predicated: + case Intrinsic::arm_mve_minav_predicated: + case Intrinsic::arm_mve_maxav_predicated: { + // These intrinsics all take an i32 scalar operand which is narrowed to the + // size of a single lane of the vector type they take as the other input. + unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits(); + APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth); + if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)) + return SDValue(); + break; + } + + case Intrinsic::arm_mve_addv: { + // Turn this intrinsic straight into the appropriate ARMISD::VADDV node, + // which allow PerformADDVecReduce to turn it into VADDLV when possible. + bool Unsigned = cast(N->getOperand(2))->getZExtValue(); + unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs; + return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1)); + } + + case Intrinsic::arm_mve_addlv: + case Intrinsic::arm_mve_addlv_predicated: { + // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR + // which recombines the two outputs into an i64 + bool Unsigned = cast(N->getOperand(2))->getZExtValue(); + unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ? + (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) : + (Unsigned ? 
ARMISD::VADDLVpu : ARMISD::VADDLVps); + + SmallVector Ops; + for (unsigned i = 1, e = N->getNumOperands(); i < e; i++) + if (i != 2) // skip the unsigned flag + Ops.push_back(N->getOperand(i)); + + SDLoc dl(N); + SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops); + return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0), + val.getValue(1)); + } } return SDValue(); @@ -15052,6 +15165,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget); case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG); case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI); + case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI); case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget); case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget); case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index d95e4278e958e..b7b1d3a023580 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -207,12 +207,16 @@ class VectorType; VMULLu, // ...unsigned // MVE reductions - VADDVs, - VADDVu, - VADDLVs, - VADDLVu, - VADDLVAs, - VADDLVAu, + VADDVs, // sign- or zero-extend the elements of a vector to i32, + VADDVu, // add them all together, and return an i32 of their sum + VADDLVs, // sign- or zero-extend elements to i64 and sum, returning + VADDLVu, // the low and high 32-bit halves of the sum + VADDLVAs, // same as VADDLV[su] but also add an input accumulator + VADDLVAu, // provided as low and high halves + VADDLVps, // same as VADDLVs but with a v4i1 predicate mask + VADDLVpu, // same as VADDLVu but with a v4i1 predicate mask + VADDLVAps, // same as VADDLVps but with a v4i1 predicate mask + VADDLVApu, // same as VADDLVpu but with a v4i1 predicate mask VMLAVs, VMLAVu, VMLALVs, diff --git a/llvm/lib/Target/ARM/ARMInstrCDE.td b/llvm/lib/Target/ARM/ARMInstrCDE.td index fb02e9fefd8ce..93f3d9be24a40 100644 --- a/llvm/lib/Target/ARM/ARMInstrCDE.td +++ b/llvm/lib/Target/ARM/ARMInstrCDE.td @@ -215,6 +215,35 @@ def CDE_CX3A : CDE_CX3_Instr<"cx3a", cde_cx_params_single_acc>; def CDE_CX3D : CDE_CX3_Instr<"cx3d", cde_cx_params_dual_noacc>; def CDE_CX3DA : CDE_CX3_Instr<"cx3da", cde_cx_params_dual_acc>; +let Predicates = [HasCDE] in { + def : Pat<(i32 (int_arm_cde_cx1 timm:$coproc, timm:$imm)), + (i32 (CDE_CX1 p_imm:$coproc, imm_13b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx1a timm:$coproc, GPRwithAPSR_NZCVnosp:$acc, + timm:$imm)), + (i32 (CDE_CX1A p_imm:$coproc, GPRwithAPSR_NZCVnosp:$acc, + imm_13b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx2 timm:$coproc, GPRwithAPSR_NZCVnosp:$n, + timm:$imm)), + (i32 (CDE_CX2 p_imm:$coproc, GPRwithAPSR_NZCVnosp:$n, + imm_9b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx2a timm:$coproc, GPRwithAPSR_NZCVnosp:$acc, + GPRwithAPSR_NZCVnosp:$n, timm:$imm)), + (i32 (CDE_CX2A p_imm:$coproc, GPRwithAPSR_NZCVnosp:$acc, + GPRwithAPSR_NZCVnosp:$n, imm_9b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx3 timm:$coproc, GPRwithAPSR_NZCVnosp:$n, + GPRwithAPSR_NZCVnosp:$m, timm:$imm)), + (i32 (CDE_CX3 p_imm:$coproc, GPRwithAPSR_NZCVnosp:$n, + GPRwithAPSR_NZCVnosp:$m, imm_6b:$imm))>; + def : Pat<(i32 (int_arm_cde_cx3a timm:$coproc, + GPRwithAPSR_NZCVnosp:$acc, + GPRwithAPSR_NZCVnosp:$n, + GPRwithAPSR_NZCVnosp:$m, timm:$imm)), + (i32 (CDE_CX3A p_imm:$coproc, + GPRwithAPSR_NZCVnosp:$acc, + GPRwithAPSR_NZCVnosp:$n, + GPRwithAPSR_NZCVnosp:$m, imm_6b:$imm))>; 
+} + class CDE_RequiresSReg : Requires<[HasCDE, HasFPRegs]>; class CDE_RequiresDReg : Requires<[HasCDE, HasFPRegs]>; class CDE_RequiresQReg : Requires<[HasCDE, HasMVEInt]>; @@ -513,3 +542,67 @@ def CDE_VCX3_fpdp : CDE_VCX3_FP_Instr_D<"vcx3", cde_vcx_params_d_noacc>; def CDE_VCX3A_fpdp : CDE_VCX3_FP_Instr_D<"vcx3a", cde_vcx_params_d_acc>; def CDE_VCX3_vec : CDE_VCX3_Vec_Instr<"vcx3", cde_vcx_params_q_noacc>; def CDE_VCX3A_vec : CDE_VCX3_Vec_Instr<"vcx3a", cde_vcx_params_q_acc>; + + +let Predicates = [HasCDE, HasFPRegs] in { + def : Pat<(f32 (int_arm_cde_vcx1 timm:$coproc, timm:$imm)), + (f32 (CDE_VCX1_fpsp p_imm:$coproc, imm_11b:$imm))>; + def : Pat<(f32 (int_arm_cde_vcx1a timm:$coproc, (f32 SPR:$acc), timm:$imm)), + (f32 (CDE_VCX1A_fpsp p_imm:$coproc, SPR:$acc, imm_11b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx1 timm:$coproc, timm:$imm)), + (f64 (CDE_VCX1_fpdp p_imm:$coproc, imm_11b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx1a timm:$coproc, (f64 DPR:$acc), timm:$imm)), + (f64 (CDE_VCX1A_fpdp p_imm:$coproc, DPR:$acc, imm_11b:$imm))>; + + def : Pat<(f32 (int_arm_cde_vcx2 timm:$coproc, (f32 SPR:$n), timm:$imm)), + (f32 (CDE_VCX2_fpsp p_imm:$coproc, SPR:$n, imm_6b:$imm))>; + def : Pat<(f32 (int_arm_cde_vcx2a timm:$coproc, (f32 SPR:$acc), (f32 SPR:$n), + timm:$imm)), + (f32 (CDE_VCX2A_fpsp p_imm:$coproc, SPR:$acc, SPR:$n, imm_6b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx2 timm:$coproc, (f64 DPR:$n), timm:$imm)), + (f64 (CDE_VCX2_fpdp p_imm:$coproc, DPR:$n, imm_6b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx2a timm:$coproc, (f64 DPR:$acc), (f64 DPR:$n), + timm:$imm)), + (f64 (CDE_VCX2A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, imm_6b:$imm))>; + + def : Pat<(f32 (int_arm_cde_vcx3 timm:$coproc, (f32 SPR:$n), (f32 SPR:$m), + timm:$imm)), + (f32 (CDE_VCX3_fpsp p_imm:$coproc, (f32 SPR:$n), (f32 SPR:$m), + imm_3b:$imm))>; + def : Pat<(f32 (int_arm_cde_vcx3a timm:$coproc, (f32 SPR:$acc), (f32 SPR:$n), + (f32 SPR:$m), timm:$imm)), + (f32 (CDE_VCX3A_fpsp p_imm:$coproc, SPR:$acc, SPR:$n, SPR:$m, + imm_3b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx3 timm:$coproc, (f64 DPR:$n), (f64 DPR:$m), + timm:$imm)), + (f64 (CDE_VCX3_fpdp p_imm:$coproc, DPR:$n, DPR:$m, imm_3b:$imm))>; + def : Pat<(f64 (int_arm_cde_vcx3a timm:$coproc, (f64 DPR:$acc), (f64 DPR:$n), + (f64 DPR:$m), timm:$imm)), + (f64 (CDE_VCX3A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, DPR:$m, + imm_3b:$imm))>; +} + +let Predicates = [HasCDE, HasMVEInt] in { + def : Pat<(v16i8 (int_arm_cde_vcx1q timm:$coproc, timm:$imm)), + (v16i8 (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm))>; + def : Pat<(v16i8 (int_arm_cde_vcx1qa timm:$coproc, (v16i8 MQPR:$acc), + timm:$imm)), + (v16i8 (CDE_VCX1A_vec p_imm:$coproc, MQPR:$acc, imm_12b:$imm))>; + + def : Pat<(v16i8 (int_arm_cde_vcx2q timm:$coproc, (v16i8 MQPR:$n), timm:$imm)), + (v16i8 (CDE_VCX2_vec p_imm:$coproc, MQPR:$n, imm_7b:$imm))>; + def : Pat<(v16i8 (int_arm_cde_vcx2qa timm:$coproc, (v16i8 MQPR:$acc), + (v16i8 MQPR:$n), timm:$imm)), + (v16i8 (CDE_VCX2A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, + imm_7b:$imm))>; + + def : Pat<(v16i8 (int_arm_cde_vcx3q timm:$coproc, (v16i8 MQPR:$n), + (v16i8 MQPR:$m), timm:$imm)), + (v16i8 (CDE_VCX3_vec p_imm:$coproc, MQPR:$n, MQPR:$m, + imm_4b:$imm))>; + def : Pat<(v16i8 (int_arm_cde_vcx3qa timm:$coproc, (v16i8 MQPR:$acc), + (v16i8 MQPR:$n), (v16i8 MQPR:$m), + timm:$imm)), + (v16i8 (CDE_VCX3A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, MQPR:$m, + imm_4b:$imm))>; +} diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td index 6e4a8ebab14cb..39646e3d3557b 100644 
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td +++ b/llvm/lib/Target/ARM/ARMInstrFormats.td @@ -408,6 +408,8 @@ class InstTemplate(f), "Pseudo"); @@ -421,6 +423,8 @@ class InstTemplate op, string opc> { let Inst{3-0} = Rm{3-0}; let DecoderMethod = "DecodeLDR"; } + + def ii : ARMAsmPseudo; } defm LDRSBT : AI3ldrT<0b1101, "ldrsbt">; diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td index 58164e57ab818..fcd3b5775869e 100644 --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -556,6 +556,7 @@ class MVE_VABAV size> let Inst{5} = Qm{3}; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b1; + let horizontalReduction = 1; } multiclass MVE_VABAV_m { @@ -605,62 +606,62 @@ class MVE_VADDV size, - list pattern=[]> { - def acc : MVE_VADDV<"vaddva", suffix, +def ARMVADDVs : SDNode<"ARMISD::VADDVs", SDTVecReduce>; +def ARMVADDVu : SDNode<"ARMISD::VADDVu", SDTVecReduce>; + +multiclass MVE_VADDV_A { + def acc : MVE_VADDV<"vaddva", VTI.Suffix, (ins tGPREven:$Rda_src, MQPR:$Qm), "$Rda = $Rda_src", - 0b1, U, size, pattern>; - def no_acc : MVE_VADDV<"vaddv", suffix, + 0b1, VTI.Unsigned, VTI.Size>; + def no_acc : MVE_VADDV<"vaddv", VTI.Suffix, (ins MQPR:$Qm), "", - 0b0, U, size, pattern>; -} + 0b0, VTI.Unsigned, VTI.Size>; -defm MVE_VADDVs8 : MVE_VADDV_A<"s8", 0b0, 0b00>; -defm MVE_VADDVs16 : MVE_VADDV_A<"s16", 0b0, 0b01>; -defm MVE_VADDVs32 : MVE_VADDV_A<"s32", 0b0, 0b10>; -defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>; -defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>; -defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>; + defvar InstA = !cast(NAME # "acc"); + defvar InstN = !cast(NAME # "no_acc"); -def ARMVADDVs : SDNode<"ARMISD::VADDVs", SDTVecReduce>; -def ARMVADDVu : SDNode<"ARMISD::VADDVu", SDTVecReduce>; + let Predicates = [HasMVEInt] in { + if VTI.Unsigned then { + def : Pat<(i32 (vecreduce_add (VTI.Vec MQPR:$vec))), + (i32 (InstN $vec))>; + def : Pat<(i32 (ARMVADDVu (VTI.Vec MQPR:$vec))), + (i32 (InstN $vec))>; + def : Pat<(i32 (add (i32 (vecreduce_add (VTI.Vec MQPR:$vec))), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec))>; + def : Pat<(i32 (add (i32 (ARMVADDVu (VTI.Vec MQPR:$vec))), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec))>; + } else { + def : Pat<(i32 (ARMVADDVs (VTI.Vec MQPR:$vec))), + (i32 (InstN $vec))>; + def : Pat<(i32 (add (i32 (ARMVADDVs (VTI.Vec MQPR:$vec))), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec))>; + } -let Predicates = [HasMVEInt] in { - def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), - (i32 (MVE_VADDVu32no_acc $src))>; - def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), - (i32 (MVE_VADDVu16no_acc $src))>; - def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), - (i32 (MVE_VADDVu8no_acc $src))>; - - def : Pat<(i32 (ARMVADDVs (v8i16 MQPR:$src))), - (i32 (MVE_VADDVs16no_acc $src))>; - def : Pat<(i32 (ARMVADDVu (v8i16 MQPR:$src))), - (i32 (MVE_VADDVu16no_acc $src))>; - def : Pat<(i32 (ARMVADDVs (v16i8 MQPR:$src))), - (i32 (MVE_VADDVs8no_acc $src))>; - def : Pat<(i32 (ARMVADDVu (v16i8 MQPR:$src))), - (i32 (MVE_VADDVu8no_acc $src))>; - - def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPREven:$src2))), - (i32 (MVE_VADDVu32acc $src2, $src1))>; - def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPREven:$src2))), - (i32 (MVE_VADDVu16acc $src2, $src1))>; - def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPREven:$src2))), - (i32 (MVE_VADDVu8acc $src2, $src1))>; - - def : Pat<(i32 (add (i32 (ARMVADDVs (v8i16 MQPR:$src1))), (i32 tGPREven:$src2))), - (i32 
(MVE_VADDVs16acc $src2, $src1))>; - def : Pat<(i32 (add (i32 (ARMVADDVu (v8i16 MQPR:$src1))), (i32 tGPREven:$src2))), - (i32 (MVE_VADDVu16acc $src2, $src1))>; - def : Pat<(i32 (add (i32 (ARMVADDVs (v16i8 MQPR:$src1))), (i32 tGPREven:$src2))), - (i32 (MVE_VADDVs8acc $src2, $src1))>; - def : Pat<(i32 (add (i32 (ARMVADDVu (v16i8 MQPR:$src1))), (i32 tGPREven:$src2))), - (i32 (MVE_VADDVu8acc $src2, $src1))>; + def : Pat<(i32 (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec), + (i32 VTI.Unsigned), + (VTI.Pred VCCR:$pred))), + (i32 (InstN $vec, ARMVCCThen, $pred))>; + def : Pat<(i32 (add (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec), + (i32 VTI.Unsigned), + (VTI.Pred VCCR:$pred)), + (i32 tGPREven:$acc))), + (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>; + } } +defm MVE_VADDVs8 : MVE_VADDV_A; +defm MVE_VADDVs16 : MVE_VADDV_A; +defm MVE_VADDVs32 : MVE_VADDV_A; +defm MVE_VADDVu8 : MVE_VADDV_A; +defm MVE_VADDVu16 : MVE_VADDV_A; +defm MVE_VADDVu32 : MVE_VADDV_A; + class MVE_VADDLV pattern=[]> : MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname, @@ -679,22 +680,9 @@ class MVE_VADDLV pattern=[]> { - def acc : MVE_VADDLV<"vaddlva", suffix, - (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, MQPR:$Qm), - "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", - 0b1, U, pattern>; - def no_acc : MVE_VADDLV<"vaddlv", suffix, - (ins MQPR:$Qm), "", - 0b0, U, pattern>; -} - - -defm MVE_VADDLVs32 : MVE_VADDLV_A<"s32", 0b0>; -defm MVE_VADDLVu32 : MVE_VADDLV_A<"u32", 0b1>; - def SDTVecReduceL : SDTypeProfile<2, 1, [ // VADDLV SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2> ]>; @@ -702,23 +690,49 @@ def SDTVecReduceLA : SDTypeProfile<2, 3, [ // VADDLVA SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>, SDTCisVec<4> ]>; -def ARMVADDLVs : SDNode<"ARMISD::VADDLVs", SDTVecReduceL>; -def ARMVADDLVu : SDNode<"ARMISD::VADDLVu", SDTVecReduceL>; -def ARMVADDLVAs : SDNode<"ARMISD::VADDLVAs", SDTVecReduceLA>; -def ARMVADDLVAu : SDNode<"ARMISD::VADDLVAu", SDTVecReduceLA>; +def SDTVecReduceLP : SDTypeProfile<2, 2, [ // VADDLVp + SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>, SDTCisVec<2> +]>; +def SDTVecReduceLPA : SDTypeProfile<2, 4, [ // VADDLVAp + SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>, + SDTCisVec<4>, SDTCisVec<5> +]>; -let Predicates = [HasMVEInt] in { - def : Pat<(ARMVADDLVs (v4i32 MQPR:$val1)), - (MVE_VADDLVs32no_acc (v4i32 MQPR:$val1))>; - def : Pat<(ARMVADDLVu (v4i32 MQPR:$val1)), - (MVE_VADDLVu32no_acc (v4i32 MQPR:$val1))>; +multiclass MVE_VADDLV_A { + def acc : MVE_VADDLV<"vaddlva", VTI.Suffix, + (ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, MQPR:$Qm), + "$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src", + 0b1, VTI.Unsigned>; + def no_acc : MVE_VADDLV<"vaddlv", VTI.Suffix, + (ins MQPR:$Qm), "", + 0b0, VTI.Unsigned>; + + defvar InstA = !cast(NAME # "acc"); + defvar InstN = !cast(NAME # "no_acc"); - def : Pat<(ARMVADDLVAs tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1)), - (MVE_VADDLVs32acc tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1))>; - def : Pat<(ARMVADDLVAu tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1)), - (MVE_VADDLVu32acc tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1))>; + defvar letter = VTI.SuffixLetter; + defvar ARMVADDLV = SDNode<"ARMISD::VADDLV" # letter, SDTVecReduceL>; + defvar ARMVADDLVA = SDNode<"ARMISD::VADDLVA" # letter, SDTVecReduceLA>; + defvar ARMVADDLVp = SDNode<"ARMISD::VADDLVp" # letter, SDTVecReduceLP>; + defvar ARMVADDLVAp = SDNode<"ARMISD::VADDLVAp" # letter, SDTVecReduceLPA>; + + let Predicates = [HasMVEInt] in { + def : Pat<(ARMVADDLV (v4i32 MQPR:$vec)), + (InstN (v4i32 
MQPR:$vec))>; + def : Pat<(ARMVADDLVA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec)), + (InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec))>; + def : Pat<(ARMVADDLVp (v4i32 MQPR:$vec), (VTI.Pred VCCR:$pred)), + (InstN (v4i32 MQPR:$vec), ARMVCCThen, (VTI.Pred VCCR:$pred))>; + def : Pat<(ARMVADDLVAp tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec), + (VTI.Pred VCCR:$pred)), + (InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec), + ARMVCCThen, (VTI.Pred VCCR:$pred))>; + } } +defm MVE_VADDLVs32 : MVE_VADDLV_A; +defm MVE_VADDLVu32 : MVE_VADDLV_A; + class MVE_VMINMAXNMV pattern=[]> : MVE_rDest<(outs rGPR:$RdaDest), (ins rGPR:$RdaSrc, MQPR:$Qm), @@ -738,25 +752,47 @@ class MVE_VMINMAXNMV pattern=[]> { - def f32 : MVE_VMINMAXNMV; - def f16 : MVE_VMINMAXNMV; -} +multiclass MVE_VMINMAXNMV_p { + def "": MVE_VMINMAXNMV; + defvar Inst = !cast(NAME); + defvar unpred_intr = !cast(intrBaseName); + defvar pred_intr = !cast(intrBaseName#"_predicated"); -defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 0b1>; -defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 0b0>; + let Predicates = [HasMVEFloat] in { + def : Pat<(Scalar (unpred_intr (Scalar ScalarReg:$prev), + (VTI.Vec MQPR:$vec))), + (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR), + (VTI.Vec MQPR:$vec)), + ScalarReg)>; + def : Pat<(Scalar (pred_intr (Scalar ScalarReg:$prev), + (VTI.Vec MQPR:$vec), + (VTI.Pred VCCR:$pred))), + (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR), + (VTI.Vec MQPR:$vec), + ARMVCCThen, (VTI.Pred VCCR:$pred)), + ScalarReg)>; + } +} -multiclass MVE_VMINMAXNMAV_fty pattern=[]> { - def f32 : MVE_VMINMAXNMV; - def f16 : MVE_VMINMAXNMV; +multiclass MVE_VMINMAXNMV_fty { + defm f32 : MVE_VMINMAXNMV_p; + defm f16 : MVE_VMINMAXNMV_p; } -defm MVE_VMINNMAV : MVE_VMINMAXNMAV_fty<"vminnmav", 0b1>; -defm MVE_VMAXNMAV : MVE_VMINMAXNMAV_fty<"vmaxnmav", 0b0>; +defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 1, 1, "int_arm_mve_minnmv">; +defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 1, 0, "int_arm_mve_maxnmv">; +defm MVE_VMINNMAV: MVE_VMINMAXNMV_fty<"vminnmav", 0, 1, "int_arm_mve_minnmav">; +defm MVE_VMAXNMAV: MVE_VMINMAXNMV_fty<"vmaxnmav", 0, 0, "int_arm_mve_maxnmav">; class MVE_VMINMAXV size, bit bit_17, bit bit_7, list pattern=[]> @@ -776,33 +812,40 @@ class MVE_VMINMAXV size, let Inst{6-5} = 0b00; let Inst{3-1} = Qm{2-0}; let Inst{0} = 0b0; + let horizontalReduction = 1; } -multiclass MVE_VMINMAXV_p { +multiclass MVE_VMINMAXV_p { def "": MVE_VMINMAXV; - defvar Inst = !cast(NAME); + notAbs, isMin>; + defvar Inst = !cast(NAME); + defvar unpred_intr = !cast(intrBaseName); + defvar pred_intr = !cast(intrBaseName#"_predicated"); + defvar base_args = (? (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)); + defvar args = !if(notAbs, !con(base_args, (? 
(i32 VTI.Unsigned))), + base_args); - let Predicates = [HasMVEInt] in - def _pat : Pat<(i32 (intr (i32 rGPR:$prev), (VTI.Vec MQPR:$vec))), - (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>; + let Predicates = [HasMVEInt] in { + def : Pat<(i32 !con(args, (unpred_intr))), + (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>; + def : Pat<(i32 !con(args, (pred_intr (VTI.Pred VCCR:$pred)))), + (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec), + ARMVCCThen, (VTI.Pred VCCR:$pred)))>; + } } -multiclass MVE_VMINMAXV_ty { - defm s8 : MVE_VMINMAXV_p; - defm s16: MVE_VMINMAXV_p; - defm s32: MVE_VMINMAXV_p; - defm u8 : MVE_VMINMAXV_p; - defm u16: MVE_VMINMAXV_p; - defm u32: MVE_VMINMAXV_p; +multiclass MVE_VMINMAXV_ty { + defm s8 : MVE_VMINMAXV_p; + defm s16: MVE_VMINMAXV_p; + defm s32: MVE_VMINMAXV_p; + defm u8 : MVE_VMINMAXV_p; + defm u16: MVE_VMINMAXV_p; + defm u32: MVE_VMINMAXV_p; } -defm MVE_VMINV : MVE_VMINMAXV_ty< - "vminv", 0b1, int_arm_mve_minv_s, int_arm_mve_minv_u>; -defm MVE_VMAXV : MVE_VMINMAXV_ty< - "vmaxv", 0b0, int_arm_mve_maxv_s, int_arm_mve_maxv_u>; +defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 1, "int_arm_mve_minv">; +defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0, "int_arm_mve_maxv">; let Predicates = [HasMVEInt] in { def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))), @@ -833,14 +876,14 @@ let Predicates = [HasMVEInt] in { } -multiclass MVE_VMINMAXAV_ty pattern=[]> { - def s8 : MVE_VMINMAXV; - def s16 : MVE_VMINMAXV; - def s32 : MVE_VMINMAXV; +multiclass MVE_VMINMAXAV_ty { + defm s8 : MVE_VMINMAXV_p; + defm s16: MVE_VMINMAXV_p; + defm s32: MVE_VMINMAXV_p; } -defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>; -defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>; +defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 1, "int_arm_mve_minav">; +defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0, "int_arm_mve_maxav">; class MVE_VMLAMLSDAV @@ -861,6 +904,7 @@ class MVE_VMLAMLSDAV; - def : Pat<(v4f32 (ARMvdup (f32 SPR:$elem))), - (v4f32 (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR))))>; - def : Pat<(v8f16 (ARMvdup (f16 HPR:$elem))), - (v8f16 (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR))))>; + def : Pat<(v8f16 (ARMvdup (i32 rGPR:$elem))), + (MVE_VDUP16 rGPR:$elem)>; + def : Pat<(v4f32 (ARMvdup (i32 rGPR:$elem))), + (MVE_VDUP32 rGPR:$elem)>; def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)), (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>; @@ -2134,15 +2179,15 @@ let Predicates = [HasMVEInt] in { (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred), (v4i32 MQPR:$inactive))>; def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred), - (v4f32 (ARMvdup (f32 SPR:$elem))), + (v4f32 (ARMvdup (i32 rGPR:$elem))), (v4f32 MQPR:$inactive))), - (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR)), - ARMVCCThen, (v4i1 VCCR:$pred), (v4f32 MQPR:$inactive))>; + (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred), + (v4f32 MQPR:$inactive))>; def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred), - (v8f16 (ARMvdup (f16 HPR:$elem))), + (v8f16 (ARMvdup (i32 rGPR:$elem))), (v8f16 MQPR:$inactive))), - (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR)), - ARMVCCThen, (v8i1 VCCR:$pred), (v8f16 MQPR:$inactive))>; + (MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred), + (v8f16 MQPR:$inactive))>; } @@ -2650,6 +2695,7 @@ class MVE_VxSHRN { @@ -2692,6 +2738,7 @@ class MVE_VxQRSHRUN { @@ -4024,12 +4072,12 @@ multiclass unpred_vcmp_r { def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)), (v4i1 (!cast("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>; - def i8r : 
Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)), - (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>; - def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)), - (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>; - def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)), - (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>; + def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)), + (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc))>; + def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup rGPR:$v2)), fc)), + (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc))>; + def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)), + (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc))>; def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)))), (v16i1 (!cast("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; @@ -4038,12 +4086,12 @@ multiclass unpred_vcmp_r { def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)))), (v4i1 (!cast("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)))), - (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)))), - (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)))), - (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)))), + (v16i1 (!cast("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup rGPR:$v2)), fc)))), + (v8i1 (!cast("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)))), + (v4i1 (!cast("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; } multiclass unpred_vcmpf_z { @@ -4059,25 +4107,25 @@ multiclass unpred_vcmpf_z { } multiclass unpred_vcmpf_r { - def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)), - (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>; - def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)), - (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>; + def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)), + (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>; + def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)), + (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>; - def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)), - (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>; - def f32r : Pat<(v4i1 
(ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>; + def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc))>; + def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc))>; def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)))), (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)))), (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)))), - (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>; - def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)))), - (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)))), + (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; + def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)))), + (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>; } let Predicates = [HasMVEInt] in { @@ -4454,6 +4502,7 @@ class MVE_VxMOVxN { @@ -4788,25 +4838,21 @@ multiclass MVE_vec_scalar_fp_pat_m { let Predicates = [HasMVEFloat] in { // Unpredicated F16 - def : Pat<(v8f16 (unpred_op (v8f16 MQPR:$Qm), (v8f16 (ARMvdup HPR:$val)))), - (v8f16 (instr_f16 (v8f16 MQPR:$Qm), - (i32 (COPY_TO_REGCLASS (f16 HPR:$val), rGPR))))>; + def : Pat<(v8f16 (unpred_op (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)))), + (v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val)))>; // Unpredicated F32 - def : Pat<(v4f32 (unpred_op (v4f32 MQPR:$Qm), (v4f32 (ARMvdup SPR:$val)))), - (v4f32 (instr_f32 (v4f32 MQPR:$Qm), - (i32 (COPY_TO_REGCLASS (f32 SPR:$val), rGPR))))>; + def : Pat<(v4f32 (unpred_op (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)))), + (v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val)))>; // Predicated F16 - def : Pat<(v8f16 (pred_int (v8f16 MQPR:$Qm), (v8f16 (ARMvdup HPR:$val)), + def : Pat<(v8f16 (pred_int (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)), (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))), - (v8f16 (instr_f16 (v8f16 MQPR:$Qm), - (i32 (COPY_TO_REGCLASS (f16 HPR:$val), rGPR)), + (v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val), ARMVCCThen, (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive)))>; // Preicated F32 - def : Pat<(v4f32 (pred_int (v4f32 MQPR:$Qm), (v4f32 (ARMvdup SPR:$val)), + def : Pat<(v4f32 (pred_int (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)), (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))), - (v4f32 (instr_f32 (v4f32 MQPR:$Qm), - (i32 (COPY_TO_REGCLASS (f32 SPR:$val), rGPR)), + (v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val), ARMVCCThen, (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>; } @@ -5029,19 +5075,19 @@ defm MVE_VQSHL_qr : MVE_VxSHL_qr_types<"vqshl", 0b1, 0b0>; defm MVE_VQRSHL_qr : MVE_VxSHL_qr_types<"vqrshl", 0b1, 0b1>; let Predicates = [HasMVEInt] in { - def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))), - (v4i32 
(MVE_VSHL_qru32 (v4i32 MQPR:$Qm), GPR:$Rm))>; - def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))), - (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), GPR:$Rm))>; - def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))), - (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), GPR:$Rm))>; + def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup rGPR:$Rm)))), + (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), rGPR:$Rm))>; + def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup rGPR:$Rm)))), + (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), rGPR:$Rm))>; + def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup rGPR:$Rm)))), + (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), rGPR:$Rm))>; - def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))), - (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), GPR:$Rm))>; - def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))), - (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), GPR:$Rm))>; - def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))), - (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), GPR:$Rm))>; + def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup rGPR:$Rm)))), + (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), rGPR:$Rm))>; + def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup rGPR:$Rm)))), + (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), rGPR:$Rm))>; + def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup rGPR:$Rm)))), + (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), rGPR:$Rm))>; } class MVE_VBRSR size, list pattern=[]> @@ -5223,19 +5269,21 @@ multiclass MVE_VFMA_qr_multi; + def : Pat<(VTI.Vec (fma v1, v2, vs)), + (VTI.Vec (Inst v1, v2, is))>; def : Pat<(VTI.Vec (pred_int v1, v2, vs, pred)), (VTI.Vec (Inst v1, v2, is, ARMVCCThen, pred))>; } else { - def : Pat<(VTI.Vec (fma v1, vs, v2)), (VTI.Vec (Inst v2, v1, is))>; - def : Pat<(VTI.Vec (fma vs, v1, v2)), (VTI.Vec (Inst v2, v1, is))>; + def : Pat<(VTI.Vec (fma v1, vs, v2)), + (VTI.Vec (Inst v2, v1, is))>; + def : Pat<(VTI.Vec (fma vs, v1, v2)), + (VTI.Vec (Inst v2, v1, is))>; def : Pat<(VTI.Vec (pred_int v1, vs, v2, pred)), (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>; def : Pat<(VTI.Vec (pred_int vs, v1, v2, pred)), diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index dd4579fb6c5d4..fd7ce212d5912 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -61,6 +61,8 @@ using namespace llvm; namespace { + using InstSet = SmallPtrSetImpl; + class PostOrderLoopTraversal { MachineLoop &ML; MachineLoopInfo &MLI; @@ -518,6 +520,59 @@ static bool isRegInClass(const MachineOperand &MO, return MO.isReg() && MO.getReg() && Class->contains(MO.getReg()); } +// Can this instruction generate a non-zero result when given only zeroed +// operands? This allows us to know that, given operands with false bytes +// zeroed by masked loads, the result will also contain zeros in those +// bytes. +static bool canGenerateNonZeros(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + break; + // FIXME: FP minus 0? + //case ARM::MVE_VNEGf16: + //case ARM::MVE_VNEGf32: + case ARM::MVE_VMVN: + case ARM::MVE_VORN: + case ARM::MVE_VCLZs8: + case ARM::MVE_VCLZs16: + case ARM::MVE_VCLZs32: + return true; + } + return false; +} + +// MVE 'narrowing' operations operate on half a lane, reading from half and writing +// to half, which are referred to as the top and bottom half. The other +// half retains its previous value.
+static bool retainsPreviousHalfElement(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + uint64_t Flags = MCID.TSFlags; + return (Flags & ARMII::RetainsPreviousHalfElement) != 0; +} + +// Look at its register uses to see if it can only receive zeros +// into its false lanes, which would then produce zeros. Also check that +// the output register is also defined by a FalseLaneZeros instruction +// so that if tail-predication happens, the lanes that aren't updated will +// still be zeros. +static bool producesFalseLaneZeros(MachineInstr &MI, + const TargetRegisterClass *QPRs, + const ReachingDefAnalysis &RDA, + InstSet &FalseLaneZeros) { + if (canGenerateNonZeros(MI)) + return false; + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.getReg()) + continue; + if (auto *OpDef = RDA.getMIOperand(&MI, MO)) + if (FalseLaneZeros.count(OpDef)) + continue; + return false; + } + LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI); + return true; +} + bool LowOverheadLoop::ValidateLiveOuts() const { // We want to find out if the tail-predicated version of this loop will // produce the same values as the loop in its original form. For this to @@ -538,12 +593,14 @@ bool LowOverheadLoop::ValidateLiveOuts() const { // operands, or stored results are equivalent already. Other explicitly // predicated instructions will perform the same operation in the original // loop and the tail-predicated form too. Because of this, we can insert - // loads, stores and other predicated instructions into our KnownFalseZeros + // loads, stores and other predicated instructions into our Predicated // set and build from there. const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID); - SetVector UnknownFalseLanes; - SmallPtrSet KnownFalseZeros; + SetVector Unknown; + SmallPtrSet FalseLaneZeros; + SmallPtrSet Predicated; MachineBasicBlock *MBB = ML.getHeader(); + for (auto &MI : *MBB) { const MCInstrDesc &MCID = MI.getDesc(); uint64_t Flags = MCID.TSFlags; @@ -551,63 +608,49 @@ bool LowOverheadLoop::ValidateLiveOuts() const { continue; if (isVectorPredicated(&MI)) { - KnownFalseZeros.insert(&MI); + if (MI.mayLoad()) + FalseLaneZeros.insert(&MI); + Predicated.insert(&MI); continue; } if (MI.getNumDefs() == 0) continue; - // Only evaluate instructions which produce a single value. - assert((MI.getNumDefs() == 1 && MI.defs().begin()->isReg()) && - "Expected no more than one register def"); - - Register DefReg = MI.defs().begin()->getReg(); - for (auto &MO : MI.operands()) { - if (!isRegInClass(MO, QPRs) || !MO.isUse() || MO.getReg() != DefReg) - continue; - - // If this instruction overwrites one of its operands, and that register - // has known lanes, then this instruction also has known predicated false - // lanes.
- if (auto *OpDef = RDA.getMIOperand(&MI, MO)) { - if (KnownFalseZeros.count(OpDef)) { - KnownFalseZeros.insert(&MI); - break; - } - } - } - if (!KnownFalseZeros.count(&MI)) - UnknownFalseLanes.insert(&MI); + if (producesFalseLaneZeros(MI, QPRs, RDA, FalseLaneZeros)) + FalseLaneZeros.insert(&MI); + else if (retainsPreviousHalfElement(MI)) + return false; + else + Unknown.insert(&MI); } - auto HasKnownUsers = [this](MachineInstr *MI, const MachineOperand &MO, - SmallPtrSetImpl &Knowns) { + auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO, + SmallPtrSetImpl &Predicated) { SmallPtrSet Uses; RDA.getGlobalUses(MI, MO.getReg(), Uses); for (auto *Use : Uses) { - if (Use != MI && !Knowns.count(Use)) + if (Use != MI && !Predicated.count(Use)) return false; } return true; }; - // Now for all the unknown values, see if they're only consumed by known - // instructions. Visit in reverse so that we can start at the values being + // Visit the unknowns in reverse so that we can start at the values being // stored and then we can work towards the leaves, hopefully adding more - // instructions to KnownFalseZeros. - for (auto *MI : reverse(UnknownFalseLanes)) { + // instructions to Predicated. + for (auto *MI : reverse(Unknown)) { for (auto &MO : MI->operands()) { if (!isRegInClass(MO, QPRs) || !MO.isDef()) continue; - if (!HasKnownUsers(MI, MO, KnownFalseZeros)) { + if (!HasPredicatedUsers(MI, MO, Predicated)) { LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : " << TRI.getRegAsmName(MO.getReg()) << " at " << *MI); return false; } } // Any unknown false lanes have been masked away by the user(s). - KnownFalseZeros.insert(MI); + Predicated.insert(MI); } // Collect Q-regs that are live in the exit blocks. We don't collect scalars diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp index 84876eda33a6f..63aa65267ef26 100644 --- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -243,6 +243,9 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, this->Options.NoTrapAfterNoreturn = true; } + // ARM supports the debug entry values. + setSupportsDebugEntryValues(true); + initAsmInfo(); } diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index c1bddfb847d66..8ae7525ddea61 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -8546,6 +8546,26 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, Inst = TmpInst; return true; } + // Alias for 'ldr{sb,h,sh}t Rt, [Rn] {, #imm}' for ommitted immediate. + case ARM::LDRSBTii: + case ARM::LDRHTii: + case ARM::LDRSHTii: { + MCInst TmpInst; + + if (Inst.getOpcode() == ARM::LDRSBTii) + TmpInst.setOpcode(ARM::LDRSBTi); + else if (Inst.getOpcode() == ARM::LDRHTii) + TmpInst.setOpcode(ARM::LDRHTi); + else if (Inst.getOpcode() == ARM::LDRSHTii) + TmpInst.setOpcode(ARM::LDRSHTi); + TmpInst.addOperand(Inst.getOperand(0)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(Inst.getOperand(1)); + TmpInst.addOperand(MCOperand::createImm(256)); + TmpInst.addOperand(Inst.getOperand(2)); + Inst = TmpInst; + return true; + } // Alias for alternate form of 'str{,b}t Rt, [Rn], #imm' instruction. 
case ARM::STRT_POST: case ARM::STRBT_POST: { diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h index 6293a24623067..7d7d0af238938 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h @@ -396,6 +396,13 @@ namespace ARMII { // Whether an instruction can be included in an MVE tail-predicated loop. ValidForTailPredication = 1 << 20, + // Whether an instruction writes to the top/bottom half of a vector element + // and leaves the other half untouched. + RetainsPreviousHalfElement = 1 << 21, + + // Whether the instruction produces a scalar result from vector operands. + HorizontalReduction = 1 << 22, + //===------------------------------------------------------------------===// // Code domain. DomainShift = 15, diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp index 844021a9b2acb..119bd21248bf3 100644 --- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp +++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp @@ -378,7 +378,7 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr( // For adjcallstackdown we convert it into an 'adiw reg, ' handling // the read and write of SP in I/O space. if (Amount != 0) { - assert(getStackAlignment() == 1 && "Unsupported stack alignment"); + assert(getStackAlign() == Align(1) && "Unsupported stack alignment"); if (Opcode == TII.getCallFrameSetupOpcode()) { fixStackStores(MBB, MI, TII, true); diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h index b3504b89e4d33..a0dd1dc8ac3ec 100644 --- a/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h +++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h @@ -137,7 +137,7 @@ namespace fixups { /// of the fact that all instructions are aligned to addresses of size /// 2, so bit 0 of an address is always 0. This gives us another bit /// of precision. -/// \param[in,out] The target to adjust. +/// \param [in,out] val The target to adjust. template inline void adjustBranchTarget(T &val) { val >>= 1; } } // end of namespace fixups diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 2e7a65ba0a0d3..5248dc807368a 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -432,7 +432,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), PtrVT); bool NeedsArgAlign = false; - unsigned LargestAlignSeen = 0; + Align LargestAlignSeen; // Walk the register/memloc assignments, inserting copies/loads. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; @@ -469,8 +469,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, StackPtr.getValueType()); MemAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, MemAddr); if (ArgAlign) - LargestAlignSeen = std::max(LargestAlignSeen, - (unsigned)VA.getLocVT().getStoreSizeInBits() >> 3); + LargestAlignSeen = std::max( + LargestAlignSeen, Align(VA.getLocVT().getStoreSizeInBits() / 8)); if (Flags.isByVal()) { // The argument is a struct passed by value. According to LLVM, "Arg" // is a pointer. 
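Aside: the Hexagon hunk above replaces the raw `unsigned` alignment with `llvm::Align`, taking the maximum of the alignments seen. As a rough illustration of the Align API used in that hunk (ordered comparisons so std::max picks the stricter alignment, value(), and alignTo), here is a minimal standalone C++ sketch; it is not part of the patch, assumes only llvm/Support/Alignment.h, and the demo function name is hypothetical:

#include "llvm/Support/Alignment.h"
#include <algorithm>
#include <cassert>

static void alignDemo() {
  llvm::Align A(8), B(16);                  // power-of-two byte alignments
  llvm::Align Largest = std::max(A, B);     // the stricter (larger) alignment wins
  assert(Largest.value() == 16);            // value() gives the alignment in bytes
  assert(llvm::alignTo(52, Largest) == 64); // rounds 52 up to the next multiple of 16
}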
@@ -493,7 +493,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (NeedsArgAlign && Subtarget.hasV60Ops()) { LLVM_DEBUG(dbgs() << "Function needs byte stack align due to call args\n"); - unsigned VecAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass); + Align VecAlign(HRI.getSpillAlignment(Hexagon::HvxVRRegClass)); LargestAlignSeen = std::max(LargestAlignSeen, VecAlign); MFI.ensureMaxAlignment(LargestAlignSeen); } diff --git a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index fdcc41a4ca41d..1dc1a783e6da7 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -112,7 +112,6 @@ static const char *getSectionSuffixForSize(unsigned Size) { void HexagonTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); SmallDataSection = getContext().getELFSection(".sdata", ELF::SHT_PROGBITS, diff --git a/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp b/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp index b0f7c090bb8ec..d47306c55bad9 100644 --- a/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp +++ b/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp @@ -28,7 +28,6 @@ static cl::opt SSThreshold( void LanaiTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); SmallDataSection = getContext().getELFSection( ".sdata", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp index 9e3eefbd088f9..567180476ae0e 100644 --- a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -223,8 +223,6 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr( MachineBasicBlock::iterator I) const { const MSP430InstrInfo &TII = *static_cast(MF.getSubtarget().getInstrInfo()); - unsigned StackAlign = getStackAlignment(); - if (!hasReservedCallFrame(MF)) { // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub SP, ' and the @@ -236,7 +234,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr( // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. 
- Amount = (Amount+StackAlign-1)/StackAlign*StackAlign; + Amount = alignTo(Amount, getStackAlign()); MachineInstr *New = nullptr; if (Old.getOpcode() == TII.getCallFrameSetupOpcode()) { diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index df1d2d0832326..9dbbdeb34dbae 100644 --- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -298,6 +298,12 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); + bool expandSle(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + + bool expandSleImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + bool expandRotation(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); bool expandRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, @@ -334,6 +340,12 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); + bool expandSne(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + + bool expandSneI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + bool expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); @@ -349,6 +361,7 @@ class MipsAsmParser : public MCTargetAsmParser { bool parseSetArchDirective(); bool parseSetFeature(uint64_t Feature); bool isPicAndNotNxxAbi(); // Used by .cpload, .cprestore, and .cpsetup. + bool parseDirectiveCpAdd(SMLoc Loc); bool parseDirectiveCpLoad(SMLoc Loc); bool parseDirectiveCpLocal(SMLoc Loc); bool parseDirectiveCpRestore(SMLoc Loc); @@ -2515,6 +2528,14 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, case Mips::SGTImm64: case Mips::SGTUImm64: return expandSgtImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; + case Mips::SLE: + case Mips::SLEU: + return expandSle(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; + case Mips::SLEImm: + case Mips::SLEUImm: + case Mips::SLEImm64: + case Mips::SLEUImm64: + return expandSleImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SLTImm64: if (isInt<16>(Inst.getOperand(2).getImm())) { Inst.setOpcode(Mips::SLTi64); @@ -2591,6 +2612,10 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return expandSeq(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SEQIMacro: return expandSeqI(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; + case Mips::SNEMacro: + return expandSne(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; + case Mips::SNEIMacro: + return expandSneI(Inst, IDLoc, Out, STI) ? 
MER_Fail : MER_Success; case Mips::MFTC0: case Mips::MTTC0: case Mips::MFTGPR: case Mips::MTTGPR: case Mips::MFTLO: case Mips::MTTLO: @@ -4639,6 +4664,88 @@ bool MipsAsmParser::expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return false; } +bool MipsAsmParser::expandSle(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI) { + MipsTargetStreamer &TOut = getTargetStreamer(); + + assert(Inst.getNumOperands() == 3 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + Inst.getOperand(2).isReg() && "Invalid instruction operand."); + + unsigned DstReg = Inst.getOperand(0).getReg(); + unsigned SrcReg = Inst.getOperand(1).getReg(); + unsigned OpReg = Inst.getOperand(2).getReg(); + unsigned OpCode; + + warnIfNoMacro(IDLoc); + + switch (Inst.getOpcode()) { + case Mips::SLE: + OpCode = Mips::SLT; + break; + case Mips::SLEU: + OpCode = Mips::SLTu; + break; + default: + llvm_unreachable("unexpected 'sge' opcode"); + } + + // $SrcReg <= $OpReg is equal to (not ($OpReg < $SrcReg)) + TOut.emitRRR(OpCode, DstReg, OpReg, SrcReg, IDLoc, STI); + TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI); + + return false; +} + +bool MipsAsmParser::expandSleImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI) { + MipsTargetStreamer &TOut = getTargetStreamer(); + + assert(Inst.getNumOperands() == 3 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + Inst.getOperand(2).isImm() && "Invalid instruction operand."); + + unsigned DstReg = Inst.getOperand(0).getReg(); + unsigned SrcReg = Inst.getOperand(1).getReg(); + int64_t ImmValue = Inst.getOperand(2).getImm(); + unsigned OpRegCode; + + warnIfNoMacro(IDLoc); + + switch (Inst.getOpcode()) { + case Mips::SLEImm: + case Mips::SLEImm64: + OpRegCode = Mips::SLT; + break; + case Mips::SLEUImm: + case Mips::SLEUImm64: + OpRegCode = Mips::SLTu; + break; + default: + llvm_unreachable("unexpected 'sge' opcode with immediate"); + } + + // $SrcReg <= Imm is equal to (not (Imm < $SrcReg)) + unsigned ImmReg = DstReg; + if (DstReg == SrcReg) { + unsigned ATReg = getATReg(Inst.getLoc()); + if (!ATReg) + return true; + ImmReg = ATReg; + } + + if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, isInt<32>(ImmValue), + false, IDLoc, Out, STI)) + return true; + + TOut.emitRRR(OpRegCode, DstReg, ImmReg, SrcReg, IDLoc, STI); + TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI); + + return false; +} + bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI) { @@ -5328,6 +5435,88 @@ bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return false; } +bool MipsAsmParser::expandSne(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI) { + + MipsTargetStreamer &TOut = getTargetStreamer(); + + assert(Inst.getNumOperands() == 3 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + Inst.getOperand(2).isReg() && "Invalid instruction operand."); + + unsigned DstReg = Inst.getOperand(0).getReg(); + unsigned SrcReg = Inst.getOperand(1).getReg(); + unsigned OpReg = Inst.getOperand(2).getReg(); + + warnIfNoMacro(IDLoc); + + if (SrcReg != Mips::ZERO && OpReg != Mips::ZERO) { + TOut.emitRRR(Mips::XOR, DstReg, SrcReg, OpReg, IDLoc, STI); + TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, DstReg, IDLoc, STI); + return false; + } + + unsigned Reg = SrcReg == Mips::ZERO ? 
OpReg : SrcReg; + TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, Reg, IDLoc, STI); + return false; +} + +bool MipsAsmParser::expandSneI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI) { + MipsTargetStreamer &TOut = getTargetStreamer(); + + assert(Inst.getNumOperands() == 3 && "Invalid operand count"); + assert(Inst.getOperand(0).isReg() && + Inst.getOperand(1).isReg() && + Inst.getOperand(2).isImm() && "Invalid instruction operand."); + + unsigned DstReg = Inst.getOperand(0).getReg(); + unsigned SrcReg = Inst.getOperand(1).getReg(); + int64_t ImmValue = Inst.getOperand(2).getImm(); + + warnIfNoMacro(IDLoc); + + if (ImmValue == 0) { + TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, SrcReg, IDLoc, STI); + return false; + } + + if (SrcReg == Mips::ZERO) { + Warning(IDLoc, "comparison is always true"); + if (loadImmediate(1, DstReg, Mips::NoRegister, true, false, IDLoc, Out, + STI)) + return true; + return false; + } + + unsigned Opc; + if (ImmValue > -0x8000 && ImmValue < 0) { + ImmValue = -ImmValue; + Opc = isGP64bit() ? Mips::DADDiu : Mips::ADDiu; + } else { + Opc = Mips::XORi; + } + + if (isUInt<16>(ImmValue)) { + TOut.emitRRI(Opc, DstReg, SrcReg, ImmValue, IDLoc, STI); + TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, DstReg, IDLoc, STI); + return false; + } + + unsigned ATReg = getATReg(IDLoc); + if (!ATReg) + return true; + + if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, isInt<32>(ImmValue), + false, IDLoc, Out, STI)) + return true; + + TOut.emitRRR(Mips::XOR, DstReg, SrcReg, ATReg, IDLoc, STI); + TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, DstReg, IDLoc, STI); + return false; +} + // Map the DSP accumulator and control register to the corresponding gpr // operand. Unlike the other alias, the m(f|t)t(lo|hi|acx) instructions // do not map the DSP registers contigously to gpr registers. @@ -7444,6 +7633,31 @@ bool MipsAsmParser::isPicAndNotNxxAbi() { return inPicMode() && !(isABI_N32() || isABI_N64()); } +bool MipsAsmParser::parseDirectiveCpAdd(SMLoc Loc) { + SmallVector, 1> Reg; + OperandMatchResultTy ResTy = parseAnyRegister(Reg); + if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) { + reportParseError("expected register"); + return false; + } + + MipsOperand &RegOpnd = static_cast(*Reg[0]); + if (!RegOpnd.isGPRAsmReg()) { + reportParseError(RegOpnd.getStartLoc(), "invalid register"); + return false; + } + + // If this is not the end of the statement, report an error. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token, expected end of statement"); + return false; + } + getParser().Lex(); // Consume the EndOfStatement. 
+ + getTargetStreamer().emitDirectiveCpAdd(RegOpnd.getGPR32Reg()); + return false; +} + bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) { if (AssemblerOptions.back()->isReorder()) Warning(Loc, ".cpload should be inside a noreorder section"); @@ -8356,6 +8570,10 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) { MCAsmParser &Parser = getParser(); StringRef IDVal = DirectiveID.getString(); + if (IDVal == ".cpadd") { + parseDirectiveCpAdd(DirectiveID.getLoc()); + return false; + } if (IDVal == ".cpload") { parseDirectiveCpLoad(DirectiveID.getLoc()); return false; diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 579273004f1d7..6ec8fe8059680 100644 --- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -111,6 +111,7 @@ void MipsTargetStreamer::emitDirectiveSetDspr2() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetMips3D() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetNoMips3D() { forbidModuleDirective(); } +void MipsTargetStreamer::emitDirectiveCpAdd(unsigned RegNo) {} void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {} void MipsTargetStreamer::emitDirectiveCpLocal(unsigned RegNo) { // .cplocal $reg @@ -662,6 +663,12 @@ void MipsTargetAsmStreamer::emitFMask(unsigned FPUBitmask, OS << "," << FPUTopSavedRegOff << '\n'; } +void MipsTargetAsmStreamer::emitDirectiveCpAdd(unsigned RegNo) { + OS << "\t.cpadd\t$" + << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n"; + forbidModuleDirective(); +} + void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) { OS << "\t.cpload\t$" << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n"; @@ -1120,6 +1127,17 @@ void MipsTargetELFStreamer::emitFMask(unsigned FPUBitmask, FPROffset = FPUTopSavedRegOff; } +void MipsTargetELFStreamer::emitDirectiveCpAdd(unsigned RegNo) { + // .cpadd $reg + // This directive inserts code to add $gp to the argument's register + // when support for position independent code is enabled. 
+ if (!Pic) + return; + + emitAddu(RegNo, RegNo, GPReg, getABI().IsN64(), &STI); + forbidModuleDirective(); +} + void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) { // .cpload $reg // This directive expands to: diff --git a/llvm/lib/Target/Mips/Mips64InstrInfo.td b/llvm/lib/Target/Mips/Mips64InstrInfo.td index 306289d56e4b7..bd62a56d3008f 100644 --- a/llvm/lib/Target/Mips/Mips64InstrInfo.td +++ b/llvm/lib/Target/Mips/Mips64InstrInfo.td @@ -1248,5 +1248,19 @@ def : MipsInstAlias<"sgtu $rs, $imm", (SGTUImm64 GPR64Opnd:$rs, GPR64Opnd:$rs, imm64:$imm), 0>, GPR_64; +def SLEImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rd), + (ins GPR64Opnd:$rs, imm64:$imm), + "sle\t$rd, $rs, $imm">, GPR_64; +def : MipsInstAlias<"sle $rs, $imm", (SLEImm64 GPR64Opnd:$rs, + GPR64Opnd:$rs, + imm64:$imm), 0>, GPR_64; + +def SLEUImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rd), + (ins GPR64Opnd:$rs, imm64:$imm), + "sleu\t$rd, $rs, $imm">, GPR_64; +def : MipsInstAlias<"sleu $rs, $imm", (SLEUImm64 GPR64Opnd:$rs, + GPR64Opnd:$rs, + imm64:$imm), 0>, GPR_64; + def : MipsInstAlias<"rdhwr $rt, $rs", (RDHWR64 GPR64Opnd:$rt, HWRegsOpnd:$rs, 0), 1>, GPR_64; diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp index 4d8d42eed00b7..41aceceb0ea37 100644 --- a/llvm/lib/Target/Mips/MipsCallLowering.cpp +++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp @@ -179,8 +179,9 @@ Register IncomingValueHandler::getStackAddress(const CCValAssign &VA, MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering(); - unsigned Align = MinAlign(TFL->getStackAlignment(), Offset); - MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, Size, Align); + Align Alignment = commonAlignment(TFL->getStackAlign(), Offset); + MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, Size, + Alignment.value()); return MIRBuilder.buildFrameIndex(LLT::pointer(0, 32), FI).getReg(0); } diff --git a/llvm/lib/Target/Mips/MipsInstrFPU.td b/llvm/lib/Target/Mips/MipsInstrFPU.td index 37a0aefc9636e..5696df96e7987 100644 --- a/llvm/lib/Target/Mips/MipsInstrFPU.td +++ b/llvm/lib/Target/Mips/MipsInstrFPU.td @@ -72,7 +72,7 @@ def IsNotSingleFloat : Predicate<"!Subtarget->isSingleFloat()">, AssemblerPredicate<(all_of (not FeatureSingleFloat))>; def IsNotSoftFloat : Predicate<"!Subtarget->useSoftFloat()">, AssemblerPredicate<(all_of (not FeatureSoftFloat))>; -def Mips3D : Predicate<"Subtarget->has3D()">, +def HasMips3D : Predicate<"Subtarget->has3D()">, AssemblerPredicate<(all_of FeatureMips3D)>; //===----------------------------------------------------------------------===// @@ -479,7 +479,7 @@ let DecoderNamespace = "MipsFP64" in { } let DecoderNamespace = "MipsFP64" in { - let AdditionalPredicates = [Mips3D] in { + let AdditionalPredicates = [HasMips3D] in { def ADDR_PS64 : ADDS_FT<"addr.ps", FGR64Opnd, II_ADDR_PS, 0>, ADDS_FM<0x18, 22>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64; def MULR_PS64 : ADDS_FT<"mulr.ps", FGR64Opnd, II_MULR_PS, 0>, diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td index eabfdcf912d66..ad964df2ea426 100644 --- a/llvm/lib/Target/Mips/MipsInstrInfo.td +++ b/llvm/lib/Target/Mips/MipsInstrInfo.td @@ -2589,6 +2589,22 @@ def : MipsInstAlias<"seq $rd, $imm", (SEQIMacro GPR32Opnd:$rd, GPR32Opnd:$rd, simm32:$imm), 0>, NOT_ASE_CNMIPS; +def SNEMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "sne $rd, $rs, $rt">, NOT_ASE_CNMIPS; + +def : MipsInstAlias<"sne 
$rd, $rs", + (SNEMacro GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>, + NOT_ASE_CNMIPS; + +def SNEIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), + (ins GPR32Opnd:$rs, simm32_relaxed:$imm), + "sne $rd, $rs, $imm">, NOT_ASE_CNMIPS; + +def : MipsInstAlias<"sne $rd, $imm", + (SNEIMacro GPR32Opnd:$rd, GPR32Opnd:$rd, simm32:$imm), 0>, + NOT_ASE_CNMIPS; + def MULImmMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rd, GPR32Opnd:$rs, simm32_relaxed:$imm), "mul\t$rd, $rs, $imm">, @@ -2736,6 +2752,34 @@ let AdditionalPredicates = [NotInMicroMips] in { uimm32_coerced:$imm), 0>, GPR_32; + def SLE : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "sle\t$rd, $rs, $rt">, ISA_MIPS1; + def : MipsInstAlias<"sle $rs, $rt", + (SLE GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>, + ISA_MIPS1; + def SLEImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), + (ins GPR32Opnd:$rs, simm32:$imm), + "sle\t$rd, $rs, $imm">, GPR_32; + def : MipsInstAlias<"sle $rs, $imm", (SLEImm GPR32Opnd:$rs, + GPR32Opnd:$rs, + simm32:$imm), 0>, + GPR_32; + + def SLEU : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), + (ins GPR32Opnd:$rs, GPR32Opnd:$rt), + "sleu\t$rd, $rs, $rt">, ISA_MIPS1; + def : MipsInstAlias<"sleu $rs, $rt", + (SLEU GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>, + ISA_MIPS1; + def SLEUImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), + (ins GPR32Opnd:$rs, uimm32_coerced:$imm), + "sleu\t$rd, $rs, $imm">, GPR_32; + def : MipsInstAlias<"sleu $rs, $imm", (SLEUImm GPR32Opnd:$rs, + GPR32Opnd:$rs, + uimm32_coerced:$imm), 0>, + GPR_32; + def : MipsInstAlias< "not $rt, $rs", (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>, ISA_MIPS1; diff --git a/llvm/lib/Target/Mips/MipsScheduleP5600.td b/llvm/lib/Target/Mips/MipsScheduleP5600.td index 872ff59f5041f..3d159d412489f 100644 --- a/llvm/lib/Target/Mips/MipsScheduleP5600.td +++ b/llvm/lib/Target/Mips/MipsScheduleP5600.td @@ -20,7 +20,8 @@ def MipsP5600Model : SchedMachineModel { IsGP64bit, IsPTR64bit, InMicroMips, InMips16Mode, HasCnMips, HasCnMipsP, - HasDSP, HasDSPR2, HasMT, HasCRC]; + HasDSP, HasDSPR2, HasMips3D, HasMT, + HasCRC]; } let SchedModel = MipsP5600Model in { @@ -458,8 +459,6 @@ def : InstRW<[P5600WriteFPUL], (instregex "^C_[A-Z]+_(S|D32|D64)$")>; def : InstRW<[P5600WriteFPUL], (instregex "^FCMP_(S32|D32|D64)$")>; def : InstRW<[P5600WriteFPUL], (instregex "^PseudoCVT_(S|D32|D64)_(L|W)$")>; def : InstRW<[P5600WriteFPUL], (instrs PLL_PS64, PLU_PS64, PUL_PS64, PUU_PS64)>; -def : InstRW<[P5600WriteFPUL], (instrs ADDR_PS64, MULR_PS64)>; -def : InstRW<[P5600WriteFPUL], (instrs CVT_PS_PW64, CVT_PW_PS64)>; // div.[ds], div.ps def : InstRW<[P5600WriteFPUDivS], (instrs FDIV_S)>; diff --git a/llvm/lib/Target/Mips/MipsSubtarget.h b/llvm/lib/Target/Mips/MipsSubtarget.h index 9303832e38a9f..26ee961fc95dd 100644 --- a/llvm/lib/Target/Mips/MipsSubtarget.h +++ b/llvm/lib/Target/Mips/MipsSubtarget.h @@ -315,6 +315,7 @@ class MipsSubtarget : public MipsGenSubtargetInfo { bool hasDSP() const { return HasDSP; } bool hasDSPR2() const { return HasDSPR2; } bool hasDSPR3() const { return HasDSPR3; } + bool has3D() const { return Has3D; } bool hasMSA() const { return HasMSA; } bool disableMadd4() const { return DisableMadd4; } bool hasEVA() const { return HasEVA; } diff --git a/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp index 0852b5a18c68f..ffd7a057bf058 100644 --- a/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp +++ b/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp @@ -44,7 +44,6 @@ EmbeddedData("membedded-data", cl::Hidden, 
void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){ TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); SmallDataSection = getContext().getELFSection( ".sdata", ELF::SHT_PROGBITS, diff --git a/llvm/lib/Target/Mips/MipsTargetStreamer.h b/llvm/lib/Target/Mips/MipsTargetStreamer.h index e383d749d1d11..8c2b3bb38b9c6 100644 --- a/llvm/lib/Target/Mips/MipsTargetStreamer.h +++ b/llvm/lib/Target/Mips/MipsTargetStreamer.h @@ -92,6 +92,7 @@ class MipsTargetStreamer : public MCTargetStreamer { virtual void emitDirectiveSetHardFloat(); // PIC support + virtual void emitDirectiveCpAdd(unsigned RegNo); virtual void emitDirectiveCpLoad(unsigned RegNo); virtual void emitDirectiveCpLocal(unsigned RegNo); virtual bool emitDirectiveCpRestore(int Offset, @@ -273,6 +274,7 @@ class MipsTargetAsmStreamer : public MipsTargetStreamer { void emitDirectiveSetHardFloat() override; // PIC support + void emitDirectiveCpAdd(unsigned RegNo) override; void emitDirectiveCpLoad(unsigned RegNo) override; void emitDirectiveCpLocal(unsigned RegNo) override; @@ -345,6 +347,7 @@ class MipsTargetELFStreamer : public MipsTargetStreamer { void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override; // PIC support + void emitDirectiveCpAdd(unsigned RegNo) override; void emitDirectiveCpLoad(unsigned RegNo) override; void emitDirectiveCpLocal(unsigned RegNo) override; bool emitDirectiveCpRestore(int Offset, function_ref GetATReg, diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 83039241a7c75..a8d2abec3e9e4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -113,8 +113,8 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { createMemCpyLoopKnownSize(/* ConvertedInst */ SI, /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr, /* CopyLen */ CopyLen, - /* SrcAlign */ LI->getAlignment(), - /* DestAlign */ SI->getAlignment(), + /* SrcAlign */ LI->getAlign().valueOrOne(), + /* DestAlign */ SI->getAlign().valueOrOne(), /* SrcIsVolatile */ LI->isVolatile(), /* DstIsVolatile */ SI->isVolatile(), TTI); diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index fc817631e0ace..888b228450b65 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -463,7 +463,7 @@ def : ProcessorModel<"g5", G5Model, def : ProcessorModel<"e500", PPCE500Model, [DirectiveE500, FeatureICBT, FeatureBookE, - FeatureISEL, FeatureMFTB, FeatureSPE]>; + FeatureISEL, FeatureMFTB, FeatureMSYNC, FeatureSPE]>; def : ProcessorModel<"e500mc", PPCE500mcModel, [DirectiveE500mc, FeatureSTFIWX, FeatureICBT, FeatureBookE, diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 2d718011059ab..9aee6cb6655ac 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -15368,22 +15368,48 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const { - VT = VT.getScalarType(); - - if (!VT.isSimple()) - return false; + return isFMAFasterThanFMulAndFAdd( + MF.getFunction(), VT.getTypeForEVT(MF.getFunction().getContext())); +} - switch (VT.getSimpleVT().SimpleTy) { - case MVT::f32: - case MVT::f64: +bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F, + Type *Ty) const { + switch (Ty->getScalarType()->getTypeID()) { + case 
Type::FloatTyID: + case Type::DoubleTyID: return true; - case MVT::f128: - return (EnableQuadPrecision && Subtarget.hasP9Vector()); + case Type::FP128TyID: + return EnableQuadPrecision && Subtarget.hasP9Vector(); default: - break; + return false; } +} - return false +// Currently this is a copy from AArch64TargetLowering::isProfitableToHoist. +// FIXME: add more patterns which are profitable to hoist. +bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const { + if (I->getOpcode() != Instruction::FMul) + return true; + + if (!I->hasOneUse()) + return true; + + Instruction *User = I->user_back(); + assert(User && "A single use instruction with no uses."); + + if (User->getOpcode() != Instruction::FSub && + User->getOpcode() != Instruction::FAdd) + return true; + + const TargetOptions &Options = getTargetMachine().Options; + const Function *F = I->getFunction(); + const DataLayout &DL = F->getParent()->getDataLayout(); + Type *Ty = User->getOperand(0)->getType(); + + return !( + isFMAFasterThanFMulAndFAdd(*F, Ty) && + isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && + (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)); } const MCPhysReg * diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index c0a0f9a79a3ab..70bf4fbfce1d1 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -907,6 +907,14 @@ namespace llvm { bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(const Function &F, Type *Ty) const override; + + /// isProfitableToHoist - Check if it is profitable to hoist instruction + /// \p I to its dominator block. + /// For example, it is not profitable if \p I and its only user can form an + /// FMA instruction, because PowerPC prefers FMADD. + bool isProfitableToHoist(Instruction *I) const override; + const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; // Should we expand the build vector with shuffles? diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index b12096dacdd33..73529533c26b4 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -1341,6 +1341,21 @@ def DWToSPExtractConv { dag BVS = (v4f32 (build_vector El0SS1, El1SS1, El0SS2, El1SS2)); } +def WToDPExtractConv { + dag El0S = (f64 (PPCfcfid (PPCmtvsra (extractelt v4i32:$A, 0)))); + dag El1S = (f64 (PPCfcfid (PPCmtvsra (extractelt v4i32:$A, 1)))); + dag El2S = (f64 (PPCfcfid (PPCmtvsra (extractelt v4i32:$A, 2)))); + dag El3S = (f64 (PPCfcfid (PPCmtvsra (extractelt v4i32:$A, 3)))); + dag El0U = (f64 (PPCfcfidu (PPCmtvsrz (extractelt v4i32:$A, 0)))); + dag El1U = (f64 (PPCfcfidu (PPCmtvsrz (extractelt v4i32:$A, 1)))); + dag El2U = (f64 (PPCfcfidu (PPCmtvsrz (extractelt v4i32:$A, 2)))); + dag El3U = (f64 (PPCfcfidu (PPCmtvsrz (extractelt v4i32:$A, 3)))); + dag BV02S = (v2f64 (build_vector El0S, El2S)); + dag BV13S = (v2f64 (build_vector El1S, El3S)); + dag BV02U = (v2f64 (build_vector El0U, El2U)); + dag BV13U = (v2f64 (build_vector El1U, El3U)); +} + // The following VSX instructions were introduced in Power ISA 2.07 /* FIXME: if the operands are v2i64, these patterns will not match.
we should define new patterns or otherwise match the same patterns @@ -4171,6 +4186,41 @@ let AddedComplexity = 400 in { def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U, ExtDbl.B0U, ExtDbl.B1U)), (v4i32 (VMRGEW MrgWords.CVA0B0U, MrgWords.CVA1B1U))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))), + (f64 (fpextend (extractelt v4f32:$A, 1))))), + (v2f64 (XVCVSPDP (XXMRGHW $A, $A)))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))), + (f64 (fpextend (extractelt v4f32:$A, 0))))), + (v2f64 (XXPERMDI (XVCVSPDP (XXMRGHW $A, $A)), + (XVCVSPDP (XXMRGHW $A, $A)), 2))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))), + (f64 (fpextend (extractelt v4f32:$A, 2))))), + (v2f64 (XVCVSPDP $A))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))), + (f64 (fpextend (extractelt v4f32:$A, 3))))), + (v2f64 (XVCVSPDP (XXSLDWI $A, $A, 3)))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 2))), + (f64 (fpextend (extractelt v4f32:$A, 3))))), + (v2f64 (XVCVSPDP (XXMRGLW $A, $A)))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 3))), + (f64 (fpextend (extractelt v4f32:$A, 2))))), + (v2f64 (XXPERMDI (XVCVSPDP (XXMRGLW $A, $A)), + (XVCVSPDP (XXMRGLW $A, $A)), 2))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))), + (f64 (fpextend (extractelt v4f32:$B, 0))))), + (v2f64 (XVCVSPDP (XXPERMDI $A, $B, 0)))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 3))), + (f64 (fpextend (extractelt v4f32:$B, 3))))), + (v2f64 (XVCVSPDP (XXSLDWI (XXPERMDI $A, $B, 3), + (XXPERMDI $A, $B, 3), 1)))>; + def : Pat; + def : Pat; + def : Pat; + def : Pat; } let Predicates = [IsLittleEndian, HasP8Vector] in { @@ -4249,6 +4299,41 @@ let AddedComplexity = 400 in { def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U, ExtDbl.B0U, ExtDbl.B1U)), (v4i32 (VMRGEW MrgWords.CVB1A1U, MrgWords.CVB0A0U))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))), + (f64 (fpextend (extractelt v4f32:$A, 1))))), + (v2f64 (XVCVSPDP (XXMRGLW $A, $A)))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))), + (f64 (fpextend (extractelt v4f32:$A, 0))))), + (v2f64 (XXPERMDI (XVCVSPDP (XXMRGLW $A, $A)), + (XVCVSPDP (XXMRGLW $A, $A)), 2))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))), + (f64 (fpextend (extractelt v4f32:$A, 2))))), + (v2f64 (XVCVSPDP (XXSLDWI $A, $A, 1)))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))), + (f64 (fpextend (extractelt v4f32:$A, 3))))), + (v2f64 (XVCVSPDP $A))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 2))), + (f64 (fpextend (extractelt v4f32:$A, 3))))), + (v2f64 (XVCVSPDP (XXMRGHW $A, $A)))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 3))), + (f64 (fpextend (extractelt v4f32:$A, 2))))), + (v2f64 (XXPERMDI (XVCVSPDP (XXMRGHW $A, $A)), + (XVCVSPDP (XXMRGHW $A, $A)), 2))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))), + (f64 (fpextend (extractelt v4f32:$B, 0))))), + (v2f64 (XVCVSPDP (XXSLDWI (XXPERMDI $B, $A, 3), + (XXPERMDI $B, $A, 3), 1)))>; + def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 3))), + (f64 (fpextend (extractelt v4f32:$B, 3))))), + (v2f64 (XVCVSPDP (XXPERMDI $B, $A, 0)))>; + def : Pat; + def : Pat; + def : Pat; + def : Pat; } let Predicates = [HasDirectMove] in { diff --git 
a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 35d3c420c3c3e..24ad0f894924a 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -498,9 +498,9 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { // Get stack alignments. const PPCFrameLowering *TFI = getFrameLowering(MF); - unsigned TargetAlign = TFI->getStackAlignment(); - unsigned MaxAlign = MFI.getMaxAlign().value(); - assert((maxCallFrameSize & (MaxAlign-1)) == 0 && + Align TargetAlign = TFI->getStackAlign(); + Align MaxAlign = MFI.getMaxAlign(); + assert(isAligned(MaxAlign, maxCallFrameSize) && "Maximum call-frame size not sufficiently aligned"); // Determine the previous frame's address. If FrameSize can't be @@ -545,7 +545,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { // Unfortunately, there is no andi, only andi., and we can't insert that // here because we might clobber cr0 while it is live. BuildMI(MBB, II, dl, TII.get(PPC::LI8), NegSizeReg) - .addImm(~(MaxAlign-1)); + .addImm(~(MaxAlign.value() - 1)); unsigned NegSizeReg1 = NegSizeReg; NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC); @@ -570,7 +570,7 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const { // Unfortunately, there is no andi, only andi., and we can't insert that // here because we might clobber cr0 while it is live. BuildMI(MBB, II, dl, TII.get(PPC::LI), NegSizeReg) - .addImm(~(MaxAlign-1)); + .addImm(~(MaxAlign.value() - 1)); unsigned NegSizeReg1 = NegSizeReg; NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC); diff --git a/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp index e237fab1b2679..168630f55b24a 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp @@ -18,7 +18,6 @@ void PPC64LinuxTargetObjectFile:: Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); } MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal( diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index 927e8534ece87..0caafdd32f88c 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -631,7 +631,6 @@ RISCVFrameLowering::getFirstSPAdjustAmount(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const std::vector &CSI = MFI.getCalleeSavedInfo(); uint64_t StackSize = MFI.getStackSize(); - uint64_t StackAlign = getStackAlignment(); // Disable SplitSPAdjust if save-restore libcall used. The callee saved // registers will be pushed by the save-restore libcalls, so we don't have to @@ -648,7 +647,7 @@ RISCVFrameLowering::getFirstSPAdjustAmount(const MachineFunction &MF) const { // load/store instruction and we have to stick with the stack alignment. // 2048 is 16-byte alignment. The stack alignment for RV32 and RV64 is 16, // for RV32E is 4. So (2048 - StackAlign) will satisfy the stack alignment. 
- return 2048 - StackAlign; + return 2048 - getStackAlign().value(); } return 0; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index be6d3f5222428..4e3fde556068d 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -336,6 +336,17 @@ bool RISCVTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const { return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64; } +bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const { + if (VT == MVT::f32 && !Subtarget.hasStdExtF()) + return false; + if (VT == MVT::f64 && !Subtarget.hasStdExtD()) + return false; + if (Imm.isNegZero()) + return false; + return Imm.isZero(); +} + bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const { return (VT == MVT::f32 && Subtarget.hasStdExtF()) || (VT == MVT::f64 && Subtarget.hasStdExtD()); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 58bb0f3264b98..b56d6dce27574 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -74,6 +74,8 @@ class RISCVTargetLowering : public TargetLowering { bool isTruncateFree(EVT SrcVT, EVT DstVT) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override; + bool isFPImmLegal(const APFloat &Imm, EVT VT, + bool ForCodeSize) const override; bool hasBitPreservingFPLogic(EVT VT) const override; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 8b3274ddc4619..d32c5f37d630b 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -339,6 +339,10 @@ def SplitF64Pseudo } // Predicates = [HasStdExtD] let Predicates = [HasStdExtD, IsRV32] in { + +/// Float constants +def : Pat<(f64 (fpimm0)), (FCVT_D_W X0)>; + // double->[u]int. Round-to-zero must be used. 
def : Pat<(fp_to_sint FPR64:$rs1), (FCVT_W_D FPR64:$rs1, 0b001)>; def : Pat<(fp_to_uint FPR64:$rs1), (FCVT_WU_D FPR64:$rs1, 0b001)>; @@ -349,6 +353,10 @@ def : Pat<(uint_to_fp GPR:$rs1), (FCVT_D_WU GPR:$rs1)>; } // Predicates = [HasStdExtD, IsRV32] let Predicates = [HasStdExtD, IsRV64] in { + +/// Float constants +def : Pat<(f64 (fpimm0)), (FMV_D_X X0)>; + def : Pat<(bitconvert GPR:$rs1), (FMV_D_X GPR:$rs1)>; def : Pat<(bitconvert FPR64:$rs1), (FMV_X_D FPR64:$rs1)>; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 757fc7de56a9d..190007fe6f1fb 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -286,6 +286,9 @@ def PseudoFSW : PseudoStore<"fsw", FPR32>; // Pseudo-instructions and codegen patterns //===----------------------------------------------------------------------===// +/// Floating point constants +def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>; + /// Generic pattern classes class PatFpr32Fpr32 : Pat<(OpNode FPR32:$rs1, FPR32:$rs2), (Inst $rs1, $rs2)>; @@ -295,6 +298,9 @@ class PatFpr32Fpr32DynFrm let Predicates = [HasStdExtF] in { +/// Float constants +def : Pat<(f32 (fpimm0)), (FMV_W_X X0)>; + /// Float conversion operations // Moves (no conversion) diff --git a/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.cpp index 9db3107da0733..bd3b95a98b9f7 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.cpp @@ -22,5 +22,5 @@ using namespace llvm; -RISCVRegisterBankInfo::RISCVRegisterBankInfo(unsigned HwMode) - : RISCVGenRegisterBankInfo(HwMode) {} +RISCVRegisterBankInfo::RISCVRegisterBankInfo(const TargetRegisterInfo &TRI) + : RISCVGenRegisterBankInfo() {} diff --git a/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.h index 71dddd28380de..05fac992734d9 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterBankInfo.h @@ -31,7 +31,7 @@ class RISCVGenRegisterBankInfo : public RegisterBankInfo { /// This class provides the information for the target register banks. 
class RISCVRegisterBankInfo final : public RISCVGenRegisterBankInfo { public: - RISCVRegisterBankInfo(unsigned HwMode); + RISCVRegisterBankInfo(const TargetRegisterInfo &TRI); }; } // end namespace llvm #endif diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp index 9815a78526890..47a48c820a290 100644 --- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -56,7 +56,7 @@ RISCVSubtarget::RISCVSubtarget(const Triple &TT, StringRef CPU, StringRef FS, CallLoweringInfo.reset(new RISCVCallLowering(*getTargetLowering())); Legalizer.reset(new RISCVLegalizerInfo(*this)); - auto *RBI = new RISCVRegisterBankInfo(getHwMode()); + auto *RBI = new RISCVRegisterBankInfo(*getRegisterInfo()); RegBankInfo.reset(RBI); InstSelector.reset(createRISCVInstructionSelector( *static_cast(&TM), *this, *RBI)); diff --git a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp index bbd45c970d3dc..0d1480170fda8 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp @@ -17,7 +17,6 @@ using namespace llvm; void RISCVELFTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); SmallDataSection = getContext().getELFSection( ".sdata", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC); diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp index e6ad4d2d67aa7..022ee4e9503e5 100644 --- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp +++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp @@ -17,7 +17,6 @@ using namespace llvm; void SparcELFTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); } const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference( diff --git a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp index f6184cec795ae..092d3a2bcc30f 100644 --- a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -46,6 +46,7 @@ class SystemZShortenInst : public MachineFunctionPass { bool shortenOn001(MachineInstr &MI, unsigned Opcode); bool shortenOn001AddCC(MachineInstr &MI, unsigned Opcode); bool shortenFPConv(MachineInstr &MI, unsigned Opcode); + bool shortenFusedFPOp(MachineInstr &MI, unsigned Opcode); const SystemZInstrInfo *TII; const TargetRegisterInfo *TRI; @@ -175,6 +176,32 @@ bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) { return false; } +bool SystemZShortenInst::shortenFusedFPOp(MachineInstr &MI, unsigned Opcode) { + MachineOperand &DstMO = MI.getOperand(0); + MachineOperand &LHSMO = MI.getOperand(1); + MachineOperand &RHSMO = MI.getOperand(2); + MachineOperand &AccMO = MI.getOperand(3); + if (SystemZMC::getFirstReg(DstMO.getReg()) < 16 && + SystemZMC::getFirstReg(LHSMO.getReg()) < 16 && + SystemZMC::getFirstReg(RHSMO.getReg()) < 16 && + SystemZMC::getFirstReg(AccMO.getReg()) < 16 && + DstMO.getReg() == AccMO.getReg()) { + MachineOperand Lhs(LHSMO); + MachineOperand Rhs(RHSMO); + MachineOperand Src(AccMO); + MI.RemoveOperand(3); + MI.RemoveOperand(2); + MI.RemoveOperand(1); + MI.setDesc(TII->get(Opcode)); + MachineInstrBuilder(*MI.getParent()->getParent(), &MI) + .add(Src) + .add(Lhs) + .add(Rhs); + return true; + } + return 
false; +} + // Process all instructions in MBB. Return true if something changed. bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { bool Changed = false; @@ -235,6 +262,22 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { Changed |= shortenOn001(MI, SystemZ::MEEBR); break; + case SystemZ::WFMADB: + Changed |= shortenFusedFPOp(MI, SystemZ::MADBR); + break; + + case SystemZ::WFMASB: + Changed |= shortenFusedFPOp(MI, SystemZ::MAEBR); + break; + + case SystemZ::WFMSDB: + Changed |= shortenFusedFPOp(MI, SystemZ::MSDBR); + break; + + case SystemZ::WFMSSB: + Changed |= shortenFusedFPOp(MI, SystemZ::MSEBR); + break; + case SystemZ::WFLCDB: Changed |= shortenOn01(MI, SystemZ::LCDFR); break; diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp index 9f464c0201eb6..92caa6f940907 100644 --- a/llvm/lib/Target/TargetLoweringObjectFile.cpp +++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp @@ -38,11 +38,10 @@ using namespace llvm; /// lowering implementations a chance to set up their default sections. void TargetLoweringObjectFile::Initialize(MCContext &ctx, const TargetMachine &TM) { - Ctx = &ctx; // `Initialize` can be called more than once. delete Mang; Mang = new Mangler(); - InitMCObjectFileInfo(TM.getTargetTriple(), TM.isPositionIndependent(), *Ctx, + InitMCObjectFileInfo(TM.getTargetTriple(), TM.isPositionIndependent(), ctx, TM.getCodeModel() == CodeModel::Large); // Reset various EH DWARF encodings. @@ -121,7 +120,7 @@ MCSymbol *TargetLoweringObjectFile::getSymbolWithGlobalValueBase( NameStr += GV->getParent()->getDataLayout().getPrivateGlobalPrefix(); TM.getNameWithPrefix(NameStr, GV, *Mang); NameStr.append(Suffix.begin(), Suffix.end()); - return Ctx->getOrCreateSymbol(NameStr); + return getContext().getOrCreateSymbol(NameStr); } MCSymbol *TargetLoweringObjectFile::getCFIPersonalitySymbol( @@ -353,7 +352,7 @@ getTTypeReference(const MCSymbolRefExpr *Sym, unsigned Encoding, const MCExpr *TargetLoweringObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const { // FIXME: It's not clear what, if any, default this should have - perhaps a // null return could mean 'no location' & we should just do that here. - return MCSymbolRefExpr::create(Sym, *Ctx); + return MCSymbolRefExpr::create(Sym, getContext()); } void TargetLoweringObjectFile::getNameWithPrefix( diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp index 0a9e27462c0b4..1229a8c6dc711 100644 --- a/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp +++ b/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp @@ -113,5 +113,5 @@ void VEInstPrinter::printMemASOperand(const MCInst *MI, int opNum, void VEInstPrinter::printCCOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI, raw_ostream &O) { int CC = (int)MI->getOperand(opNum).getImm(); - O << VECondCodeToString((VECC::CondCodes)CC); + O << VECondCodeToString((VECC::CondCode)CC); } diff --git a/llvm/lib/Target/VE/VE.h b/llvm/lib/Target/VE/VE.h index 0317278279666..e5a5b87adf727 100644 --- a/llvm/lib/Target/VE/VE.h +++ b/llvm/lib/Target/VE/VE.h @@ -37,7 +37,7 @@ namespace llvm { // Enums corresponding to VE condition codes, both icc's and fcc's. These // values must be kept in sync with the ones in the .td file. 
namespace VECC { -enum CondCodes { +enum CondCode { // Integer comparison CC_IG = 0, // Greater CC_IL = 1, // Less @@ -66,7 +66,7 @@ enum CondCodes { }; } -inline static const char *VECondCodeToString(VECC::CondCodes CC) { +inline static const char *VECondCodeToString(VECC::CondCode CC) { switch (CC) { case VECC::CC_IG: return "gt"; case VECC::CC_IL: return "lt"; diff --git a/llvm/lib/Target/VE/VEFrameLowering.cpp b/llvm/lib/Target/VE/VEFrameLowering.cpp index dcbb4bc75f5d0..1305f12d7a34d 100644 --- a/llvm/lib/Target/VE/VEFrameLowering.cpp +++ b/llvm/lib/Target/VE/VEFrameLowering.cpp @@ -192,7 +192,7 @@ void VEFrameLowering::emitPrologue(MachineFunction &MF, // rather than reporting an error, as would be sensible. This is // poor, but fixing that bogosity is going to be a large project. // For now, just see if it's lied, and report an error here. - if (!NeedsStackRealignment && MFI.getMaxAlignment() > getStackAlignment()) + if (!NeedsStackRealignment && MFI.getMaxAlign() > getStackAlign()) report_fatal_error("Function \"" + Twine(MF.getName()) + "\" required " "stack re-alignment, but LLVM couldn't handle it " @@ -222,9 +222,7 @@ void VEFrameLowering::emitPrologue(MachineFunction &MF, // Finally, ensure that the size is sufficiently aligned for the // data on the stack. - if (MFI.getMaxAlignment() > 0) { - NumBytes = alignTo(NumBytes, MFI.getMaxAlignment()); - } + NumBytes = alignTo(NumBytes, MFI.getMaxAlign().value()); // Update stack size with corrected value. MFI.setStackSize(NumBytes); diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp index 8d2fff7b76dff..cc24f2e29c597 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.cpp +++ b/llvm/lib/Target/VE/VEInstrInfo.cpp @@ -36,12 +36,11 @@ using namespace llvm; void VEInstrInfo::anchor() {} VEInstrInfo::VEInstrInfo(VESubtarget &ST) - : VEGenInstrInfo(VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI(), - Subtarget(ST) {} + : VEGenInstrInfo(VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {} static bool IsIntegerCC(unsigned CC) { return (CC < VECC::CC_AF); } -static VECC::CondCodes GetOppositeBranchCondition(VECC::CondCodes CC) { +static VECC::CondCode GetOppositeBranchCondition(VECC::CondCode CC) { switch(CC) { case VECC::CC_IG: return VECC::CC_ILE; case VECC::CC_IL: return VECC::CC_IGE; @@ -269,7 +268,7 @@ unsigned VEInstrInfo::removeBranch(MachineBasicBlock &MBB, bool VEInstrInfo::reverseBranchCondition( SmallVectorImpl &Cond) const { - VECC::CondCodes CC = static_cast(Cond[0].getImm()); + VECC::CondCode CC = static_cast(Cond[0].getImm()); Cond[0].setImm(GetOppositeBranchCondition(CC)); return false; } diff --git a/llvm/lib/Target/VE/VEInstrInfo.h b/llvm/lib/Target/VE/VEInstrInfo.h index 47021efa9016f..4e28279a6675e 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.h +++ b/llvm/lib/Target/VE/VEInstrInfo.h @@ -25,7 +25,6 @@ class VESubtarget; class VEInstrInfo : public VEGenInstrInfo { const VERegisterInfo RI; - const VESubtarget &Subtarget; virtual void anchor(); public: diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td index 58244540ccd27..38dfb28cae1b5 100644 --- a/llvm/lib/Target/VE/VEInstrInfo.td +++ b/llvm/lib/Target/VE/VEInstrInfo.td @@ -83,7 +83,7 @@ def HI32 : SDNodeXForm; def icond2cc : SDNodeXFormget()) { default: llvm_unreachable("Unknown integer condition code!"); case ISD::SETEQ: cc = VECC::CC_IEQ; break; @@ -101,7 +101,7 @@ def icond2cc : SDNodeXForm; def fcond2cc : SDNodeXFormget()) { default: llvm_unreachable("Unknown float condition code!"); case ISD::SETFALSE: cc = 
VECC::CC_AF; break; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index 51d1d7e067c0d..a9842a1f72bda 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -103,7 +103,7 @@ void WebAssemblyAsmPrinter::emitEndOfAsmFile(Module &M) { if (F.isDeclarationForLinker()) { SmallVector Results; SmallVector Params; - computeSignatureVTs(F.getFunctionType(), F, TM, Params, Results); + computeSignatureVTs(F.getFunctionType(), &F, F, TM, Params, Results); auto *Sym = cast(getSymbol(&F)); Sym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION); if (!Sym->getSignature()) { @@ -290,7 +290,8 @@ void WebAssemblyAsmPrinter::emitFunctionBodyStart() { const Function &F = MF->getFunction(); SmallVector ResultVTs; SmallVector ParamVTs; - computeSignatureVTs(F.getFunctionType(), F, TM, ParamVTs, ResultVTs); + computeSignatureVTs(F.getFunctionType(), &F, F, TM, ParamVTs, ResultVTs); + auto Signature = signatureFromMVTs(ResultVTs, ParamVTs); auto *WasmSym = cast(CurrentFnSym); WasmSym->setSignature(Signature.get()); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp index 10d90d1e486bc..f7e98804bf6e3 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp @@ -640,6 +640,9 @@ bool WebAssemblyFastISel::fastLowerArguments() { if (F->isVarArg()) return false; + if (FuncInfo.Fn->getCallingConv() == CallingConv::Swift) + return false; + unsigned I = 0; for (auto const &Arg : F->args()) { const AttributeList &Attrs = F->getAttributes(); @@ -754,6 +757,9 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) { if (Func && Func->isIntrinsic()) return false; + if (Call->getCallingConv() == CallingConv::Swift) + return false; + bool IsDirect = Func != nullptr; if (!IsDirect && isa(Call->getCalledValue())) return false; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp index 6b1bbd7a2b079..8b1bf590c2ea5 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp @@ -244,6 +244,10 @@ bool FixFunctionBitcasts::runOnModule(Module &M) { // Collect all the places that need wrappers. for (Function &F : M) { + // Skip to fix when the function is swiftcc because swiftcc allows + // bitcast type difference for swiftself and swifterror. 
+ if (F.getCallingConv() == CallingConv::Swift) + continue; findUses(&F, F, Uses, ConstantBCs); // If we have a "main" function, and its type isn't diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 68e9aa644f246..f18cadd141915 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -125,6 +125,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( for (auto T : {MVT::v16i8, MVT::v8i16}) setOperationAction(Op, T, Legal); + // Support integer abs + for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32}) + setOperationAction(ISD::ABS, T, Legal); + // Custom lower BUILD_VECTORs to minimize number of replace_lanes for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64, MVT::v2f64}) @@ -765,10 +769,14 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, std::swap(OutVals[0], OutVals[1]); } + bool HasSwiftSelfArg = false; + bool HasSwiftErrorArg = false; unsigned NumFixedArgs = 0; for (unsigned I = 0; I < Outs.size(); ++I) { const ISD::OutputArg &Out = Outs[I]; SDValue &OutVal = OutVals[I]; + HasSwiftSelfArg |= Out.Flags.isSwiftSelf(); + HasSwiftErrorArg |= Out.Flags.isSwiftError(); if (Out.Flags.isNest()) fail(DL, DAG, "WebAssembly hasn't implemented nest arguments"); if (Out.Flags.isInAlloca()) @@ -798,6 +806,29 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsVarArg = CLI.IsVarArg; auto PtrVT = getPointerTy(Layout); + // For swiftcc, emit additional swiftself and swifterror arguments + // if there aren't. These additional arguments are also added for callee + // signature They are necessary to match callee and caller signature for + // indirect call. + if (CallConv == CallingConv::Swift) { + if (!HasSwiftSelfArg) { + NumFixedArgs++; + ISD::OutputArg Arg; + Arg.Flags.setSwiftSelf(); + CLI.Outs.push_back(Arg); + SDValue ArgVal = DAG.getUNDEF(PtrVT); + CLI.OutVals.push_back(ArgVal); + } + if (!HasSwiftErrorArg) { + NumFixedArgs++; + ISD::OutputArg Arg; + Arg.Flags.setSwiftError(); + CLI.Outs.push_back(Arg); + SDValue ArgVal = DAG.getUNDEF(PtrVT); + CLI.OutVals.push_back(ArgVal); + } + } + // Analyze operands of the call, assigning locations to each operand. SmallVector ArgLocs; CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); @@ -960,7 +991,11 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments( // of the incoming values before they're represented by virtual registers. MF.getRegInfo().addLiveIn(WebAssembly::ARGUMENTS); + bool HasSwiftErrorArg = false; + bool HasSwiftSelfArg = false; for (const ISD::InputArg &In : Ins) { + HasSwiftSelfArg |= In.Flags.isSwiftSelf(); + HasSwiftErrorArg |= In.Flags.isSwiftError(); if (In.Flags.isInAlloca()) fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments"); if (In.Flags.isNest()) @@ -980,6 +1015,19 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments( MFI->addParam(In.VT); } + // For swiftcc, emit additional swiftself and swifterror arguments + // if there aren't. These additional arguments are also added for callee + // signature They are necessary to match callee and caller signature for + // indirect call. 
+ auto PtrVT = getPointerTy(MF.getDataLayout()); + if (CallConv == CallingConv::Swift) { + if (!HasSwiftSelfArg) { + MFI->addParam(PtrVT); + } + if (!HasSwiftErrorArg) { + MFI->addParam(PtrVT); + } + } // Varargs are copied into a buffer allocated by the caller, and a pointer to // the buffer is passed as an argument. if (IsVarArg) { @@ -997,8 +1045,8 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments( // Record the number and types of arguments and results. SmallVector Params; SmallVector Results; - computeSignatureVTs(MF.getFunction().getFunctionType(), MF.getFunction(), - DAG.getTarget(), Params, Results); + computeSignatureVTs(MF.getFunction().getFunctionType(), &MF.getFunction(), + MF.getFunction(), DAG.getTarget(), Params, Results); for (MVT VT : Results) MFI->addResult(VT); // TODO: Use signatures in WebAssemblyMachineFunctionInfo too and unify diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td index 144b7f6ca23ed..a9cb9177f5a20 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td +++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td @@ -575,6 +575,11 @@ multiclass SIMDReduce baseInst> { // Integer vector negation def ivneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>; +// Integer absolute value: abs +defm ABS : SIMDUnary; +defm ABS : SIMDUnary; +defm ABS : SIMDUnary; + // Integer negation: neg defm NEG : SIMDUnaryInt; @@ -606,6 +611,18 @@ def : Pat<(i32 (seteq (i32 (!cast(reduction[1]#"_"#ty) (ty V128:$x)))>; } +multiclass SIMDBitmask simdop> { + defm _#vec_t : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins), + [(set I32:$dst, + (i32 (int_wasm_bitmask (vec_t V128:$vec))) + )], + vec#".bitmask\t$dst, $vec", vec#".bitmask", simdop>; +} + +defm BITMASK : SIMDBitmask; +defm BITMASK : SIMDBitmask; +defm BITMASK : SIMDBitmask; + //===----------------------------------------------------------------------===// // Bit shifts //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index b3e74601d5d02..07f183c0e1a16 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -56,7 +56,8 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { SmallVector ResultMVTs; SmallVector ParamMVTs; - computeSignatureVTs(FuncTy, CurrentFunc, TM, ParamMVTs, ResultMVTs); + const auto *const F = dyn_cast(Global); + computeSignatureVTs(FuncTy, F, CurrentFunc, TM, ParamMVTs, ResultMVTs); auto Signature = signatureFromMVTs(ResultMVTs, ParamMVTs); WasmSym->setSignature(Signature.get()); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp index e4cc2389147bc..1e1c097a0dcd1 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp @@ -42,15 +42,17 @@ void llvm::computeLegalValueVTs(const Function &F, const TargetMachine &TM, } } -void llvm::computeSignatureVTs(const FunctionType *Ty, const Function &F, +void llvm::computeSignatureVTs(const FunctionType *Ty, + const Function *TargetFunc, + const Function &ContextFunc, const TargetMachine &TM, SmallVectorImpl &Params, SmallVectorImpl &Results) { - computeLegalValueVTs(F, TM, Ty->getReturnType(), Results); + 
computeLegalValueVTs(ContextFunc, TM, Ty->getReturnType(), Results); MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits()); if (Results.size() > 1 && - !TM.getSubtarget(F).hasMultivalue()) { + !TM.getSubtarget(ContextFunc).hasMultivalue()) { // WebAssembly can't lower returns of multiple values without demoting to // sret unless multivalue is enabled (see // WebAssemblyTargetLowering::CanLowerReturn). So replace multiple return @@ -60,9 +62,28 @@ void llvm::computeSignatureVTs(const FunctionType *Ty, const Function &F, } for (auto *Param : Ty->params()) - computeLegalValueVTs(F, TM, Param, Params); + computeLegalValueVTs(ContextFunc, TM, Param, Params); if (Ty->isVarArg()) Params.push_back(PtrVT); + + // For swiftcc, emit additional swiftself and swifterror parameters + // if there aren't. These additional parameters are also passed for caller. + // They are necessary to match callee and caller signature for indirect + // call. + + if (TargetFunc && TargetFunc->getCallingConv() == CallingConv::Swift) { + MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits()); + bool HasSwiftErrorArg = false; + bool HasSwiftSelfArg = false; + for (const auto &Arg : TargetFunc->args()) { + HasSwiftErrorArg |= Arg.hasAttribute(Attribute::SwiftError); + HasSwiftSelfArg |= Arg.hasAttribute(Attribute::SwiftSelf); + } + if (!HasSwiftErrorArg) + Params.push_back(PtrVT); + if (!HasSwiftSelfArg) + Params.push_back(PtrVT); + } } void llvm::valTypesFromMVTs(const ArrayRef &In, diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h index 312e1ee9d6870..7d2279df5b283 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h @@ -159,9 +159,10 @@ void computeLegalValueVTs(const Function &F, const TargetMachine &TM, Type *Ty, SmallVectorImpl &ValueVTs); // Compute the signature for a given FunctionType (Ty). 
Note that it's not the -// signature for F (F is just used to get varous context) -void computeSignatureVTs(const FunctionType *Ty, const Function &F, - const TargetMachine &TM, SmallVectorImpl &Params, +// signature for ContextFunc (ContextFunc is just used to get varous context) +void computeSignatureVTs(const FunctionType *Ty, const Function *TargetFunc, + const Function &ContextFunc, const TargetMachine &TM, + SmallVectorImpl &Params, SmallVectorImpl &Results); void valTypesFromMVTs(const ArrayRef &In, diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h index 604438f83531d..c81b349ecedd3 100644 --- a/llvm/lib/Target/X86/X86.h +++ b/llvm/lib/Target/X86/X86.h @@ -145,6 +145,7 @@ void initializeFixupLEAPassPass(PassRegistry &); void initializeFPSPass(PassRegistry &); void initializeWinEHStatePassPass(PassRegistry &); void initializeX86AvoidSFBPassPass(PassRegistry &); +void initializeX86AvoidTrailingCallPassPass(PassRegistry &); void initializeX86CallFrameOptimizationPass(PassRegistry &); void initializeX86CmovConverterPassPass(PassRegistry &); void initializeX86CondBrFoldingPassPass(PassRegistry &); diff --git a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp index fb4f9e2901dc7..0899783d5f607 100644 --- a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp +++ b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp @@ -6,10 +6,29 @@ // //===----------------------------------------------------------------------===// // -// The Windows x64 unwinder has trouble unwinding the stack when a return -// address points to the end of the function. This pass maintains the invariant -// that every return address is inside the bounds of its parent function or -// funclet by inserting int3 if the last instruction would otherwise be a call. +// The Windows x64 unwinder decodes the instruction stream during unwinding. +// The unwinder decodes forward from the current PC to detect epilogue code +// patterns. +// +// First, this means that there must be an instruction after every +// call instruction for the unwinder to decode. LLVM must maintain the invariant +// that the last instruction of a function or funclet is not a call, or the +// unwinder may decode into the next function. Similarly, a call may not +// immediately precede an epilogue code pattern. As of this writing, the +// SEH_Epilogue pseudo instruction takes care of that. +// +// Second, all non-tail call jump targets must be within the *half-open* +// interval of the bounds of the function. The unwinder distinguishes between +// internal jump instructions and tail calls in an epilogue sequence by checking +// the jump target against the function bounds from the .pdata section. This +// means that the last regular MBB of an LLVM function must not be empty if +// there are regular jumps targeting it. +// +// This pass upholds these invariants by ensuring that blocks at the end of a +// function or funclet are a) not empty and b) do not end in a CALL instruction. 
+// +// Unwinder implementation for reference: +// https://github.com/dotnet/coreclr/blob/a9f3fc16483eecfc47fb79c362811d870be02249/src/unwinder/amd64/unwinder_amd64.cpp#L1015 // //===----------------------------------------------------------------------===// @@ -18,33 +37,35 @@ #include "X86Subtarget.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#define DEBUG_TYPE "x86-avoid-trailing-call" +#define AVOIDCALL_DESC "X86 avoid trailing call pass" +#define AVOIDCALL_NAME "x86-avoid-trailing-call" + +#define DEBUG_TYPE AVOIDCALL_NAME using namespace llvm; namespace { - class X86AvoidTrailingCallPass : public MachineFunctionPass { public: X86AvoidTrailingCallPass() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override; -private: - StringRef getPassName() const override { - return "X86 avoid trailing call pass"; - } static char ID; + +private: + StringRef getPassName() const override { return AVOIDCALL_DESC; } }; +} // end anonymous namespace char X86AvoidTrailingCallPass::ID = 0; -} // end anonymous namespace - FunctionPass *llvm::createX86AvoidTrailingCallPass() { return new X86AvoidTrailingCallPass(); } +INITIALIZE_PASS(X86AvoidTrailingCallPass, AVOIDCALL_NAME, AVOIDCALL_DESC, false, false) + // A real instruction is a non-meta, non-pseudo instruction. Some pseudos // expand to nothing, and some expand to code. This logic conservatively assumes // they might expand to nothing. @@ -62,6 +83,11 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { const X86InstrInfo &TII = *STI.getInstrInfo(); assert(STI.isTargetWin64() && "pass only runs on Win64"); + // We don't need to worry about any of the invariants described above if there + // is no unwind info (CFI). + if (!MF.hasWinCFI()) + return false; + // FIXME: Perhaps this pass should also replace SEH_Epilogue by inserting nops // before epilogues. @@ -73,33 +99,34 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) { if (NextMBB && !NextMBB->isEHFuncletEntry()) continue; - // Find the last real instruction in this block, or previous blocks if this - // block is empty. - MachineBasicBlock::reverse_iterator LastRealInstr; - for (MachineBasicBlock &RMBB : - make_range(MBB.getReverseIterator(), MF.rend())) { - LastRealInstr = llvm::find_if(reverse(RMBB), isRealInstruction); - if (LastRealInstr != RMBB.rend()) - break; - } - - // Do nothing if this function or funclet has no instructions. - if (LastRealInstr == MF.begin()->rend()) - continue; + // Find the last real instruction in this block. + auto LastRealInstr = llvm::find_if(reverse(MBB), isRealInstruction); - // If this is a call instruction, insert int3 right after it with the same - // DebugLoc. Convert back to a forward iterator and advance the insertion - // position once. - if (isCallInstruction(*LastRealInstr)) { + // If the block is empty or the last real instruction is a call instruction, + // insert an int3. If there is a call instruction, insert the int3 between + // the call and any labels or other meta instructions. If the block is + // empty, insert at block end. 
+ bool IsEmpty = LastRealInstr == MBB.rend(); + bool IsCall = !IsEmpty && isCallInstruction(*LastRealInstr); + if (IsEmpty || IsCall) { LLVM_DEBUG({ - dbgs() << "inserting int3 after trailing call instruction:\n"; - LastRealInstr->dump(); - dbgs() << '\n'; + if (IsCall) { + dbgs() << "inserting int3 after trailing call instruction:\n"; + LastRealInstr->dump(); + dbgs() << '\n'; + } else { + dbgs() << "inserting int3 in trailing empty MBB:\n"; + MBB.dump(); + } }); - MachineBasicBlock::iterator MBBI = std::next(LastRealInstr.getReverse()); - BuildMI(*LastRealInstr->getParent(), MBBI, LastRealInstr->getDebugLoc(), - TII.get(X86::INT3)); + MachineBasicBlock::iterator MBBI = MBB.end(); + DebugLoc DL; + if (IsCall) { + MBBI = std::next(LastRealInstr.getReverse()); + DL = LastRealInstr->getDebugLoc(); + } + BuildMI(MBB, MBBI, DL, TII.get(X86::INT3)); Changed = true; } } diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp index 467d24394dad6..a5831bc8ef0bc 100644 --- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp @@ -198,7 +198,7 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, if (CannotReserveFrame) return true; - unsigned StackAlign = TFL->getStackAlignment(); + Align StackAlign = TFL->getStackAlign(); int64_t Advantage = 0; for (auto CC : CallSeqVector) { @@ -221,7 +221,7 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF, // We'll need a add after the call. Advantage -= 3; // If we have to realign the stack, we'll also need a sub before - if (CC.ExpectedDist % StackAlign) + if (!isAligned(StackAlign, CC.ExpectedDist)) Advantage -= 3; // Now, for each push, we save ~3 bytes. For small constants, we actually, // save more (up to 5 bytes), but 3 should be a good approximation. 
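To make the Align migration in the hunk above concrete, here is a minimal, self-contained sketch of the isAligned() check that replaces the manual modulo in isProfitable(). It is not part of the patch; the function name needsRealignFixup and the example value 16 are illustrative only, while the real pass obtains the alignment from TFL->getStackAlign().

  #include "llvm/Support/Alignment.h"

  // Returns true when the outgoing-argument distance is not a multiple of the
  // stack alignment, i.e. when an extra SUB would be needed before the call.
  bool needsRealignFixup(uint64_t ExpectedDist) {
    llvm::Align StackAlign(16); // example value; typically TFL->getStackAlign()
    // isAligned(A, N) is equivalent to N % A.value() == 0, but the Align type
    // carries the power-of-two invariant that the old unsigned interface only
    // implied.
    return !llvm::isAligned(StackAlign, ExpectedDist);
  }

Because an Align can never be zero or a non-power-of-two value, call sites converted this way no longer need the "alignment == 0" special cases that the old unsigned-based helpers required.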
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index c25785b9000dd..c9f3186367ee1 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -3938,10 +3938,8 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, const X86InstrInfo &XII = (const X86InstrInfo &)TII; unsigned Size = DL.getTypeAllocSize(LI->getType()); - unsigned Alignment = LI->getAlignment(); - - if (Alignment == 0) // Ensure that codegen never sees alignment 0 - Alignment = DL.getABITypeAlignment(LI->getType()); + Align Alignment = + DL.getValueOrABITypeAlignment(LI->getAlign(), LI->getType()); SmallVector AddrOps; AM.getFullAddress(AddrOps); diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 1661497849d15..96bdb155075d0 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -486,7 +486,7 @@ void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB, void X86FrameLowering::emitCalleeSavedFrameMoves( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, bool IsPrologue) const { + const DebugLoc &DL) const { MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineModuleInfo &MMI = MF.getMMI(); @@ -501,15 +501,10 @@ void X86FrameLowering::emitCalleeSavedFrameMoves( I = CSI.begin(), E = CSI.end(); I != E; ++I) { int64_t Offset = MFI.getObjectOffset(I->getFrameIdx()); unsigned Reg = I->getReg(); - unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); - if (IsPrologue) { - BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); - } else { - BuildCFI(MBB, MBBI, DL, - MCCFIInstruction::createRestore(nullptr, DwarfReg)); - } + unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true); + BuildCFI(MBB, MBBI, DL, + MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset)); } } @@ -1680,7 +1675,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, } // Emit DWARF info specifying the offsets of the callee-saved registers. - emitCalleeSavedFrameMoves(MBB, MBBI, DL, true); + emitCalleeSavedFrameMoves(MBB, MBBI, DL); } // X86 Interrupt handling function cannot assume anything about the direction @@ -1769,7 +1764,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { // RBP is not included in the callee saved register block. After pushing RBP, // everything is 16 byte aligned. Everything we allocate before an outgoing // call must also be 16 byte aligned. - unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment()); + unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlign()); // Subtract out the size of the callee saved registers. This is how much stack // each funclet will allocate. return FrameSizeMinusRBP + XMMSize - CSSize; @@ -1830,8 +1825,6 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } uint64_t SEHStackAllocAmt = NumBytes; - // AfterPop is the position to insert .cfi_restore. - MachineBasicBlock::iterator AfterPop = MBBI; if (HasFP) { // Pop EBP. BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), @@ -1842,13 +1835,6 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, TRI->getDwarfRegNum(Is64Bit ? 
X86::RSP : X86::ESP, true); BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfa( nullptr, DwarfStackPtr, -SlotSize)); - if (!MBB.succ_empty() && !MBB.isReturnBlock()) { - unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true); - BuildCFI(MBB, AfterPop, DL, - MCCFIInstruction::createRestore(nullptr, DwarfFramePtr)); - --MBBI; - --AfterPop; - } --MBBI; } } @@ -1948,13 +1934,6 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } } - // Emit DWARF info specifying the restores of the callee-saved registers. - // For epilogue with return inside or being other block without successor, - // no need to generate .cfi_restore for callee-saved registers. - if (NeedsDwarfCFI && !MBB.succ_empty() && !MBB.isReturnBlock()) { - emitCalleeSavedFrameMoves(MBB, AfterPop, DL, false); - } - if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) { // Add the return addr area delta back since we are not tail calling. int Offset = -1 * X86FI->getTCReturnAddrDelta(); @@ -2072,7 +2051,8 @@ int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, return getFrameIndexReference(MF, FI, FrameReg); FrameReg = TRI->getStackRegister(); - return alignDown(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second; + return alignDown(MFI.getMaxCallFrameSize(), getStackAlign().value()) + + it->second; } int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF, @@ -2250,16 +2230,16 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots( const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); unsigned Size = TRI->getSpillSize(*RC); - unsigned Align = TRI->getSpillAlignment(*RC); + Align Alignment = TRI->getSpillAlign(*RC); // ensure alignment assert(SpillSlotOffset < 0 && "SpillSlotOffset should always < 0 on X86"); - SpillSlotOffset = -alignTo(-SpillSlotOffset, Align); + SpillSlotOffset = -alignTo(-SpillSlotOffset, Alignment); // spill into slot SpillSlotOffset -= Size; int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset); CSI[i - 1].setFrameIdx(SlotIndex); - MFI.ensureMaxAlignment(Align); + MFI.ensureMaxAlignment(Alignment); // Save the start offset and size of XMM in stack frame for funclets. if (X86::VR128RegClass.contains(Reg)) { @@ -3009,6 +2989,12 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, I = MBB.erase(I); auto InsertPos = skipDebugInstructionsForward(I, MBB.end()); + // Try to avoid emitting dead SP adjustments if the block end is unreachable, + // typically because the function is marked noreturn (abort, throw, + // assert_fail, etc). + if (isDestroy && blockEndIsUnreachable(MBB, I)) + return I; + if (!reserveCallFrame) { // If the stack pointer can be changed after prologue, turn the // adjcallstackup instruction into a 'sub ESP, ' and the @@ -3017,8 +3003,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next // alignment boundary. 
- unsigned StackAlign = getStackAlignment(); - Amount = alignTo(Amount, StackAlign); + Amount = alignTo(Amount, getStackAlign()); const Function &F = MF.getFunction(); bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); @@ -3091,13 +3076,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, return I; } - if (isDestroy && InternalAmt && !blockEndIsUnreachable(MBB, I)) { - // If we are performing frame pointer elimination and if the callee pops - // something off the stack pointer, add it back. We do this until we have - // more advanced stack pointer tracking ability. - // We are not tracking the stack pointer adjustment by the callee, so make - // sure we restore the stack pointer immediately after the call, there may - // be spill code inserted between the CALL and ADJCALLSTACKUP instructions. + if (InternalAmt) { MachineBasicBlock::iterator CI = I; MachineBasicBlock::iterator B = MBB.begin(); while (CI != B && !std::prev(CI)->isCall()) diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h index 43c81d9d5a483..c7b41543c500b 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -60,7 +60,7 @@ class X86FrameLowering : public TargetFrameLowering { void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, bool IsPrologue) const; + const DebugLoc &DL) const; /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e4bf4ab685388..1a9e5b4c5d359 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -4366,12 +4366,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, else NumBytesForCalleeToPop = 0; // Callee pops nothing. - if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) { - // No need to reset the stack after the call if the call doesn't return. To - // make the MI verify, we'll pretend the callee does it for us. - NumBytesForCalleeToPop = NumBytes; - } - // Returns a flag for retval copy to use. if (!IsSibcall) { Chain = DAG.getCALLSEQ_END(Chain, @@ -4424,7 +4418,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, unsigned X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, SelectionDAG &DAG) const { - const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment()); + const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign(); const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize(); assert(StackSize % SlotSize == 0 && "StackSize must be a multiple of SlotSize"); @@ -7118,6 +7112,24 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl &Mask, continue; } + // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF + // base vectors. + if (V.getOpcode() == ISD::INSERT_SUBVECTOR) { + SDValue Vec = V.getOperand(0); + int NumVecElts = Vec.getValueType().getVectorNumElements(); + if (Vec.isUndef() && Size == NumVecElts) { + auto *CIdx = dyn_cast(V.getOperand(2)); + int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements(); + if (CIdx && CIdx->getAPIntValue().ule(NumVecElts - NumSubElts)) { + int Idx = CIdx->getZExtValue(); + if (M < Idx || (Idx + NumSubElts) <= M) { + KnownUndef.setBit(i); + } + } + } + continue; + } + // Attempt to extract from the source's constant bits. 
if (IsSrcConstant[SrcIdx]) { if (UndefSrcElts[SrcIdx][M]) @@ -15948,7 +15960,6 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. ShuffleImm = 0; - bool UnaryMask = isUndefOrZeroOrInRange(Mask, 0, NumElts); bool ShufpdMask = true; bool CommutableMask = true; for (int i = 0; i < NumElts; ++i) { @@ -15956,7 +15967,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, continue; if (Mask[i] < 0) return false; - int Val = (i & 6) + (UnaryMask ? 0 : (NumElts * (i & 1))); + int Val = (i & 6) + NumElts * (i & 1); int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1); if (Mask[i] < Val || Mask[i] > Val + 1) ShufpdMask = false; @@ -15968,9 +15979,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, if (!ShufpdMask && !CommutableMask) return false; - if (UnaryMask) - V2 = V1; - else if (!ShufpdMask && CommutableMask) + if (!ShufpdMask && CommutableMask) std::swap(V1, V2); ForceV1Zero = ZeroLane[0]; @@ -23323,7 +23332,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, " not tell us which reg is the stack pointer!"); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - const Align StackAlign(TFI.getStackAlignment()); + const Align StackAlign = TFI.getStackAlign(); if (hasInlineStackProbe(MF)) { MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -37012,6 +37021,51 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) { return false; } +// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents. +static unsigned getAltBitOpcode(unsigned Opcode) { + switch(Opcode) { + case ISD::AND: return X86ISD::FAND; + case ISD::OR: return X86ISD::FOR; + case ISD::XOR: return X86ISD::FXOR; + case X86ISD::ANDNP: return X86ISD::FANDN; + } + llvm_unreachable("Unknown bitwise opcode"); +} + +// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets. +static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, + const SDLoc &DL) { + EVT SrcVT = Src.getValueType(); + if (SrcVT != MVT::v4i1) + return SDValue(); + + switch (Src.getOpcode()) { + case ISD::SETCC: + if (Src.getOperand(0).getValueType() == MVT::v4i32 && + ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) && + cast(Src.getOperand(2))->get() == ISD::SETLT) { + SDValue Op0 = Src.getOperand(0); + if (ISD::isNormalLoad(Op0.getNode())) + return DAG.getBitcast(MVT::v4f32, Op0); + if (Op0.getOpcode() == ISD::BITCAST && + Op0.getOperand(0).getValueType() == MVT::v4f32) + return Op0.getOperand(0); + } + break; + case ISD::AND: + case ISD::XOR: + case ISD::OR: { + SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL); + SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL); + if (Op0 && Op1) + return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0, + Op1); + break; + } + } + return SDValue(); +} + // Helper to push sign extension of vXi1 SETCC result through bitops. static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, SDValue Src, const SDLoc &DL) { @@ -37042,6 +37096,16 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1) return SDValue(); + // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type + // legalization destroys the v4i32 type. 
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) { + if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) { + V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, + DAG.getBitcast(MVT::v4f32, V)); + return DAG.getZExtOrTrunc(V, DL, VT); + } + } + // If the input is a truncate from v16i8 or v32i8 go ahead and use a // movmskb even with avx512. This will be better than truncating to vXi1 and // using a kmov. This can especially help KNL if the input is a v16i8/v32i8 @@ -37304,24 +37368,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget)) return V; - // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type - // legalization destroys the v4i32 type. - if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 && - VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC && - N0.getOperand(0).getValueType() == MVT::v4i32 && - ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) && - cast(N0.getOperand(2))->get() == ISD::SETLT) { - SDValue N00 = N0.getOperand(0); - // Only do this if we can avoid scalarizing the input. - if (ISD::isNormalLoad(N00.getNode()) || - (N00.getOpcode() == ISD::BITCAST && - N00.getOperand(0).getValueType() == MVT::v4f32)) { - SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, - DAG.getBitcast(MVT::v4f32, N00)); - return DAG.getZExtOrTrunc(V, dl, VT); - } - } - // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer // type, widen both sides to avoid a trip through memory. if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && @@ -37385,19 +37431,22 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, auto *BCast = cast(N0); unsigned SrcVTSize = SrcVT.getScalarSizeInBits(); unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits(); - MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize) - : MVT::getIntegerVT(MemSize); - MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize) - : MVT::getIntegerVT(SrcVTSize); - LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements()); + // Don't swap i8/i16 since don't have fp types that size. + if (MemSize >= 32) { + MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize) + : MVT::getIntegerVT(MemSize); + MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize) + : MVT::getIntegerVT(SrcVTSize); + LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements()); - SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); - SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() }; - SDValue ResNode = - DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, - MemVT, BCast->getMemOperand()); - DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1)); - return DAG.getBitcast(VT, ResNode); + SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); + SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, + MemVT, BCast->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1)); + return DAG.getBitcast(VT, ResNode); + } } // Since MMX types are special and don't usually play with other vector types, @@ -37791,21 +37840,14 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, if (!Subtarget.hasSSE2()) return SDValue(); - // Verify the type we're extracting from is any integer type above i16. 
- EVT VT = Extract->getOperand(0).getValueType(); - if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16)) + EVT ExtractVT = Extract->getValueType(0); + // Verify the type we're extracting is either i32 or i64. + // FIXME: Could support other types, but this is what we have coverage for. + if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64) return SDValue(); - unsigned RegSize = 128; - if (Subtarget.useBWIRegs()) - RegSize = 512; - else if (Subtarget.hasAVX()) - RegSize = 256; - - // We handle upto v16i* for SSE2 / v32i* for AVX / v64i* for AVX512. - // TODO: We should be able to handle larger vectors by splitting them before - // feeding them into several SADs, and then reducing over those. - if (RegSize / VT.getVectorNumElements() < 8) + EVT VT = Extract->getOperand(0).getValueType(); + if (!isPowerOf2_32(VT.getVectorNumElements())) return SDValue(); // Match shuffle + add pyramid. @@ -37821,8 +37863,8 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, // (extends the sign bit which is zero). // So it is correct to skip the sign/zero extend instruction. if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || - Root.getOpcode() == ISD::ZERO_EXTEND || - Root.getOpcode() == ISD::ANY_EXTEND)) + Root.getOpcode() == ISD::ZERO_EXTEND || + Root.getOpcode() == ISD::ANY_EXTEND)) Root = Root.getOperand(0); // If there was a match, we want Root to be a select that is the root of an @@ -37842,7 +37884,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, // If the original vector was wider than 8 elements, sum over the results // in the SAD vector. unsigned Stages = Log2_32(VT.getVectorNumElements()); - MVT SadVT = SAD.getSimpleValueType(); + EVT SadVT = SAD.getValueType(); if (Stages > 3) { unsigned SadElems = SadVT.getVectorNumElements(); @@ -37857,12 +37899,12 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, } } - MVT Type = Extract->getSimpleValueType(0); - unsigned TypeSizeInBits = Type.getSizeInBits(); - // Return the lowest TypeSizeInBits bits. - MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits); + unsigned ExtractSizeInBits = ExtractVT.getSizeInBits(); + // Return the lowest ExtractSizeInBits bits. + EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT, + SadVT.getSizeInBits() / ExtractSizeInBits); SAD = DAG.getBitcast(ResVT, SAD); - return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD, + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD, Extract->getOperand(1)); } @@ -45833,20 +45875,8 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, // TODO: There's nothing special about i32, any integer type above i16 should // work just as well. - if (!VT.isVector() || !VT.isSimple() || - !(VT.getVectorElementType() == MVT::i32)) - return SDValue(); - - unsigned RegSize = 128; - if (Subtarget.useBWIRegs()) - RegSize = 512; - else if (Subtarget.hasAVX()) - RegSize = 256; - - // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512. - // TODO: We should be able to handle larger vectors by splitting them before - // feeding them into several SADs, and then reducing over those. - if (VT.getSizeInBits() / 4 > RegSize) + if (!VT.isVector() || !isPowerOf2_32(VT.getVectorNumElements()) || + VT.getVectorElementType() != MVT::i32) return SDValue(); // We know N is a reduction add. 
To match SAD, we need one of the operands to @@ -45873,9 +45903,7 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, // We need to turn the vector of i64 into a vector of i32. // If the reduction vector is at least as wide as the psadbw result, just // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of - // the PSADBW will be zero. If we promote/ narrow vectors, truncate the v2i64 - // result to v2i32 which will be removed by type legalization. If we/ widen - // narrow vectors then we bitcast to v4i32 and extract v2i32. + // the PSADBW will be zero. MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32); Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad); @@ -46402,6 +46430,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, })) { unsigned NumOps = Ops.size(); switch (Op0.getOpcode()) { + case X86ISD::SHUFP: { + // Add SHUFPD support if/when necessary. + if (!IsSplat && VT.getScalarType() == MVT::f32 && + llvm::all_of(Ops, [Op0](SDValue Op) { + return Op.getOperand(2) == Op0.getOperand(2); + })) { + SmallVector LHS, RHS; + for (unsigned i = 0; i != NumOps; ++i) { + LHS.push_back(Ops[i].getOperand(0)); + RHS.push_back(Ops[i].getOperand(1)); + } + return DAG.getNode(Op0.getOpcode(), DL, VT, + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS), + DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS), + Op0.getOperand(2)); + } + break; + } case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::PSHUFD: diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 825ddedad1768..8ebe36977ba2c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1595,32 +1595,6 @@ namespace llvm { } } - /// Helper function to scale a shuffle or target shuffle mask, replacing each - /// mask index with the scaled sequential indices for an equivalent narrowed - /// mask. This is the reverse process to canWidenShuffleElements, but can - /// always succeed. - template - void scaleShuffleMask(size_t Scale, ArrayRef Mask, - SmallVectorImpl &ScaledMask) { - assert(0 < Scale && "Unexpected scaling factor"); - size_t NumElts = Mask.size(); - ScaledMask.assign(NumElts * Scale, -1); - - for (size_t i = 0; i != NumElts; ++i) { - int M = Mask[i]; - - // Repeat sentinel values in every mask element. - if (M < 0) { - for (size_t s = 0; s != Scale; ++s) - ScaledMask[(Scale * i) + s] = M; - continue; - } - - // Scale mask element and increment across each mask element. 
- for (size_t s = 0; s != Scale; ++s) - ScaledMask[(Scale * i) + s] = (Scale * M) + s; - } - } } // end namespace llvm #endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 5254565efca17..3bfad3d190f58 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -625,8 +625,7 @@ int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const { const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); if (isFrameInstr(MI)) { - unsigned StackAlign = TFI->getStackAlignment(); - int SPAdj = alignTo(getFrameSize(MI), StackAlign); + int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign()); SPAdj -= getFrameAdjustment(MI); if (!isFrameSetup(MI)) SPAdj = -SPAdj; @@ -3737,7 +3736,7 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, "Stack slot too small for store"); unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = - (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || + (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || RI.canRealignStack(MF); unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget); addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) @@ -3752,7 +3751,7 @@ void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = - (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) || + (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) || RI.canRealignStack(MF); unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget); addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx); @@ -5211,7 +5210,7 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode, MachineInstr *X86InstrInfo::foldMemoryOperandCustom( MachineFunction &MF, MachineInstr &MI, unsigned OpNum, ArrayRef MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Align) const { + unsigned Size, Align Alignment) const { switch (MI.getOpcode()) { case X86::INSERTPSrr: case X86::VINSERTPSrr: @@ -5227,7 +5226,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if ((Size == 0 || Size >= 16) && RCSize >= 16 && 4 <= Align) { + if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(4)) { int PtrOffset = SrcIdx * 4; unsigned NewImm = (DstIdx << 4) | ZMask; unsigned NewOpCode = @@ -5251,7 +5250,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if ((Size == 0 || Size >= 16) && RCSize >= 16 && 8 <= Align) { + if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) { unsigned NewOpCode = (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm : (MI.getOpcode() == X86::VMOVHLPSrr) ? 
X86::VMOVLPSrm : @@ -5270,7 +5269,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom( const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF); unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8; - if ((Size == 0 || Size >= 16) && RCSize >= 16 && Align < 16) { + if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) { MachineInstr *NewMI = FuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this); return NewMI; @@ -5302,11 +5301,10 @@ static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF, return VRegDef && VRegDef->isImplicitDef(); } - MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineFunction &MF, MachineInstr &MI, unsigned OpNum, ArrayRef MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Align, bool AllowCommute) const { + unsigned Size, Align Alignment, bool AllowCommute) const { bool isSlowTwoMemOps = Subtarget.slowTwoMemOps(); bool isTwoAddrFold = false; @@ -5346,8 +5344,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( MachineInstr *NewMI = nullptr; // Attempt to fold any custom cases we have. - if (MachineInstr *CustomMI = - foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align)) + if (MachineInstr *CustomMI = foldMemoryOperandCustom( + MF, MI, OpNum, MOs, InsertPt, Size, Alignment)) return CustomMI; const X86MemoryFoldTableEntry *I = nullptr; @@ -5374,9 +5372,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( if (I != nullptr) { unsigned Opcode = I->DstOp; - unsigned MinAlign = (I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT; - MinAlign = MinAlign ? 1 << (MinAlign - 1) : 0; - if (Align < MinAlign) + MaybeAlign MinAlign = + decodeMaybeAlign((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT); + if (MinAlign && Alignment < *MinAlign) return nullptr; bool NarrowToMOV32rm = false; if (Size) { @@ -5451,8 +5449,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( } // Attempt to fold with the commuted version of the instruction. - NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, - Size, Align, /*AllowCommute=*/false); + NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size, + Alignment, /*AllowCommute=*/false); if (NewMI) return NewMI; @@ -5506,12 +5504,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, const MachineFrameInfo &MFI = MF.getFrameInfo(); unsigned Size = MFI.getObjectSize(FrameIndex); - unsigned Alignment = MFI.getObjectAlignment(FrameIndex); + Align Alignment = MFI.getObjectAlign(FrameIndex); // If the function stack isn't realigned we don't want to fold instructions // that need increased alignment. if (!RI.needsStackRealignment(MF)) Alignment = - std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment()); + std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign()); if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) { unsigned NewOpc = 0; unsigned RCSize = 0; @@ -5811,36 +5809,36 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( return nullptr; // Determine the alignment of the load. 
- unsigned Alignment = 0; + Align Alignment; if (LoadMI.hasOneMemOperand()) - Alignment = (*LoadMI.memoperands_begin())->getAlignment(); + Alignment = Align((*LoadMI.memoperands_begin())->getAlignment()); else switch (LoadMI.getOpcode()) { case X86::AVX512_512_SET0: case X86::AVX512_512_SETALLONES: - Alignment = 64; + Alignment = Align(64); break; case X86::AVX2_SETALLONES: case X86::AVX1_SETALLONES: case X86::AVX_SET0: case X86::AVX512_256_SET0: - Alignment = 32; + Alignment = Align(32); break; case X86::V_SET0: case X86::V_SETALLONES: case X86::AVX512_128_SET0: case X86::FsFLD0F128: case X86::AVX512_FsFLD0F128: - Alignment = 16; + Alignment = Align(16); break; case X86::MMX_SET0: case X86::FsFLD0SD: case X86::AVX512_FsFLD0SD: - Alignment = 8; + Alignment = Align(8); break; case X86::FsFLD0SS: case X86::AVX512_FsFLD0SS: - Alignment = 4; + Alignment = Align(4); break; default: return nullptr; @@ -5929,7 +5927,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Opc == X86::AVX1_SETALLONES); const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); - unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); + unsigned CPI = MCP.getConstantPoolIndex(C, Alignment.value()); // Create operands to load from the constant pool entry. MOs.push_back(MachineOperand::CreateReg(PICBase, false)); diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index c9cc9f1108b03..d303ea8c7bd69 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -474,7 +474,7 @@ class X86InstrInfo final : public X86GenInstrInfo { unsigned OpNum, ArrayRef MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Alignment, + unsigned Size, Align Alignment, bool AllowCommute) const; bool isHighLatencyDef(int opc) const override; @@ -594,7 +594,7 @@ class X86InstrInfo final : public X86GenInstrInfo { unsigned OpNum, ArrayRef MOs, MachineBasicBlock::iterator InsertPt, - unsigned Size, unsigned Align) const; + unsigned Size, Align Alignment) const; /// isFrameOperand - Return true and the FrameIndex if the specified /// operand and follow operands form a reference to the stack frame. 
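As a companion to the foldMemoryOperandImpl() change above, the following self-contained sketch shows how decodeMaybeAlign() reproduces the manual decoding of the TB_ALIGN bits that the removed code performed with a shift. It is not from the patch; the helper name checkFoldAlignment is hypothetical, and the encoding it assumes (0 means no requirement, otherwise log2(alignment) + 1) is the one implied by the deleted expression 1 << (MinAlign - 1).

  #include "llvm/Support/Alignment.h"
  using namespace llvm;

  // Returns true when the operand's actual alignment satisfies the minimum
  // alignment encoded in a fold-table entry's flags.
  bool checkFoldAlignment(unsigned EncodedMinAlign, Align ActualAlignment) {
    // decodeMaybeAlign(0) yields an unset MaybeAlign; decodeMaybeAlign(5)
    // yields Align(16), i.e. 1 << (5 - 1).
    MaybeAlign MinAlign = decodeMaybeAlign(EncodedMinAlign);
    return !MinAlign || ActualAlignment >= *MinAlign;
  }

Modelling "no requirement" as an unset MaybeAlign avoids the sentinel value 0, which is not representable as an Align and previously had to be threaded through the comparison by hand.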
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index 22b4e2805a5ea..dd6b67865ac0d 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -79,6 +79,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() { initializeX86ExecutionDomainFixPass(PR); initializeX86DomainReassignmentPass(PR); initializeX86AvoidSFBPassPass(PR); + initializeX86AvoidTrailingCallPassPass(PR); initializeX86SpeculativeLoadHardeningPassPass(PR); initializeX86FlagsCopyLoweringPassPass(PR); initializeX86CondBrFoldingPassPass(PR); @@ -92,19 +93,9 @@ static std::unique_ptr createTLOF(const Triple &TT) { return std::make_unique(); } - if (TT.isOSFreeBSD()) - return std::make_unique(); - if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU()) - return std::make_unique(); - if (TT.isOSSolaris()) - return std::make_unique(); - if (TT.isOSFuchsia()) - return std::make_unique(); - if (TT.isOSBinFormatELF()) - return std::make_unique(); if (TT.isOSBinFormatCOFF()) return std::make_unique(); - llvm_unreachable("unknown subtarget type"); + return std::make_unique(); } static std::string computeDataLayout(const Triple &TT) { @@ -232,6 +223,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, setMachineOutliner(true); + // x86 supports the debug entry values. + setSupportsDebugEntryValues(true); + initAsmInfo(); } diff --git a/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/llvm/lib/Target/X86/X86TargetObjectFile.cpp index 44185957686b6..e3a8ff45f9159 100644 --- a/llvm/lib/Target/X86/X86TargetObjectFile.cpp +++ b/llvm/lib/Target/X86/X86TargetObjectFile.cpp @@ -63,30 +63,3 @@ const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol( const MCSymbol *Sym) const { return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext()); } - -void -X86FreeBSDTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { - TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); -} - -void -X86FuchsiaTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { - TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); -} - -void -X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { - TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); -} - -void X86SolarisTargetObjectFile::Initialize(MCContext &Ctx, - const TargetMachine &TM) { - TargetLoweringObjectFileELF::Initialize(Ctx, TM); - InitializeELF(TM.Options.UseInitArray); -} diff --git a/llvm/lib/Target/X86/X86TargetObjectFile.h b/llvm/lib/Target/X86/X86TargetObjectFile.h index 1fd0bbf56b19a..45904c3b0b697 100644 --- a/llvm/lib/Target/X86/X86TargetObjectFile.h +++ b/llvm/lib/Target/X86/X86TargetObjectFile.h @@ -44,33 +44,10 @@ namespace llvm { X86ELFTargetObjectFile() { PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT; } - /// Describe a TLS variable address within debug info. const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override; }; - /// X86FreeBSDTargetObjectFile - This implementation is used for FreeBSD - /// on x86 and x86-64. - class X86FreeBSDTargetObjectFile : public X86ELFTargetObjectFile { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; - }; - - /// This implementation is used for Fuchsia on x86-64. 
- class X86FuchsiaTargetObjectFile : public X86ELFTargetObjectFile { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; - }; - - /// X86LinuxNaClTargetObjectFile - This implementation is used for linux and - /// Native Client on x86 and x86-64. - class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; - }; - - /// This implementation is used for Solaris on x86/x86-64. - class X86SolarisTargetObjectFile : public X86ELFTargetObjectFile { - void Initialize(MCContext &Ctx, const TargetMachine &TM) override; - }; - } // end namespace llvm #endif diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index a03c75bb7dda2..3fc61541d5116 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -2648,39 +2648,13 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, bool IsPairwise) { + // Just use the default implementation for pair reductions. + if (IsPairwise) + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); + // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput // and make it as the cost. - static const CostTblEntry SLMCostTblPairWise[] = { - { ISD::FADD, MVT::v2f64, 3 }, - { ISD::ADD, MVT::v2i64, 5 }, - }; - - static const CostTblEntry SSE2CostTblPairWise[] = { - { ISD::FADD, MVT::v2f64, 2 }, - { ISD::FADD, MVT::v4f32, 4 }, - { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6". - { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32. - { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5". - { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16 - { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16 - { ISD::ADD, MVT::v8i16, 5 }, - { ISD::ADD, MVT::v2i8, 2 }, - { ISD::ADD, MVT::v4i8, 2 }, - { ISD::ADD, MVT::v8i8, 2 }, - { ISD::ADD, MVT::v16i8, 3 }, - }; - - static const CostTblEntry AVX1CostTblPairWise[] = { - { ISD::FADD, MVT::v4f64, 5 }, - { ISD::FADD, MVT::v8f32, 7 }, - { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5". - { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8". 
- { ISD::ADD, MVT::v8i32, 5 }, - { ISD::ADD, MVT::v16i16, 6 }, - { ISD::ADD, MVT::v32i8, 4 }, - }; - static const CostTblEntry SLMCostTblNoPairWise[] = { { ISD::FADD, MVT::v2f64, 3 }, { ISD::ADD, MVT::v2i64, 5 }, @@ -2721,66 +2695,47 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, EVT VT = TLI->getValueType(DL, ValTy); if (VT.isSimple()) { MVT MTy = VT.getSimpleVT(); - if (IsPairwise) { - if (ST->isSLM()) - if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy)) - return Entry->Cost; - - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) - return Entry->Cost; - - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) - return Entry->Cost; - } else { - if (ST->isSLM()) - if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) - return Entry->Cost; + if (ST->isSLM()) + if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) + return Entry->Cost; - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) - return Entry->Cost; + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return Entry->Cost; - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) - return Entry->Cost; - } + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) + return Entry->Cost; } std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); MVT MTy = LT.second; - if (IsPairwise) { - if (ST->isSLM()) - if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; - - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + unsigned ArithmeticCost = 0; + if (LT.first != 1 && MTy.isVector() && + MTy.getVectorNumElements() < ValTy->getVectorNumElements()) { + // Type needs to be split. We need LT.first - 1 arithmetic ops. + Type *SingleOpTy = VectorType::get(ValTy->getVectorElementType(), + MTy.getVectorNumElements()); + ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy); + ArithmeticCost *= LT.first - 1; + } - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy)) - return LT.first * Entry->Cost; - } else { - if (ST->isSLM()) - if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->isSLM()) + if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy)) + return ArithmeticCost + Entry->Cost; - if (ST->hasAVX()) - if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return ArithmeticCost + Entry->Cost; - if (ST->hasSSE2()) - if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) - return LT.first * Entry->Cost; - } + if (ST->hasSSE2()) + if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy)) + return ArithmeticCost + Entry->Cost; // FIXME: These assume a naive kshift+binop lowering, which is probably // conservative in most cases. - // FIXME: This doesn't cost large types like v128i1 correctly. 
static const CostTblEntry AVX512BoolReduction[] = { { ISD::AND, MVT::v2i1, 3 }, { ISD::AND, MVT::v4i1, 5 }, @@ -2826,22 +2781,100 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, }; // Handle bool allof/anyof patterns. - if (!IsPairwise && ValTy->getVectorElementType()->isIntegerTy(1)) { + if (ValTy->getVectorElementType()->isIntegerTy(1)) { + unsigned ArithmeticCost = 0; + if (LT.first != 1 && MTy.isVector() && + MTy.getVectorNumElements() < ValTy->getVectorNumElements()) { + // Type needs to be split. We need LT.first - 1 arithmetic ops. + Type *SingleOpTy = VectorType::get(ValTy->getVectorElementType(), + MTy.getVectorNumElements()); + ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy); + ArithmeticCost *= LT.first - 1; + } + if (ST->hasAVX512()) if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy)) - return LT.first * Entry->Cost; + return ArithmeticCost + Entry->Cost; if (ST->hasAVX2()) if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy)) - return LT.first * Entry->Cost; + return ArithmeticCost + Entry->Cost; if (ST->hasAVX()) if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy)) - return LT.first * Entry->Cost; + return ArithmeticCost + Entry->Cost; if (ST->hasSSE2()) if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy)) - return LT.first * Entry->Cost; + return ArithmeticCost + Entry->Cost; + + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); + } + + unsigned NumVecElts = ValTy->getVectorNumElements(); + unsigned ScalarSize = ValTy->getScalarSizeInBits(); + + // Special case power of 2 reductions where the scalar type isn't changed + // by type legalization. + if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits()) + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); + + unsigned ReductionCost = 0; + + Type *Ty = ValTy; + if (LT.first != 1 && MTy.isVector() && + MTy.getVectorNumElements() < ValTy->getVectorNumElements()) { + // Type needs to be split. We need LT.first - 1 arithmetic ops. + Ty = VectorType::get(ValTy->getVectorElementType(), + MTy.getVectorNumElements()); + ReductionCost = getArithmeticInstrCost(Opcode, Ty); + ReductionCost *= LT.first - 1; + NumVecElts = MTy.getVectorNumElements(); + } + + // Now handle reduction with the legal type, taking into account size changes + // at each level. + while (NumVecElts > 1) { + // Determine the size of the remaining vector we need to reduce. + unsigned Size = NumVecElts * ScalarSize; + NumVecElts /= 2; + // If we're reducing from 256/512 bits, use an extract_subvector. + if (Size > 128) { + Type *SubTy = VectorType::get(ValTy->getVectorElementType(), NumVecElts); + ReductionCost += + getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy); + Ty = SubTy; + } else if (Size == 128) { + // Reducing from 128 bits is a permute of v2f64/v2i64. + Type *ShufTy; + if (ValTy->isFloatingPointTy()) + ShufTy = VectorType::get(Type::getDoubleTy(ValTy->getContext()), 2); + else + ShufTy = VectorType::get(Type::getInt64Ty(ValTy->getContext()), 2); + ReductionCost += + getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); + } else if (Size == 64) { + // Reducing from 64 bits is a shuffle of v4f32/v4i32. 
+ Type *ShufTy; + if (ValTy->isFloatingPointTy()) + ShufTy = VectorType::get(Type::getFloatTy(ValTy->getContext()), 4); + else + ShufTy = VectorType::get(Type::getInt32Ty(ValTy->getContext()), 4); + ReductionCost += + getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr); + } else { + // Reducing from smaller size is a shift by immediate. + Type *ShiftTy = VectorType::get( + Type::getIntNTy(ValTy->getContext(), Size), 128 / Size); + ReductionCost += getArithmeticInstrCost( + Instruction::LShr, ShiftTy, TargetTransformInfo::OK_AnyValue, + TargetTransformInfo::OK_UniformConstantValue, + TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); + } + + // Add the arithmetic op for this level. + ReductionCost += getArithmeticInstrCost(Opcode, Ty); } - return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); + // Add the final extract element to the cost. + return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0); } int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 0016597a8da82..c85f480775808 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -108,7 +108,6 @@ struct SuspendCrossingInfo { size_t const DefIndex = Mapping.blockToIndex(DefBB); size_t const UseIndex = Mapping.blockToIndex(UseBB); - assert(Block[UseIndex].Consumes[DefIndex] && "use must consume def"); bool const Result = Block[UseIndex].Kills[DefIndex]; LLVM_DEBUG(dbgs() << UseBB->getName() << " => " << DefBB->getName() << " answer is " << Result << "\n"); @@ -1396,6 +1395,24 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { Spills.clear(); } + // Collect lifetime.start info for each alloca. + using LifetimeStart = SmallPtrSet; + llvm::DenseMap> LifetimeMap; + for (Instruction &I : instructions(F)) { + auto *II = dyn_cast(&I); + if (!II || II->getIntrinsicID() != Intrinsic::lifetime_start) + continue; + + if (auto *OpInst = dyn_cast(I.getOperand(1))) + if (auto *AI = dyn_cast(OpInst->getOperand(0))) { + + if (LifetimeMap.find(AI) == LifetimeMap.end()) + LifetimeMap[AI] = std::make_unique(); + + LifetimeMap[AI]->insert(OpInst); + } + } + // Collect the spills for arguments and other not-materializable values. for (Argument &A : F.args()) for (User *U : A.users()) @@ -1441,14 +1458,27 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { continue; } - for (User *U : I.users()) - if (Checker.isDefinitionAcrossSuspend(I, U)) { + auto Iter = LifetimeMap.find(&I); + for (User *U : I.users()) { + bool NeedSpill = false; + + // Check against lifetime.start if the instruction has the info. + if (Iter != LifetimeMap.end()) + for (auto *S : *Iter->second) { + if ((NeedSpill = Checker.isDefinitionAcrossSuspend(*S, U))) + break; + } + else + NeedSpill = Checker.isDefinitionAcrossSuspend(I, U); + + if (NeedSpill) { // We cannot spill a token. 
if (I.getType()->isTokenTy()) report_fatal_error( "token definition is separated from the use by a suspend point"); Spills.emplace_back(&I, U); } + } } LLVM_DEBUG(dump("Spills", Spills)); Shape.FrameTy = buildFrameType(F, Shape, Spills); diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index 04c06e3653d7a..465b6598da650 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -567,8 +567,9 @@ void CoroCloner::replaceEntryBlock() { // branching to the original beginning of the coroutine. Make this // the entry block of the cloned function. auto *Entry = cast(VMap[Shape.AllocaSpillBlock]); + auto *OldEntry = &NewF->getEntryBlock(); Entry->setName("entry" + Suffix); - Entry->moveBefore(&NewF->getEntryBlock()); + Entry->moveBefore(OldEntry); Entry->getTerminator()->eraseFromParent(); // Clear all predecessors of the new entry block. There should be @@ -581,8 +582,14 @@ void CoroCloner::replaceEntryBlock() { Builder.CreateUnreachable(); BranchToEntry->eraseFromParent(); - // TODO: move any allocas into Entry that weren't moved into the frame. - // (Currently we move all allocas into the frame.) + // Move any allocas into Entry that weren't moved into the frame. + for (auto IT = OldEntry->begin(), End = OldEntry->end(); IT != End;) { + Instruction &I = *IT++; + if (!isa(&I) || I.getNumUses() == 0) + continue; + + I.moveBefore(*Entry, Entry->getFirstInsertionPt()); + } // Branch from the entry to the appropriate place. Builder.SetInsertPoint(Entry); @@ -1157,7 +1164,10 @@ static void simplifySuspendPoints(coro::Shape &Shape) { if (N == 0) return; while (true) { - if (simplifySuspendPoint(cast(S[I]), Shape.CoroBegin)) { + auto SI = cast(S[I]); + // Leave final.suspend to handleFinalSuspend since it is undefined behavior + // to resume a coroutine suspended at the final suspend point. 
+ if (!SI->isFinal() && simplifySuspendPoint(SI, Shape.CoroBegin)) { if (--N == I) break; std::swap(S[I], S[N]); diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index a3ee90acc1665..053255ae46023 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -398,8 +398,8 @@ static Value *constructPointer(Type *ResTy, Value *Ptr, int64_t Offset, template static bool genericValueTraversal( Attributor &A, IRPosition IRP, const AAType &QueryingAA, StateTy &State, - const function_ref &VisitValueCB, - int MaxValues = 8, const function_ref StripCB = nullptr) { + function_ref VisitValueCB, + int MaxValues = 8, function_ref StripCB = nullptr) { const AAIsDead *LivenessAA = nullptr; if (IRP.getAnchorScope()) @@ -558,7 +558,7 @@ ChangeStatus AbstractAttribute::update(Attributor &A) { ChangeStatus IRAttributeManifest::manifestAttrs(Attributor &A, const IRPosition &IRP, const ArrayRef &DeducedAttrs) { - Function *ScopeFn = IRP.getAssociatedFunction(); + Function *ScopeFn = IRP.getAnchorScope(); IRPosition::Kind PK = IRP.getPositionKind(); // In the following some generic code that will manifest attributes in @@ -627,8 +627,7 @@ SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) { return; case IRPosition::IRP_ARGUMENT: case IRPosition::IRP_RETURNED: - IRPositions.emplace_back( - IRPosition::function(*IRP.getAssociatedFunction())); + IRPositions.emplace_back(IRPosition::function(*IRP.getAnchorScope())); return; case IRPosition::IRP_CALL_SITE: assert(ICS && "Expected call site!"); @@ -678,7 +677,7 @@ SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) { } bool IRPosition::hasAttr(ArrayRef AKs, - bool IgnoreSubsumingPositions) const { + bool IgnoreSubsumingPositions, Attributor *A) const { SmallVector Attrs; for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) { for (Attribute::AttrKind AK : AKs) @@ -690,12 +689,16 @@ bool IRPosition::hasAttr(ArrayRef AKs, if (IgnoreSubsumingPositions) break; } + if (A) + for (Attribute::AttrKind AK : AKs) + if (getAttrsFromAssumes(AK, Attrs, *A)) + return true; return false; } void IRPosition::getAttrs(ArrayRef AKs, SmallVectorImpl &Attrs, - bool IgnoreSubsumingPositions) const { + bool IgnoreSubsumingPositions, Attributor *A) const { for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) { for (Attribute::AttrKind AK : AKs) EquivIRP.getAttrsFromIRAttr(AK, Attrs); @@ -705,6 +708,9 @@ void IRPosition::getAttrs(ArrayRef AKs, if (IgnoreSubsumingPositions) break; } + if (A) + for (Attribute::AttrKind AK : AKs) + getAttrsFromAssumes(AK, Attrs, *A); } bool IRPosition::getAttrsFromIRAttr(Attribute::AttrKind AK, @@ -724,6 +730,30 @@ bool IRPosition::getAttrsFromIRAttr(Attribute::AttrKind AK, return HasAttr; } +bool IRPosition::getAttrsFromAssumes(Attribute::AttrKind AK, + SmallVectorImpl &Attrs, + Attributor &A) const { + assert(getPositionKind() != IRP_INVALID && "Did expect a valid position!"); + Value &AssociatedValue = getAssociatedValue(); + + const Assume2KnowledgeMap &A2K = + A.getInfoCache().getKnowledgeMap().lookup({&AssociatedValue, AK}); + + // Check if we found any potential assume use, if not we don't need to create + // explorer iterators. 
+ if (A2K.empty()) + return false; + + LLVMContext &Ctx = AssociatedValue.getContext(); + unsigned AttrsSize = Attrs.size(); + MustBeExecutedContextExplorer &Explorer = + A.getInfoCache().getMustBeExecutedContextExplorer(); + auto EIt = Explorer.begin(getCtxI()), EEnd = Explorer.end(getCtxI()); + for (auto &It : A2K) + if (Explorer.findInContextOf(It.first, EIt, EEnd)) + Attrs.push_back(Attribute::get(Ctx, AK, It.second.Max)); + return AttrsSize != Attrs.size(); +} void IRPosition::verify() { switch (KindOrArgNo) { @@ -1273,8 +1303,8 @@ class AAReturnedValuesImpl : public AAReturnedValues, public AbstractState { /// See AbstractState::checkForAllReturnedValues(...). bool checkForAllReturnedValuesAndReturnInsts( - const function_ref &)> - &Pred) const override; + function_ref &)> Pred) + const override; /// Pretty print the attribute similar to the IR representation. const std::string getAsStr() const override; @@ -1318,7 +1348,7 @@ ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) { // Callback to replace the uses of CB with the constant C. auto ReplaceCallSiteUsersWith = [&A](CallBase &CB, Constant &C) { - if (CB.getNumUses() == 0 || CB.isMustTailCall()) + if (CB.getNumUses() == 0) return ChangeStatus::UNCHANGED; if (A.changeValueAfterManifest(CB, C)) return ChangeStatus::CHANGED; @@ -1400,8 +1430,8 @@ AAReturnedValuesImpl::getAssumedUniqueReturnValue(Attributor &A) const { } bool AAReturnedValuesImpl::checkForAllReturnedValuesAndReturnInsts( - const function_ref &)> - &Pred) const { + function_ref &)> Pred) + const { if (!isValidState()) return false; @@ -2059,7 +2089,8 @@ struct AANonNullImpl : AANonNull { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { if (!NullIsDefined && - hasAttr({Attribute::NonNull, Attribute::Dereferenceable})) + hasAttr({Attribute::NonNull, Attribute::Dereferenceable}, + /* IgnoreSubsumingPositions */ false, &A)) indicateOptimisticFixpoint(); else if (isa(getAssociatedValue())) indicatePessimisticFixpoint(); @@ -2529,7 +2560,7 @@ struct AAWillReturnImpl : public AAWillReturn { void initialize(Attributor &A) override { AAWillReturn::initialize(A); - Function *F = getAssociatedFunction(); + Function *F = getAnchorScope(); if (!F || !A.isFunctionIPOAmendable(*F) || mayContainUnboundedCycle(*F, A)) indicatePessimisticFixpoint(); } @@ -3114,7 +3145,7 @@ struct AAIsDeadArgument : public AAIsDeadFloating { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - if (!A.isFunctionIPOAmendable(*getAssociatedFunction())) + if (!A.isFunctionIPOAmendable(*getAnchorScope())) indicatePessimisticFixpoint(); } @@ -3208,14 +3239,6 @@ struct AAIsDeadCallSiteReturned : public AAIsDeadFloating { return Changed; } - /// See AbstractAttribute::manifest(...). 
- ChangeStatus manifest(Attributor &A) override { - if (auto *CI = dyn_cast(&getAssociatedValue())) - if (CI->isMustTailCall()) - return ChangeStatus::UNCHANGED; - return AAIsDeadFloating::manifest(A); - } - /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { if (IsAssumedSideEffectFree) @@ -3265,9 +3288,6 @@ struct AAIsDeadReturned : public AAIsDeadValueImpl { UndefValue &UV = *UndefValue::get(getAssociatedFunction()->getReturnType()); auto RetInstPred = [&](Instruction &I) { ReturnInst &RI = cast(I); - if (auto *CI = dyn_cast(RI.getReturnValue())) - if (CI->isMustTailCall()) - return true; if (!isa(RI.getReturnValue())) AnyChange |= A.changeUseAfterManifest(RI.getOperandUse(0), UV); return true; @@ -3285,7 +3305,7 @@ struct AAIsDeadFunction : public AAIsDead { /// See AbstractAttribute::initialize(...). void initialize(Attributor &A) override { - const Function *F = getAssociatedFunction(); + const Function *F = getAnchorScope(); if (F && !F->isDeclaration()) { ToBeExploredFrom.insert(&F->getEntryBlock().front()); assumeLive(A, F->getEntryBlock()); @@ -3295,7 +3315,7 @@ struct AAIsDeadFunction : public AAIsDead { /// See AbstractAttribute::getAsStr(). const std::string getAsStr() const override { return "Live[#BB " + std::to_string(AssumedLiveBlocks.size()) + "/" + - std::to_string(getAssociatedFunction()->size()) + "][#TBEP " + + std::to_string(getAnchorScope()->size()) + "][#TBEP " + std::to_string(ToBeExploredFrom.size()) + "][#KDE " + std::to_string(KnownDeadEnds.size()) + "]"; } @@ -3306,7 +3326,7 @@ struct AAIsDeadFunction : public AAIsDead { "Attempted to manifest an invalid state!"); ChangeStatus HasChanged = ChangeStatus::UNCHANGED; - Function &F = *getAssociatedFunction(); + Function &F = *getAnchorScope(); if (AssumedLiveBlocks.empty()) { A.deleteAfterManifest(F); @@ -3358,7 +3378,7 @@ struct AAIsDeadFunction : public AAIsDead { /// See AAIsDead::isAssumedDead(BasicBlock *). bool isAssumedDead(const BasicBlock *BB) const override { - assert(BB->getParent() == getAssociatedFunction() && + assert(BB->getParent() == getAnchorScope() && "BB must be in the same anchor scope function."); if (!getAssumed()) @@ -3373,7 +3393,7 @@ struct AAIsDeadFunction : public AAIsDead { /// See AAIsDead::isAssumed(Instruction *I). bool isAssumedDead(const Instruction *I) const override { - assert(I->getParent()->getParent() == getAssociatedFunction() && + assert(I->getParent()->getParent() == getAnchorScope() && "Instruction must be in the same anchor scope function."); if (!getAssumed()) @@ -3527,7 +3547,7 @@ ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) { ChangeStatus Change = ChangeStatus::UNCHANGED; LLVM_DEBUG(dbgs() << "[AAIsDead] Live [" << AssumedLiveBlocks.size() << "/" - << getAssociatedFunction()->size() << "] BBs and " + << getAnchorScope()->size() << "] BBs and " << ToBeExploredFrom.size() << " exploration points and " << KnownDeadEnds.size() << " known dead ends\n"); @@ -3607,7 +3627,7 @@ ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) { // discovered any non-trivial dead end and (2) not ruled unreachable code // dead. 
if (ToBeExploredFrom.empty() && - getAssociatedFunction()->size() == AssumedLiveBlocks.size() && + getAnchorScope()->size() == AssumedLiveBlocks.size() && llvm::all_of(KnownDeadEnds, [](const Instruction *DeadEndI) { return DeadEndI->isTerminator() && DeadEndI->getNumSuccessors() == 0; })) @@ -3656,7 +3676,7 @@ struct AADereferenceableImpl : AADereferenceable { void initialize(Attributor &A) override { SmallVector Attrs; getAttrs({Attribute::Dereferenceable, Attribute::DereferenceableOrNull}, - Attrs); + Attrs, /* IgnoreSubsumingPositions */ false, &A); for (const Attribute &Attr : Attrs) takeKnownDerefBytesMaximum(Attr.getValueAsInt()); @@ -3947,7 +3967,7 @@ struct AAAlignImpl : AAAlign { takeKnownMaximum(Attr.getValueAsInt()); if (getIRPosition().isFnInterfaceKind() && - (!getAssociatedFunction() || + (!getAnchorScope() || !A.isFunctionIPOAmendable(*getAssociatedFunction()))) indicatePessimisticFixpoint(); } @@ -4407,8 +4427,9 @@ struct AACaptureUseTracker final : public CaptureTracker { /// See CaptureTracker::shouldExplore(...). bool shouldExplore(const Use *U) override { - // Check liveness. - return !A.isAssumedDead(*U, &NoCaptureAA, &IsDeadAA); + // Check liveness and ignore droppable users. + return !U->getUser()->isDroppable() && + !A.isAssumedDead(*U, &NoCaptureAA, &IsDeadAA); } /// Update the state according to \p CapturedInMem, \p CapturedInInt, and @@ -4748,7 +4769,7 @@ struct AAValueSimplifyArgument final : AAValueSimplifyImpl { void initialize(Attributor &A) override { AAValueSimplifyImpl::initialize(A); - if (!getAssociatedFunction() || getAssociatedFunction()->isDeclaration()) + if (!getAnchorScope() || getAnchorScope()->isDeclaration()) indicatePessimisticFixpoint(); if (hasAttr({Attribute::InAlloca, Attribute::StructRet, Attribute::Nest}, /* IgnoreSubsumingPositions */ true)) @@ -4856,9 +4877,6 @@ struct AAValueSimplifyReturned : AAValueSimplifyImpl { // We can replace the AssociatedValue with the constant. if (&V == C || V.getType() != C->getType() || isa(V)) return true; - if (auto *CI = dyn_cast(&V)) - if (CI->isMustTailCall()) - return true; for (ReturnInst *RI : RetInsts) { if (RI->getFunction() != getAnchorScope()) @@ -4966,9 +4984,6 @@ struct AAValueSimplifyCallSiteReturned : AAValueSimplifyReturned { /// See AbstractAttribute::manifest(...). 
ChangeStatus manifest(Attributor &A) override { - if (auto *CI = dyn_cast(&getAssociatedValue())) - if (CI->isMustTailCall()) - return ChangeStatus::UNCHANGED; return AAValueSimplifyImpl::manifest(A); } @@ -4998,7 +5013,7 @@ struct AAHeapToStackImpl : public AAHeapToStack { "Attempted to manifest an invalid state!"); ChangeStatus HasChanged = ChangeStatus::UNCHANGED; - Function *F = getAssociatedFunction(); + Function *F = getAnchorScope(); const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); for (Instruction *MallocCall : MallocCalls) { @@ -5075,7 +5090,7 @@ struct AAHeapToStackImpl : public AAHeapToStack { }; ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) { - const Function *F = getAssociatedFunction(); + const Function *F = getAnchorScope(); const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F); MustBeExecutedContextExplorer &Explorer = @@ -5633,7 +5648,7 @@ struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl { createReplacementValues( PrivatizableType.getValue(), ACS, ACS.getCallArgOperand(ARI.getReplacedArg().getArgNo()), - NewArgOperands); + NewArgOperands); }; // Collect the types that will replace the privatizable type in the function @@ -6176,6 +6191,10 @@ ChangeStatus AAMemoryBehaviorFloating::updateImpl(Attributor &A) { if (A.isAssumedDead(*U, this, &LivenessAA)) continue; + // Droppable users, e.g., llvm::assume does not actually perform any action. + if (UserI->isDroppable()) + continue; + // Check if the users of UserI should also be visited. if (followUsersOfUseIn(A, U, UserI)) for (const Use &UserIUse : UserI->uses()) @@ -6402,8 +6421,9 @@ struct AAMemoryLocationImpl : public AAMemoryLocation { /// See AAMemoryLocation::checkForAllAccessesToMemoryKind(...). bool checkForAllAccessesToMemoryKind( - const function_ref &Pred, + function_ref + Pred, MemoryLocationsKind RequestedMLK) const override { if (!isValidState()) return false; @@ -7152,7 +7172,7 @@ struct AAValueConstantRangeFloating : AAValueConstantRangeImpl { auto VisitValueCB = [&](Value &V, IntegerRangeState &T, bool Stripped) -> bool { Instruction *I = dyn_cast(&V); - if (!I) { + if (!I || isa(I)) { // If the value is not instruction, we query AA to Attributor. const auto &AA = @@ -7384,10 +7404,9 @@ bool Attributor::isAssumedDead(const IRPosition &IRP, return false; } -bool Attributor::checkForAllUses( - const function_ref &Pred, - const AbstractAttribute &QueryingAA, const Value &V, - DepClassTy LivenessDepClass) { +bool Attributor::checkForAllUses(function_ref Pred, + const AbstractAttribute &QueryingAA, + const Value &V, DepClassTy LivenessDepClass) { // Check the trivial case first as it catches void values. if (V.use_empty()) @@ -7433,6 +7452,10 @@ bool Attributor::checkForAllUses( LLVM_DEBUG(dbgs() << "[Attributor] Dead use, skip!\n"); continue; } + if (U->getUser()->isDroppable()) { + LLVM_DEBUG(dbgs() << "[Attributor] Droppable user, skip!\n"); + continue; + } bool Follow = false; if (!Pred(*U, Follow)) @@ -7446,10 +7469,10 @@ bool Attributor::checkForAllUses( return true; } -bool Attributor::checkForAllCallSites( - const function_ref &Pred, - const AbstractAttribute &QueryingAA, bool RequireAllCallSites, - bool &AllCallSitesKnown) { +bool Attributor::checkForAllCallSites(function_ref Pred, + const AbstractAttribute &QueryingAA, + bool RequireAllCallSites, + bool &AllCallSitesKnown) { // We can try to determine information from // the call sites. 
However, this is only possible all call sites are known, // hence the function has internal linkage. @@ -7466,10 +7489,11 @@ bool Attributor::checkForAllCallSites( &QueryingAA, AllCallSitesKnown); } -bool Attributor::checkForAllCallSites( - const function_ref &Pred, const Function &Fn, - bool RequireAllCallSites, const AbstractAttribute *QueryingAA, - bool &AllCallSitesKnown) { +bool Attributor::checkForAllCallSites(function_ref Pred, + const Function &Fn, + bool RequireAllCallSites, + const AbstractAttribute *QueryingAA, + bool &AllCallSitesKnown) { if (RequireAllCallSites && !Fn.hasLocalLinkage()) { LLVM_DEBUG( dbgs() @@ -7551,8 +7575,7 @@ bool Attributor::checkForAllCallSites( } bool Attributor::checkForAllReturnedValuesAndReturnInsts( - const function_ref &)> - &Pred, + function_ref &)> Pred, const AbstractAttribute &QueryingAA) { const IRPosition &IRP = QueryingAA.getIRPosition(); @@ -7574,8 +7597,7 @@ bool Attributor::checkForAllReturnedValuesAndReturnInsts( } bool Attributor::checkForAllReturnedValues( - const function_ref &Pred, - const AbstractAttribute &QueryingAA) { + function_ref Pred, const AbstractAttribute &QueryingAA) { const IRPosition &IRP = QueryingAA.getIRPosition(); const Function *AssociatedFunction = IRP.getAssociatedFunction(); @@ -7596,9 +7618,9 @@ bool Attributor::checkForAllReturnedValues( static bool checkForAllInstructionsImpl( Attributor *A, InformationCache::OpcodeInstMapTy &OpcodeInstMap, - const function_ref &Pred, - const AbstractAttribute *QueryingAA, const AAIsDead *LivenessAA, - const ArrayRef &Opcodes, bool CheckBBLivenessOnly = false) { + function_ref Pred, const AbstractAttribute *QueryingAA, + const AAIsDead *LivenessAA, const ArrayRef &Opcodes, + bool CheckBBLivenessOnly = false) { for (unsigned Opcode : Opcodes) { for (Instruction *I : OpcodeInstMap[Opcode]) { // Skip dead instructions. @@ -7613,10 +7635,10 @@ static bool checkForAllInstructionsImpl( return true; } -bool Attributor::checkForAllInstructions( - const llvm::function_ref &Pred, - const AbstractAttribute &QueryingAA, const ArrayRef &Opcodes, - bool CheckBBLivenessOnly) { +bool Attributor::checkForAllInstructions(function_ref Pred, + const AbstractAttribute &QueryingAA, + const ArrayRef &Opcodes, + bool CheckBBLivenessOnly) { const IRPosition &IRP = QueryingAA.getIRPosition(); // Since we need to provide instructions we have to have an exact definition. @@ -7639,8 +7661,7 @@ bool Attributor::checkForAllInstructions( } bool Attributor::checkForAllReadWriteInstructions( - const llvm::function_ref &Pred, - AbstractAttribute &QueryingAA) { + function_ref Pred, AbstractAttribute &QueryingAA) { const Function *AssociatedFunction = QueryingAA.getIRPosition().getAssociatedFunction(); @@ -7675,7 +7696,7 @@ ChangeStatus Attributor::run() { unsigned IterationCounter = 1; - SmallVector ChangedAAs; + SmallVector ChangedAAs; SetVector Worklist, InvalidAAs; Worklist.insert(AllAbstractAttributes.begin(), AllAbstractAttributes.end()); @@ -7880,6 +7901,14 @@ ChangeStatus Attributor::run() { Use *U = It.first; Value *NewV = It.second; Value *OldV = U->get(); + + // Do not replace uses in returns if the value is a must-tail call we will + // not delete. 
+ if (isa(U->getUser())) + if (auto *CI = dyn_cast(OldV->stripPointerCasts())) + if (CI->isMustTailCall() && !ToBeDeletedInsts.count(CI)) + continue; + LLVM_DEBUG(dbgs() << "Use " << *NewV << " in " << *U->getUser() << " instead of " << *OldV << "\n"); U->set(NewV); @@ -8318,14 +8347,21 @@ void Attributor::initializeInformationCache(Function &F) { switch (I.getOpcode()) { default: assert((!ImmutableCallSite(&I)) && (!isa(&I)) && - "New call site/base instruction type needs to be known int the " + "New call site/base instruction type needs to be known in the " "Attributor."); break; + case Instruction::Call: + // Calls are interesting but for `llvm.assume` calls we also fill the + // KnowledgeMap as we find them. + if (IntrinsicInst *Assume = dyn_cast(&I)) { + if (Assume->getIntrinsicID() == Intrinsic::assume) + fillMapFromAssume(*Assume, InfoCache.KnowledgeMap); + } + LLVM_FALLTHROUGH; case Instruction::Load: // The alignment of a pointer is interesting for loads. case Instruction::Store: // The alignment of a pointer is interesting for stores. - case Instruction::Call: case Instruction::CallBr: case Instruction::Invoke: case Instruction::CleanupRet: @@ -8647,6 +8683,10 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache, // while we identify default attribute opportunities. Attributor A(Functions, InfoCache, CGUpdater, DepRecInterval); + // Note: _Don't_ combine/fuse this loop with the one below because + // when A.identifyDefaultAbstractAttributes() is called for one + // function, it assumes that the information cach has been + // initialized for _all_ functions. for (Function *F : Functions) A.initializeInformationCache(*F); diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 3b234ca0be7d3..0ab6333cb3acf 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -659,9 +659,6 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V, // checked. if (PHIs.insert(PN).second && !AllUsesOfValueWillTrapIfNull(PN, PHIs)) return false; - } else if (isa(U) && - isa(U->getOperand(1))) { - // Ignore icmp X, null } else { //cerr << "NONTRAPPING USE: " << *U; return false; diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp index a766513ded039..8b45350ce44ce 100644 --- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp +++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp @@ -79,7 +79,7 @@ struct OpenMPOpt { SmallVector ArgumentTypes; /// The declaration if available. - Function *Declaration; + Function *Declaration = nullptr; /// Uses of this runtime function per function containing the use. DenseMap> UsesMap; @@ -119,7 +119,7 @@ struct OpenMPOpt { } private: - /// Try to delete parallel regions if possible + /// Try to delete parallel regions if possible. bool deleteParallelRegions() { const unsigned CallbackCalleeOperand = 2; @@ -385,6 +385,32 @@ struct OpenMPOpt { return nullptr; } + /// Returns true if the function declaration \p F matches the runtime + /// function types, that is, return type \p RTFRetType, and argument types + /// \p RTFArgTypes. + static bool declMatchesRTFTypes(Function *F, Type *RTFRetType, + SmallVector &RTFArgTypes) { + // TODO: We should output information to the user (under debug output + // and via remarks). 
+ + if (!F) + return false; + if (F->getReturnType() != RTFRetType) + return false; + if (F->arg_size() != RTFArgTypes.size()) + return false; + + auto RTFTyIt = RTFArgTypes.begin(); + for (Argument &Arg : F->args()) { + if (Arg.getType() != *RTFTyIt) + return false; + + ++RTFTyIt; + } + + return true; + } + /// Helper to initialize all runtime function information for those defined in /// OpenMPKinds.def. void initializeRuntimeFunctions() { @@ -415,26 +441,29 @@ struct OpenMPOpt { #define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \ { \ - auto &RFI = RFIs[_Enum]; \ - RFI.Kind = _Enum; \ - RFI.Name = _Name; \ - RFI.IsVarArg = _IsVarArg; \ - RFI.ReturnType = _ReturnType; \ - RFI.ArgumentTypes = SmallVector({__VA_ARGS__}); \ - RFI.Declaration = M.getFunction(_Name); \ - unsigned NumUses = CollectUses(RFI); \ - (void)NumUses; \ - LLVM_DEBUG({ \ - dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \ - << " found\n"; \ - if (RFI.Declaration) \ - dbgs() << TAG << "-> got " << NumUses << " uses in " \ - << RFI.UsesMap.size() << " different functions.\n"; \ - }); \ + SmallVector ArgsTypes({__VA_ARGS__}); \ + Function *F = M.getFunction(_Name); \ + if (declMatchesRTFTypes(F, _ReturnType , ArgsTypes)) { \ + auto &RFI = RFIs[_Enum]; \ + RFI.Kind = _Enum; \ + RFI.Name = _Name; \ + RFI.IsVarArg = _IsVarArg; \ + RFI.ReturnType = _ReturnType; \ + RFI.ArgumentTypes = std::move(ArgsTypes); \ + RFI.Declaration = F; \ + unsigned NumUses = CollectUses(RFI); \ + (void)NumUses; \ + LLVM_DEBUG({ \ + dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \ + << " found\n"; \ + if (RFI.Declaration) \ + dbgs() << TAG << "-> got " << NumUses << " uses in " \ + << RFI.UsesMap.size() << " different functions.\n"; \ + }); \ + } \ } #include "llvm/Frontend/OpenMP/OMPKinds.def" - // TODO: We should validate the declaration agains the types we expect. // TODO: We should attach the attributes defined in OMPKinds.def. } diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index e332f0d83e1db..e61d8b1ce6ee5 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -270,12 +270,6 @@ void PassManagerBuilder::addInitialAliasAnalysisPasses( PM.add(createScopedNoAliasAAWrapperPass()); } -void PassManagerBuilder::addInstructionCombiningPass( - legacy::PassManagerBase &PM) const { - bool ExpensiveCombines = OptLevel > 2; - PM.add(createInstructionCombiningPass(ExpensiveCombines)); -} - void PassManagerBuilder::populateFunctionPassManager( legacy::FunctionPassManager &FPM) { addExtensionsToPM(EP_EarlyAsPossible, FPM); @@ -374,7 +368,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses( // Combine silly seq's if (OptLevel > 2) MPM.add(createAggressiveInstCombinerPass()); - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); if (SizeLevel == 0 && !DisableLibCallsShrinkWrap) MPM.add(createLibCallsShrinkWrapPass()); addExtensionsToPM(EP_Peephole, MPM); @@ -409,7 +403,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses( // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the // need for this. MPM.add(createCFGSimplificationPass()); - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); // We resume loop passes creating a second loop pipeline here. MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars MPM.add(createLoopIdiomPass()); // Recognize idioms like memset. 
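For illustration only (not part of the patch): a self-contained variant of the prototype check that the OMP_RTL registration above now applies, so a module-level declaration whose return or argument types disagree with OpenMPKinds.def is skipped rather than registered. Only standard llvm::Function/ArrayRef APIs are assumed; the function name is invented.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Function.h"
using namespace llvm;

// Returns true only when the declaration exists and its prototype matches
// the expected runtime-function signature element by element.
static bool prototypeMatches(const Function *F, Type *RetTy,
                             ArrayRef<Type *> ParamTys) {
  if (!F || F->getReturnType() != RetTy || F->arg_size() != ParamTys.size())
    return false;
  unsigned Idx = 0;
  for (const Argument &Arg : F->args())
    if (Arg.getType() != ParamTys[Idx++])
      return false; // a user-defined symbol with a different prototype
  return true;
}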
@@ -440,7 +434,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses( // Run instcombine after redundancy elimination to exploit opportunities // opened up by them. - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, MPM); if (OptLevel > 1) { MPM.add(createJumpThreadingPass()); // Thread jumps @@ -458,7 +452,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs // Clean up after everything. - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, MPM); if (EnableCHR && OptLevel >= 3 && @@ -569,7 +563,7 @@ void PassManagerBuilder::populateModulePassManager( MPM.add(createDeadArgEliminationPass()); // Dead argument elimination - addInstructionCombiningPass(MPM); // Clean up after IPCP & DAE + MPM.add(createInstructionCombiningPass()); // Clean up after IPCP & DAE addExtensionsToPM(EP_Peephole, MPM); MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE @@ -741,7 +735,7 @@ void PassManagerBuilder::populateModulePassManager( // on -O1 and no #pragma is found). Would be good to have these two passes // as function calls, so that we can only pass them when the vectorizer // changed the code. - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); if (OptLevel > 1 && ExtraVectorizerPasses) { // At higher optimization levels, try to clean up any runtime overlap and // alignment checks inserted by the vectorizer. We want to track correllated @@ -750,11 +744,11 @@ void PassManagerBuilder::populateModulePassManager( // and unswitch the runtime checks if possible. Once hoisted, we may have // dead (or speculatable) control flows or more combining opportunities. MPM.add(createCorrelatedValuePropagationPass()); - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); MPM.add(createCFGSimplificationPass()); - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); } // Cleanup after loop vectorization, etc. Simplification passes like CVP and @@ -772,7 +766,7 @@ void PassManagerBuilder::populateModulePassManager( } addExtensionsToPM(EP_Peephole, MPM); - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); if (EnableUnrollAndJam && !DisableUnrollLoops) { // Unroll and Jam. We do this before unroll but need to be in a separate @@ -787,7 +781,7 @@ void PassManagerBuilder::populateModulePassManager( if (!DisableUnrollLoops) { // LoopUnroll may generate some redundency to cleanup. - addInstructionCombiningPass(MPM); + MPM.add(createInstructionCombiningPass()); // Runtime unrolling will introduce runtime check in loop prologue. If the // unrolled loop is a inner loop, then the prologue will be inside the @@ -925,7 +919,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // calls, etc, so let instcombine do this. 
if (OptLevel > 2) PM.add(createAggressiveInstCombinerPass()); - addInstructionCombiningPass(PM); + PM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, PM); // Inline small functions @@ -958,7 +952,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createArgumentPromotionPass()); // The IPO passes may leave cruft around. Clean up after them. - addInstructionCombiningPass(PM); + PM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass()); @@ -1004,10 +998,10 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // we may have exposed more scalar opportunities. Run parts of the scalar // optimizer again at this point. PM.add(createVectorCombinePass()); - addInstructionCombiningPass(PM); // Initial cleanup + PM.add(createInstructionCombiningPass()); // Initial cleanup PM.add(createCFGSimplificationPass()); // if-convert PM.add(createSCCPPass()); // Propagate exposed constants - addInstructionCombiningPass(PM); // Clean up again + PM.add(createInstructionCombiningPass()); // Clean up again PM.add(createBitTrackingDCEPass()); // More scalar chains could be vectorized due to more alias information @@ -1021,7 +1015,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { PM.add(createAlignmentFromAssumptionsPass()); // Cleanup and simplify the code after the scalar optimizations. - addInstructionCombiningPass(PM); + PM.add(createInstructionCombiningPass()); addExtensionsToPM(EP_Peephole, PM); PM.add(createJumpThreadingPass()); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 2138017606b79..b28da94649334 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -296,106 +296,154 @@ static Value *simplifyX86immShift(const IntrinsicInst &II, InstCombiner::BuilderTy &Builder) { bool LogicalShift = false; bool ShiftLeft = false; + bool IsImm = false; switch (II.getIntrinsicID()) { default: llvm_unreachable("Unexpected intrinsic!"); - case Intrinsic::x86_sse2_psra_d: - case Intrinsic::x86_sse2_psra_w: case Intrinsic::x86_sse2_psrai_d: case Intrinsic::x86_sse2_psrai_w: - case Intrinsic::x86_avx2_psra_d: - case Intrinsic::x86_avx2_psra_w: case Intrinsic::x86_avx2_psrai_d: case Intrinsic::x86_avx2_psrai_w: - case Intrinsic::x86_avx512_psra_q_128: case Intrinsic::x86_avx512_psrai_q_128: - case Intrinsic::x86_avx512_psra_q_256: case Intrinsic::x86_avx512_psrai_q_256: - case Intrinsic::x86_avx512_psra_d_512: - case Intrinsic::x86_avx512_psra_q_512: - case Intrinsic::x86_avx512_psra_w_512: case Intrinsic::x86_avx512_psrai_d_512: case Intrinsic::x86_avx512_psrai_q_512: case Intrinsic::x86_avx512_psrai_w_512: - LogicalShift = false; ShiftLeft = false; + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psra_d: + case Intrinsic::x86_sse2_psra_w: + case Intrinsic::x86_avx2_psra_d: + case Intrinsic::x86_avx2_psra_w: + case Intrinsic::x86_avx512_psra_q_128: + case Intrinsic::x86_avx512_psra_q_256: + case Intrinsic::x86_avx512_psra_d_512: + case Intrinsic::x86_avx512_psra_q_512: + case Intrinsic::x86_avx512_psra_w_512: + LogicalShift = false; + ShiftLeft = false; break; - case Intrinsic::x86_sse2_psrl_d: - case Intrinsic::x86_sse2_psrl_q: - case Intrinsic::x86_sse2_psrl_w: case Intrinsic::x86_sse2_psrli_d: case Intrinsic::x86_sse2_psrli_q: case Intrinsic::x86_sse2_psrli_w: - case 
Intrinsic::x86_avx2_psrl_d: - case Intrinsic::x86_avx2_psrl_q: - case Intrinsic::x86_avx2_psrl_w: case Intrinsic::x86_avx2_psrli_d: case Intrinsic::x86_avx2_psrli_q: case Intrinsic::x86_avx2_psrli_w: - case Intrinsic::x86_avx512_psrl_d_512: - case Intrinsic::x86_avx512_psrl_q_512: - case Intrinsic::x86_avx512_psrl_w_512: case Intrinsic::x86_avx512_psrli_d_512: case Intrinsic::x86_avx512_psrli_q_512: case Intrinsic::x86_avx512_psrli_w_512: - LogicalShift = true; ShiftLeft = false; + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psrl_d: + case Intrinsic::x86_sse2_psrl_q: + case Intrinsic::x86_sse2_psrl_w: + case Intrinsic::x86_avx2_psrl_d: + case Intrinsic::x86_avx2_psrl_q: + case Intrinsic::x86_avx2_psrl_w: + case Intrinsic::x86_avx512_psrl_d_512: + case Intrinsic::x86_avx512_psrl_q_512: + case Intrinsic::x86_avx512_psrl_w_512: + LogicalShift = true; + ShiftLeft = false; break; - case Intrinsic::x86_sse2_psll_d: - case Intrinsic::x86_sse2_psll_q: - case Intrinsic::x86_sse2_psll_w: case Intrinsic::x86_sse2_pslli_d: case Intrinsic::x86_sse2_pslli_q: case Intrinsic::x86_sse2_pslli_w: - case Intrinsic::x86_avx2_psll_d: - case Intrinsic::x86_avx2_psll_q: - case Intrinsic::x86_avx2_psll_w: case Intrinsic::x86_avx2_pslli_d: case Intrinsic::x86_avx2_pslli_q: case Intrinsic::x86_avx2_pslli_w: - case Intrinsic::x86_avx512_psll_d_512: - case Intrinsic::x86_avx512_psll_q_512: - case Intrinsic::x86_avx512_psll_w_512: case Intrinsic::x86_avx512_pslli_d_512: case Intrinsic::x86_avx512_pslli_q_512: case Intrinsic::x86_avx512_pslli_w_512: - LogicalShift = true; ShiftLeft = true; + IsImm = true; + LLVM_FALLTHROUGH; + case Intrinsic::x86_sse2_psll_d: + case Intrinsic::x86_sse2_psll_q: + case Intrinsic::x86_sse2_psll_w: + case Intrinsic::x86_avx2_psll_d: + case Intrinsic::x86_avx2_psll_q: + case Intrinsic::x86_avx2_psll_w: + case Intrinsic::x86_avx512_psll_d_512: + case Intrinsic::x86_avx512_psll_q_512: + case Intrinsic::x86_avx512_psll_w_512: + LogicalShift = true; + ShiftLeft = true; break; } assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left"); - // Simplify if count is constant. - auto Arg1 = II.getArgOperand(1); - auto CAZ = dyn_cast(Arg1); - auto CDV = dyn_cast(Arg1); - auto CInt = dyn_cast(Arg1); - if (!CAZ && !CDV && !CInt) - return nullptr; - - APInt Count(64, 0); - if (CDV) { - // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector - // operand to compute the shift amount. - auto VT = cast(CDV->getType()); - unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits(); - assert((64 % BitWidth) == 0 && "Unexpected packed shift size"); - unsigned NumSubElts = 64 / BitWidth; - - // Concatenate the sub-elements to create the 64-bit value. - for (unsigned i = 0; i != NumSubElts; ++i) { - unsigned SubEltIdx = (NumSubElts - 1) - i; - auto SubElt = cast(CDV->getElementAsConstant(SubEltIdx)); - Count <<= BitWidth; - Count |= SubElt->getValue().zextOrTrunc(64); - } - } - else if (CInt) - Count = CInt->getValue(); - auto Vec = II.getArgOperand(0); + auto Amt = II.getArgOperand(1); auto VT = cast(Vec->getType()); auto SVT = VT->getElementType(); + auto AmtVT = Amt->getType(); unsigned VWidth = VT->getNumElements(); unsigned BitWidth = SVT->getPrimitiveSizeInBits(); + // If the shift amount is guaranteed to be in-range we can replace it with a + // generic shift. If its guaranteed to be out of range, logical shifts combine to + // zero and arithmetic shifts are clamped to (BitWidth - 1). 
+ if (IsImm) { + assert(AmtVT ->isIntegerTy(32) && + "Unexpected shift-by-immediate type"); + KnownBits KnownAmtBits = + llvm::computeKnownBits(Amt, II.getModule()->getDataLayout()); + if (KnownAmtBits.getMaxValue().ult(BitWidth)) { + Amt = Builder.CreateZExtOrTrunc(Amt, SVT); + Amt = Builder.CreateVectorSplat(VWidth, Amt); + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + if (KnownAmtBits.getMinValue().uge(BitWidth)) { + if (LogicalShift) + return ConstantAggregateZero::get(VT); + Amt = ConstantInt::get(SVT, BitWidth - 1); + return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt)); + } + } else { + // Ensure the first element has an in-range value and the rest of the + // elements in the bottom 64 bits are zero. + assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && + cast(AmtVT)->getElementType() == SVT && + "Unexpected shift-by-scalar type"); + unsigned NumAmtElts = cast(AmtVT)->getNumElements(); + APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0); + APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2); + KnownBits KnownLowerBits = llvm::computeKnownBits( + Amt, DemandedLower, II.getModule()->getDataLayout()); + KnownBits KnownUpperBits = llvm::computeKnownBits( + Amt, DemandedUpper, II.getModule()->getDataLayout()); + if (KnownLowerBits.getMaxValue().ult(BitWidth) && + (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) { + SmallVector ZeroSplat(VWidth, 0); + Amt = Builder.CreateShuffleVector(Amt, Amt, ZeroSplat); + return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt) + : Builder.CreateLShr(Vec, Amt)) + : Builder.CreateAShr(Vec, Amt)); + } + } + + // Simplify if count is constant vector. + auto CDV = dyn_cast(Amt); + if (!CDV) + return nullptr; + + // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector + // operand to compute the shift amount. + assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 && + cast(AmtVT)->getElementType() == SVT && + "Unexpected shift-by-scalar type"); + + // Concatenate the sub-elements to create the 64-bit value. + APInt Count(64, 0); + for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) { + unsigned SubEltIdx = (NumSubElts - 1) - i; + auto SubElt = cast(CDV->getElementAsConstant(SubEltIdx)); + Count <<= BitWidth; + Count |= SubElt->getValue().zextOrTrunc(64); + } + // If shift-by-zero then just return the original value. 
if (Count.isNullValue()) return Vec; @@ -1274,19 +1322,39 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) { assert(II.getIntrinsicID() == Intrinsic::ctpop && "Expected ctpop intrinsic"); + Type *Ty = II.getType(); + unsigned BitWidth = Ty->getScalarSizeInBits(); Value *Op0 = II.getArgOperand(0); Value *X; + // ctpop(bitreverse(x)) -> ctpop(x) // ctpop(bswap(x)) -> ctpop(x) if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X)))) return IC.replaceOperand(II, 0, X); + // ctpop(x | -x) -> bitwidth - cttz(x, false) + if (Op0->hasOneUse() && + match(Op0, m_c_Or(m_Value(X), m_Neg(m_Deferred(X))))) { + Function *F = + Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty); + auto *Cttz = IC.Builder.CreateCall(F, {X, IC.Builder.getFalse()}); + auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth)); + return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz)); + } + + // ctpop(~x & (x - 1)) -> cttz(x, false) + if (match(Op0, + m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) { + Function *F = + Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty); + return CallInst::Create(F, {X, IC.Builder.getFalse()}); + } + // FIXME: Try to simplify vectors of integers. - auto *IT = dyn_cast(Op0->getType()); + auto *IT = dyn_cast(Ty); if (!IT) return nullptr; - unsigned BitWidth = IT->getBitWidth(); KnownBits Known(BitWidth); IC.computeKnownBits(Op0, Known, 0, &II); @@ -4536,6 +4604,10 @@ Instruction *InstCombiner::visitCallBase(CallBase &Call) { if (I) return eraseInstFromFunction(*I); } + if (!Call.use_empty() && !Call.isMustTailCall()) + if (Value *ReturnedArg = Call.getReturnedArgOperand()) + return replaceInstUsesWith(Call, ReturnedArg); + if (isAllocLikeFn(&Call, &TLI)) return visitAllocSite(Call); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 38ba742bf5fa6..d6053c2617838 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -313,9 +313,6 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner // Mode in which we are running the combiner. const bool MinimizeSize; - /// Enable combines that trigger rarely but are costly in compiletime. - const bool ExpensiveCombines; - AliasAnalysis *AA; // Required analyses. @@ -336,12 +333,12 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner public: InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder, - bool MinimizeSize, bool ExpensiveCombines, AliasAnalysis *AA, + bool MinimizeSize, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI) : Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize), - ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT), + AA(AA), AC(AC), TLI(TLI), DT(DT), DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {} /// Run the combiner over the entire worklist until it is empty. 
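Not part of the patch: a quick numerical sanity check of the two ctpop identities folded above, written with the portable GCC/Clang builtins rather than the LLVM intrinsics. The C builtins are undefined at zero, so the check uses a non-zero value; the LLVM fold stays valid at zero because it passes false for the is-zero-undef flag, which defines cttz(0) as the bit width.

#include <cassert>
#include <cstdint>

int main() {
  // x = 0b1011000: three trailing zero bits, so cttz(x) == 3.
  uint32_t X = 0x58;
  // ctpop(x | -x) == BitWidth - cttz(x): x | -x sets the lowest set bit of x
  // and every bit above it.
  assert(__builtin_popcount(X | -X) == 32 - __builtin_ctz(X));
  // ctpop(~x & (x - 1)) == cttz(x): ~x & (x - 1) keeps exactly the trailing
  // zero positions of x.
  assert(__builtin_popcount(~X & (X - 1)) == __builtin_ctz(X));
  return 0;
}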
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 4db482646ab2d..b95f3f6a0ecdc 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -348,7 +348,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Move all alloca's of zero byte objects to the entry block and merge them // together. Note that we only do this for alloca's, because malloc should // allocate and return a unique pointer, even for a zero byte allocation. - if (DL.getTypeAllocSize(AI.getAllocatedType()) == 0) { + if (DL.getTypeAllocSize(AI.getAllocatedType()).getKnownMinSize() == 0) { // For a zero sized alloca there is no point in doing an array allocation. // This is helpful if the array size is a complicated expression not used // elsewhere. @@ -365,7 +365,8 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // dominance as the array size was forced to a constant earlier already. AllocaInst *EntryAI = dyn_cast(FirstInst); if (!EntryAI || !EntryAI->getAllocatedType()->isSized() || - DL.getTypeAllocSize(EntryAI->getAllocatedType()) != 0) { + DL.getTypeAllocSize(EntryAI->getAllocatedType()) + .getKnownMinSize() != 0) { AI.moveBefore(FirstInst); return &AI; } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index aa5d33ce8ac8f..e697b99d6cf82 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -553,11 +553,25 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) return I; assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + + bool SignBitZero = Known.Zero.isSignBitSet(); + bool SignBitOne = Known.One.isSignBitSet(); Known.Zero <<= ShiftAmt; Known.One <<= ShiftAmt; // low bits known zero. if (ShiftAmt) Known.Zero.setLowBits(ShiftAmt); + + // If this shift has "nsw" keyword, then the result is either a poison + // value or has the same sign bit as the first operand. 
+ if (IOp->hasNoSignedWrap()) { + if (SignBitZero) + Known.Zero.setSignBit(); + else if (SignBitOne) + Known.One.setSignBit(); + if (Known.hasConflict()) + return UndefValue::get(I->getType()); + } } else { computeKnownBits(I, Known, Depth, CxtI); } diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 9d17e92eca203..da43f510d32ea 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -130,10 +130,6 @@ static cl::opt EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"), cl::init(true)); -static cl::opt -EnableExpensiveCombines("expensive-combines", - cl::desc("Enable expensive instruction combines")); - static cl::opt LimitMaxIterations( "instcombine-max-iterations", cl::desc("Limit the maximum number of instruction combining iterations"), @@ -2760,6 +2756,12 @@ Instruction *InstCombiner::visitFree(CallInst &FI) { return nullptr; } +static bool isMustTailCall(Value *V) { + if (auto *CI = dyn_cast(V)) + return CI->isMustTailCall(); + return false; +} + Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) { if (RI.getNumOperands() == 0) // ret void return nullptr; @@ -2769,6 +2771,10 @@ Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) { if (!VTy->isIntegerTy() || isa(ResultOp)) return nullptr; + // Don't replace result of musttail calls. + if (isMustTailCall(ResultOp)) + return nullptr; + // There might be assume intrinsics dominating this return that completely // determine the value. If so, constant fold it. KnownBits Known = computeKnownBits(ResultOp, 0, &RI); @@ -3481,26 +3487,6 @@ bool InstCombiner::run() { } } - // In general, it is possible for computeKnownBits to determine all bits in - // a value even when the operands are not all constants. - Type *Ty = I->getType(); - if (ExpensiveCombines && !I->use_empty() && Ty->isIntOrIntVectorTy()) { - KnownBits Known = computeKnownBits(I, /*Depth*/0, I); - if (Known.isConstant()) { - Constant *C = ConstantInt::get(Ty, Known.getConstant()); - LLVM_DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C - << " from: " << *I << '\n'); - - // Add operands to the worklist. - replaceInstUsesWith(*I, C); - ++NumConstProp; - if (isInstructionTriviallyDead(I, &TLI)) - eraseInstFromFunction(*I); - MadeIRChange = true; - continue; - } - } - // See if we can trivially sink this instruction to a successor basic block. 
if (EnableCodeSinking && I->hasOneUse()) { BasicBlock *BB = I->getParent(); @@ -3752,11 +3738,8 @@ static bool combineInstructionsOverFunction( Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA, AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI, - ProfileSummaryInfo *PSI, bool ExpensiveCombines, unsigned MaxIterations, - LoopInfo *LI) { + ProfileSummaryInfo *PSI, unsigned MaxIterations, LoopInfo *LI) { auto &DL = F.getParent()->getDataLayout(); - if (EnableExpensiveCombines.getNumOccurrences()) - ExpensiveCombines = EnableExpensiveCombines; MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue()); /// Builder - This is an IRBuilder that automatically inserts new @@ -3798,7 +3781,7 @@ static bool combineInstructionsOverFunction( MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist); - InstCombiner IC(Worklist, Builder, F.hasMinSize(), ExpensiveCombines, AA, + InstCombiner IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, DT, ORE, BFI, PSI, DL, LI); IC.MaxArraySizeForCombine = MaxArraySize; @@ -3811,11 +3794,10 @@ static bool combineInstructionsOverFunction( return MadeIRChange; } -InstCombinePass::InstCombinePass(bool ExpensiveCombines) - : ExpensiveCombines(ExpensiveCombines), MaxIterations(LimitMaxIterations) {} +InstCombinePass::InstCombinePass() : MaxIterations(LimitMaxIterations) {} -InstCombinePass::InstCombinePass(bool ExpensiveCombines, unsigned MaxIterations) - : ExpensiveCombines(ExpensiveCombines), MaxIterations(MaxIterations) {} +InstCombinePass::InstCombinePass(unsigned MaxIterations) + : MaxIterations(MaxIterations) {} PreservedAnalyses InstCombinePass::run(Function &F, FunctionAnalysisManager &AM) { @@ -3835,8 +3817,7 @@ PreservedAnalyses InstCombinePass::run(Function &F, &AM.getResult(F) : nullptr; if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI, - PSI, ExpensiveCombines, MaxIterations, - LI)) + PSI, MaxIterations, LI)) // No changes, all analyses are preserved. 
return PreservedAnalyses::all(); @@ -3886,22 +3867,18 @@ bool InstructionCombiningPass::runOnFunction(Function &F) { nullptr; return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI, - PSI, ExpensiveCombines, MaxIterations, - LI); + PSI, MaxIterations, LI); } char InstructionCombiningPass::ID = 0; -InstructionCombiningPass::InstructionCombiningPass(bool ExpensiveCombines) - : FunctionPass(ID), ExpensiveCombines(ExpensiveCombines), - MaxIterations(InstCombineDefaultMaxIterations) { +InstructionCombiningPass::InstructionCombiningPass() + : FunctionPass(ID), MaxIterations(InstCombineDefaultMaxIterations) { initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry()); } -InstructionCombiningPass::InstructionCombiningPass(bool ExpensiveCombines, - unsigned MaxIterations) - : FunctionPass(ID), ExpensiveCombines(ExpensiveCombines), - MaxIterations(MaxIterations) { +InstructionCombiningPass::InstructionCombiningPass(unsigned MaxIterations) + : FunctionPass(ID), MaxIterations(MaxIterations) { initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry()); } @@ -3927,13 +3904,12 @@ void LLVMInitializeInstCombine(LLVMPassRegistryRef R) { initializeInstructionCombiningPassPass(*unwrap(R)); } -FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines) { - return new InstructionCombiningPass(ExpensiveCombines); +FunctionPass *llvm::createInstructionCombiningPass() { + return new InstructionCombiningPass(); } -FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines, - unsigned MaxIterations) { - return new InstructionCombiningPass(ExpensiveCombines, MaxIterations); +FunctionPass *llvm::createInstructionCombiningPass(unsigned MaxIterations) { + return new InstructionCombiningPass(MaxIterations); } void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) { diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 5bfece010bec2..0fc07fb9778d6 100644 --- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -491,7 +491,7 @@ void ConstantHoistingPass::collectConstantCandidates( // take constant variables is lower than `TargetTransformInfo::TCC_Basic`. // So it's safe for us to collect constant candidates from all // IntrinsicInsts. - if (canReplaceOperandWithVariable(Inst, Idx) || isa(Inst)) { + if (canReplaceOperandWithVariable(Inst, Idx)) { collectConstantCandidates(ConstCandMap, Inst, Idx); } } // end of for all operands diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index d5ca36b09579e..eb6fcae0a5481 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -42,7 +43,6 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -1487,6 +1487,9 @@ struct DSEState { SmallPtrSet InvisibleToCaller; // Keep track of blocks with throwing instructions not modeled in MemorySSA. SmallPtrSet ThrowingBlocks; + // Post-order numbers for each basic block. 
Used to figure out if memory + // accesses are executed before another access. + DenseMap<BasicBlock *, unsigned> PostOrderNumbers; /// Keep track of instructions (partly) overlapping with killing MemoryDefs per /// basic block. @@ -1502,23 +1505,28 @@ struct DSEState { DSEState State(F, AA, MSSA, DT, PDT, TLI); // Collect blocks with throwing instructions not modeled in MemorySSA and // alloc-like objects. - for (Instruction &I : instructions(F)) { - if (I.mayThrow() && !MSSA.getMemoryAccess(&I)) - State.ThrowingBlocks.insert(I.getParent()); - - auto *MD = dyn_cast_or_null<MemoryDef>(MSSA.getMemoryAccess(&I)); - if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit && - hasAnalyzableMemoryWrite(&I, TLI) && isRemovable(&I)) - State.MemDefs.push_back(MD); - - // Track alloca and alloca-like objects. Here we care about objects not - // visible to the caller during function execution. Alloca objects are - // invalid in the caller, for alloca-like objects we ensure that they are - // not captured throughout the function. - if (isa<AllocaInst>(&I) || - (isAllocLikeFn(&I, &TLI) && !PointerMayBeCaptured(&I, false, true))) - State.InvisibleToCaller.insert(&I); + unsigned PO = 0; + for (BasicBlock *BB : post_order(&F)) { + State.PostOrderNumbers[BB] = PO++; + for (Instruction &I : *BB) { + if (I.mayThrow() && !MSSA.getMemoryAccess(&I)) + State.ThrowingBlocks.insert(I.getParent()); + + auto *MD = dyn_cast_or_null<MemoryDef>(MSSA.getMemoryAccess(&I)); + if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit && + hasAnalyzableMemoryWrite(&I, TLI) && isRemovable(&I)) + State.MemDefs.push_back(MD); + + // Track alloca and alloca-like objects. Here we care about objects not + // visible to the caller during function execution. Alloca objects are + // invalid in the caller, for alloca-like objects we ensure that they + // are not captured throughout the function. + if (isa<AllocaInst>(&I) || + (isAllocLikeFn(&I, &TLI) && !PointerMayBeCaptured(&I, false, true))) + State.InvisibleToCaller.insert(&I); + } } + // Treat byval or inalloca arguments the same as Allocas, stores to them are // dead at the end of the function. for (Argument &AI : F.args()) @@ -1593,16 +1601,13 @@ struct DSEState { // Find a MemoryDef writing to \p DefLoc and dominating \p Current, with no // read access in between or return None otherwise. The returned value may not // (completely) overwrite \p DefLoc. Currently we bail out when we encounter - // any of the following - // * An aliasing MemoryUse (read). - // * A MemoryPHI. + // an aliasing MemoryUse (read). Optional<MemoryAccess *> getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current, MemoryLocation DefLoc, bool DefVisibleToCaller, int &ScanLimit) const { - MemoryDef *DomDef; - MemoryAccess *StartDef = Current; + MemoryAccess *DomAccess; bool StepAgain; LLVM_DEBUG(dbgs() << " trying to get dominating access for " << *Current << "\n"); @@ -1613,37 +1618,44 @@ struct DSEState { if (MSSA.isLiveOnEntryDef(Current)) return None; - MemoryUseOrDef *CurrentUD = dyn_cast<MemoryUseOrDef>(Current); - if (!CurrentUD) - return None; - + if (isa<MemoryPhi>(Current)) { + DomAccess = Current; + break; + } + MemoryUseOrDef *CurrentUD = cast<MemoryUseOrDef>(Current); // Look for access that clobber DefLoc.
- MemoryAccess *DomAccess = - MSSA.getSkipSelfWalker()->getClobberingMemoryAccess( - CurrentUD->getDefiningAccess(), DefLoc); - DomDef = dyn_cast(DomAccess); - if (!DomDef || MSSA.isLiveOnEntryDef(DomDef)) + DomAccess = MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(CurrentUD, + DefLoc); + if (MSSA.isLiveOnEntryDef(DomAccess)) return None; + if (isa(DomAccess)) + break; + // Check if we can skip DomDef for DSE. We also require the KillingDef // execute whenever DomDef executes and use post-dominance to ensure that. - if (canSkipDef(DomDef, DefVisibleToCaller) || + + MemoryDef *DomDef = dyn_cast(DomAccess); + if ((DomDef && canSkipDef(DomDef, DefVisibleToCaller)) || !PDT.dominates(KillingDef->getBlock(), DomDef->getBlock())) { StepAgain = true; - Current = DomDef; + Current = DomDef->getDefiningAccess(); } } while (StepAgain); - LLVM_DEBUG(dbgs() << " Checking for reads of " << *DomDef << " (" - << *DomDef->getMemoryInst() << ")\n"); + LLVM_DEBUG({ + dbgs() << " Checking for reads of " << *DomAccess; + if (isa(DomAccess)) + dbgs() << " (" << *cast(DomAccess)->getMemoryInst() << ")\n"; + }); SmallSetVector WorkList; auto PushMemUses = [&WorkList](MemoryAccess *Acc) { for (Use &U : Acc->uses()) WorkList.insert(cast(U.getUser())); }; - PushMemUses(DomDef); + PushMemUses(DomAccess); // Check if DomDef may be read. for (unsigned I = 0; I < WorkList.size(); I++) { @@ -1655,10 +1667,9 @@ struct DSEState { return None; } - // Bail out on MemoryPhis for now. if (isa(UseAccess)) { - LLVM_DEBUG(dbgs() << " ... hit MemoryPhi\n"); - return None; + PushMemUses(UseAccess); + continue; } Instruction *UseInst = cast(UseAccess)->getMemoryInst(); @@ -1676,7 +1687,11 @@ struct DSEState { return None; } - if (StartDef == UseAccess) + // For the KillingDef we only have to check if it reads the memory + // location. + // TODO: It would probably be better to check for self-reads before + // calling the function. + if (KillingDef == UseAccess) continue; // Check all uses for MemoryDefs, except for defs completely overwriting @@ -1695,8 +1710,8 @@ struct DSEState { } } - // No aliasing MemoryUses of DomDef found, DomDef is potentially dead. - return {DomDef}; + // No aliasing MemoryUses of DomAccess found, DomAccess is potentially dead. + return {DomAccess}; } // Delete dead memory defs @@ -1788,10 +1803,10 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA, DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI); // For each store: for (unsigned I = 0; I < State.MemDefs.size(); I++) { - MemoryDef *Current = State.MemDefs[I]; - if (State.SkipStores.count(Current)) + MemoryDef *KillingDef = State.MemDefs[I]; + if (State.SkipStores.count(KillingDef)) continue; - Instruction *SI = cast(Current)->getMemoryInst(); + Instruction *SI = KillingDef->getMemoryInst(); auto MaybeSILoc = State.getLocForWriteEx(SI); if (!MaybeSILoc) { LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for " @@ -1808,22 +1823,54 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA, !PointerMayBeCapturedBefore(DefObj, false, true, SI, &DT)))) DefVisibleToCaller = false; - LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " << *SI - << "\n"); + MemoryAccess *Current = KillingDef; + LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by " + << *KillingDef << " (" << *SI << ")\n"); int ScanLimit = MemorySSAScanLimit; - MemoryDef *StartDef = Current; - // Walk MemorySSA upward to find MemoryDefs that might be killed by SI. 
- while (Optional Next = State.getDomMemoryDef( - StartDef, Current, SILoc, DefVisibleToCaller, ScanLimit)) { + // Worklist of MemoryAccesses that may be killed by KillingDef. + SetVector ToCheck; + ToCheck.insert(KillingDef->getDefiningAccess()); + + // Check if MemoryAccesses in the worklist are killed by KillingDef. + for (unsigned I = 0; I < ToCheck.size(); I++) { + Current = ToCheck[I]; + if (State.SkipStores.count(Current)) + continue; + + Optional Next = State.getDomMemoryDef( + KillingDef, Current, SILoc, DefVisibleToCaller, ScanLimit); + + if (!Next) { + LLVM_DEBUG(dbgs() << " finished walk\n"); + continue; + } + MemoryAccess *DomAccess = *Next; LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DomAccess << "\n"); + if (isa(DomAccess)) { + for (Value *V : cast(DomAccess)->incoming_values()) { + MemoryAccess *IncomingAccess = cast(V); + BasicBlock *IncomingBlock = IncomingAccess->getBlock(); + BasicBlock *PhiBlock = DomAccess->getBlock(); + + // We only consider incoming MemoryAccesses that come before the + // MemoryPhi. Otherwise we could discover candidates that do not + // strictly dominate our starting def. + if (State.PostOrderNumbers[IncomingBlock] > + State.PostOrderNumbers[PhiBlock]) + ToCheck.insert(IncomingAccess); + } + continue; + } MemoryDef *NextDef = dyn_cast(DomAccess); Instruction *NI = NextDef->getMemoryInst(); LLVM_DEBUG(dbgs() << " def " << *NI << "\n"); - if (!hasAnalyzableMemoryWrite(NI, TLI)) - break; + if (!hasAnalyzableMemoryWrite(NI, TLI)) { + LLVM_DEBUG(dbgs() << " skip, cannot analyze def\n"); + continue; + } if (!isRemovable(NI)) { LLVM_DEBUG(dbgs() << " skip, cannot remove def\n"); @@ -1834,14 +1881,14 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA, // Check for anything that looks like it will be a barrier to further // removal if (State.isDSEBarrier(SI, SILoc, SILocUnd, NI, NILoc)) { - LLVM_DEBUG(dbgs() << " stop, barrier\n"); - break; + LLVM_DEBUG(dbgs() << " skip, barrier\n"); + continue; } // Before we try to remove anything, check for any extra throwing // instructions that block us from DSEing if (State.mayThrowBetween(SI, NI, SILocUnd)) { - LLVM_DEBUG(dbgs() << " stop, may throw!\n"); + LLVM_DEBUG(dbgs() << " skip, may throw!\n"); break; } @@ -1857,14 +1904,14 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA, OverwriteResult OR = isOverwrite(SILoc, NILoc, DL, TLI, DepWriteOffset, InstWriteOffset, NI, IOL, AA, &F); + ToCheck.insert(NextDef->getDefiningAccess()); if (OR == OW_Complete) { LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI << "\n KILLER: " << *SI << '\n'); State.deleteDeadInstruction(NI); ++NumFastStores; MadeChange = true; - } else - Current = NextDef; + } } } diff --git a/llvm/lib/Transforms/Scalar/DivRemPairs.cpp b/llvm/lib/Transforms/Scalar/DivRemPairs.cpp index d434b6058cfc0..8d419d95c7520 100644 --- a/llvm/lib/Transforms/Scalar/DivRemPairs.cpp +++ b/llvm/lib/Transforms/Scalar/DivRemPairs.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" #include "llvm/IR/PatternMatch.h" @@ -303,6 +304,29 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI, Mul->insertAfter(RemInst); Sub->insertAfter(Mul); + // If X can be undef, X should be frozen first. 
+ // For example, let's assume that Y = 1 & X = undef: + // %div = sdiv undef, 1 // %div = undef + // %rem = srem undef, 1 // %rem = 0 + // => + // %div = sdiv undef, 1 // %div = undef + // %mul = mul %div, 1 // %mul = undef + // %rem = sub %x, %mul // %rem = undef - undef = undef + // If X is not frozen, %rem becomes undef after transformation. + // TODO: We need a undef-specific checking function in ValueTracking + if (!isGuaranteedNotToBeUndefOrPoison(X, DivInst, &DT)) { + auto *FrX = new FreezeInst(X, X->getName() + ".frozen", DivInst); + DivInst->setOperand(0, FrX); + Sub->setOperand(0, FrX); + } + // Same for Y. If X = 1 and Y = (undef | 1), %rem in src is either 1 or 0, + // but %rem in tgt can be one of many integer values. + if (!isGuaranteedNotToBeUndefOrPoison(Y, DivInst, &DT)) { + auto *FrY = new FreezeInst(Y, Y->getName() + ".frozen", DivInst); + DivInst->setOperand(1, FrY); + Mul->setOperand(1, FrY); + } + // Now kill the explicit remainder. We have replaced it with: // (sub X, (mul (div X, Y), Y) Sub->setName(RemInst->getName() + ".decomposed"); diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp index 6d0a4975e2668..6b9a88d04eda9 100644 --- a/llvm/lib/Transforms/Scalar/GVNSink.cpp +++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -475,6 +475,7 @@ class ValueTable { case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: + case Instruction::AddrSpaceCast: case Instruction::Select: case Instruction::ExtractElement: case Instruction::InsertElement: diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 616d64a794b61..327a1a6f2e7b6 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -2125,6 +2125,18 @@ bool JumpThreadingPass::MaybeThreadThroughTwoBasicBlocks(BasicBlock *BB, if (PredBB->getSinglePredecessor()) return false; + // Don't thread through PredBB if it contains a successor edge to itself, in + // which case we would infinite loop. Suppose we are threading an edge from + // PredPredBB through PredBB and BB to SuccBB with PredBB containing a + // successor edge to itself. If we allowed jump threading in this case, we + // could duplicate PredBB and BB as, say, PredBB.thread and BB.thread. Since + // PredBB.thread has a successor edge to PredBB, we would immediately come up + // with another jump threading opportunity from PredBB.thread through PredBB + // and BB to SuccBB. This jump threading would repeatedly occur. That is, we + // would keep peeling one iteration from PredBB. + if (llvm::is_contained(successors(PredBB), PredBB)) + return false; + // Don't thread across a loop header. if (LoopHeaders.count(PredBB)) return false; diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index 5efd3ffc2680c..671da3f539931 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -165,62 +165,85 @@ class LowerMatrixIntrinsics { } }; - /// Wrapper class representing a matrix as a set of column vectors. - /// All column vectors must have the same vector type. - class ColumnMatrixTy { - SmallVector Columns; + /// Wrapper class representing a matrix as a set of vectors, either in row or + /// column major layout. All vectors must have the same vector type. 
+ class MatrixTy { + SmallVector Vectors; OpInfoTy OpInfo; + bool IsColumnMajor = true; + public: - ColumnMatrixTy() : Columns() {} - ColumnMatrixTy(ArrayRef Cols) - : Columns(Cols.begin(), Cols.end()) {} + MatrixTy() : Vectors() {} + MatrixTy(ArrayRef Vectors) + : Vectors(Vectors.begin(), Vectors.end()) {} + + Value *getVector(unsigned i) const { return Vectors[i]; } + Value *getColumn(unsigned i) const { + assert(isColumnMajor() && "only supported for column-major matrixes"); + return Vectors[i]; + } - Value *getColumn(unsigned i) const { return Columns[i]; } + void setColumn(unsigned i, Value *V) { Vectors[i] = V; } - void setColumn(unsigned i, Value *V) { Columns[i] = V; } + Type *getElementType() { return getVectorTy()->getElementType(); } - size_t getNumColumns() const { return Columns.size(); } - size_t getNumRows() const { - assert(Columns.size() > 0 && "Cannot call getNumRows without columns"); - return cast(Columns[0]->getType())->getNumElements(); + unsigned getNumColumns() const { + if (isColumnMajor()) + return Vectors.size(); + else { + assert(Vectors.size() > 0 && "Cannot call getNumRows without columns"); + return cast(Vectors[0]->getType())->getNumElements(); + } + } + unsigned getNumRows() const { + if (isColumnMajor()) { + assert(Vectors.size() > 0 && "Cannot call getNumRows without columns"); + return cast(Vectors[0]->getType())->getNumElements(); + } else + return Vectors.size(); } - const SmallVectorImpl &getColumnVectors() const { return Columns; } + const SmallVectorImpl &getColumnVectors() const { return Vectors; } - SmallVectorImpl &getColumnVectors() { return Columns; } + SmallVectorImpl &getColumnVectors() { return Vectors; } - void addColumn(Value *V) { Columns.push_back(V); } + void addColumn(Value *V) { Vectors.push_back(V); } VectorType *getColumnTy() { - return cast(Columns[0]->getType()); + assert(isColumnMajor() && "only supported for column-major matrixes"); + return getVectorTy(); + } + + VectorType *getVectorTy() { + return cast(Vectors[0]->getType()); } iterator_range::iterator> columns() { - return make_range(Columns.begin(), Columns.end()); + return make_range(Vectors.begin(), Vectors.end()); } /// Embed the columns of the matrix into a flat vector by concatenating /// them. Value *embedInVector(IRBuilder<> &Builder) const { - return Columns.size() == 1 ? Columns[0] - : concatenateVectors(Builder, Columns); + return Vectors.size() == 1 ? Vectors[0] + : concatenateVectors(Builder, Vectors); } - ColumnMatrixTy &addNumLoads(unsigned N) { + MatrixTy &addNumLoads(unsigned N) { OpInfo.NumLoads += N; return *this; } void setNumLoads(unsigned N) { OpInfo.NumLoads = N; } - ColumnMatrixTy &addNumStores(unsigned N) { + MatrixTy &addNumStores(unsigned N) { OpInfo.NumStores += N; return *this; } - ColumnMatrixTy &addNumComputeOps(unsigned N) { + MatrixTy &addNumComputeOps(unsigned N) { OpInfo.NumComputeOps += N; return *this; } @@ -230,6 +253,8 @@ class LowerMatrixIntrinsics { unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; } const OpInfoTy &getOpInfo() const { return OpInfo; } + + bool isColumnMajor() const { return IsColumnMajor; } }; struct ShapeInfo { @@ -270,7 +295,7 @@ class LowerMatrixIntrinsics { SmallVector ToRemove; /// Map from instructions to their produced column matrix. - MapVector Inst2ColumnMatrix; + MapVector Inst2ColumnMatrix; public: LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI, @@ -296,8 +321,8 @@ class LowerMatrixIntrinsics { /// If we lowered \p MatrixVal, just return the cache result column matrix. 
/// Otherwie split the flat vector \p MatrixVal containing a matrix with /// shape \p SI into column vectors. - ColumnMatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI, - IRBuilder<> &Builder) { + MatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI, + IRBuilder<> &Builder) { VectorType *VType = dyn_cast(MatrixVal->getType()); assert(VType && "MatrixVal must be a vector type"); assert(VType->getNumElements() == SI.NumRows * SI.NumColumns && @@ -309,7 +334,7 @@ class LowerMatrixIntrinsics { // vector and split it later. auto Found = Inst2ColumnMatrix.find(MatrixVal); if (Found != Inst2ColumnMatrix.end()) { - ColumnMatrixTy &M = Found->second; + MatrixTy &M = Found->second; // Return the found matrix, if its shape matches the requested shape // information if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns()) @@ -634,12 +659,13 @@ class LowerMatrixIntrinsics { return true; } - void LowerLoad(Instruction *Inst, Value *Ptr, Value *Stride, - ShapeInfo Shape) { - IRBuilder<> Builder(Inst); - auto VType = cast(Inst->getType()); + /// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between + /// columns. + MatrixTy loadMatrix(Type *Ty, Value *Ptr, Value *Stride, ShapeInfo Shape, + IRBuilder<> &Builder) { + auto VType = cast(Ty); Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); - ColumnMatrixTy Result; + MatrixTy Result; // Distance between start of one column and the start of the next for (unsigned C = 0, E = Shape.NumColumns; C < E; ++C) { Value *GEP = @@ -648,10 +674,41 @@ class LowerMatrixIntrinsics { Value *Column = createColumnLoad(GEP, VType->getElementType(), Builder); Result.addColumn(Column); } + return Result.addNumLoads(getNumOps(Result.getColumnTy()) * + Result.getNumColumns()); + } + + /// Loads a sub-matrix with shape \p ResultShape from a \p R x \p C matrix, + /// starting at \p MatrixPtr[I][J]. + MatrixTy loadMatrix(Value *MatrixPtr, ShapeInfo MatrixShape, unsigned I, + unsigned J, ShapeInfo ResultShape, Type *EltTy, + IRBuilder<> &Builder) { + + Value *Offset = Builder.CreateAdd( + Builder.CreateMul(Builder.getInt32(J), + Builder.getInt32(MatrixShape.NumRows)), + Builder.getInt32(I)); + + unsigned AS = cast(MatrixPtr->getType())->getAddressSpace(); + Value *EltPtr = + Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS)); + Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset); + Type *TileTy = + VectorType::get(EltTy, ResultShape.NumRows * ResultShape.NumColumns); + Type *TilePtrTy = PointerType::get(TileTy, AS); + Value *TilePtr = + Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast"); + + return loadMatrix(TileTy, TilePtr, Builder.getInt32(ResultShape.NumRows), + ResultShape, Builder); + } + /// Lower a load instruction with shape information. + void LowerLoad(Instruction *Inst, Value *Ptr, Value *Stride, + ShapeInfo Shape) { + IRBuilder<> Builder(Inst); finalizeLowering(Inst, - Result.addNumLoads(getNumOps(Result.getColumnTy()) * - Result.getNumColumns()), + loadMatrix(Inst->getType(), Ptr, Stride, Shape, Builder), Builder); } @@ -665,22 +722,54 @@ class LowerMatrixIntrinsics { {Inst->getArgOperand(2), Inst->getArgOperand(3)}); } - void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, Value *Stride, - ShapeInfo Shape) { - IRBuilder<> Builder(Inst); - auto VType = cast(Matrix->getType()); + /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p + /// MatrixPtr[I][J]. 
+ void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr, + ShapeInfo MatrixShape, unsigned I, unsigned J, Type *EltTy, + IRBuilder<> &Builder) { + Value *Offset = Builder.CreateAdd( + Builder.CreateMul(Builder.getInt32(J), + Builder.getInt32(MatrixShape.NumRows)), + Builder.getInt32(I)); + + unsigned AS = cast(MatrixPtr->getType())->getAddressSpace(); + Value *EltPtr = + Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS)); + Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset); + Type *TileTy = VectorType::get(EltTy, StoreVal.getNumRows() * + StoreVal.getNumColumns()); + Type *TilePtrTy = PointerType::get(TileTy, AS); + Value *TilePtr = + Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast"); + + storeMatrix(TileTy, StoreVal, TilePtr, + Builder.getInt32(StoreVal.getNumRows()), Builder); + } + + /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between + /// columns. + MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr, Value *Stride, + IRBuilder<> &Builder) { + auto VType = cast(Ty); Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder); - auto LM = getMatrix(Matrix, Shape, Builder); - for (auto C : enumerate(LM.columns())) { - Value *GEP = - computeColumnAddr(EltPtr, Builder.getInt32(C.index()), Stride, - Shape.NumRows, VType->getElementType(), Builder); + for (auto C : enumerate(StoreVal.columns())) { + Value *GEP = computeColumnAddr(EltPtr, Builder.getInt32(C.index()), + Stride, StoreVal.getNumRows(), + VType->getElementType(), Builder); createColumnStore(C.value(), GEP, VType->getElementType(), Builder); } - Inst2ColumnMatrix[Inst] = ColumnMatrixTy().addNumStores( - getNumOps(LM.getColumnTy()) * LM.getNumColumns()); + return MatrixTy().addNumStores(getNumOps(StoreVal.getColumnTy()) * + StoreVal.getNumColumns()); + } - ToRemove.push_back(Inst); + /// Lower a store instruction with shape information. + void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, Value *Stride, + ShapeInfo Shape) { + IRBuilder<> Builder(Inst); + auto StoreVal = getMatrix(Matrix, Shape, Builder); + finalizeLowering( + Inst, storeMatrix(Matrix->getType(), StoreVal, Ptr, Stride, Builder), + Builder); } /// Lowers llvm.matrix.columnwise.store. @@ -696,7 +785,7 @@ class LowerMatrixIntrinsics { /// Extract a column vector of \p NumElts starting at index (\p I, \p J) from /// the matrix \p LM represented as a vector of column vectors. - Value *extractVector(const ColumnMatrixTy &LM, unsigned I, unsigned J, + Value *extractVector(const MatrixTy &LM, unsigned I, unsigned J, unsigned NumElts, IRBuilder<> &Builder) { Value *Col = LM.getColumn(J); Value *Undef = UndefValue::get(Col->getType()); @@ -768,7 +857,7 @@ class LowerMatrixIntrinsics { /// cached value when they are lowered. For other users, \p Matrix is /// flattened and the uses are updated to use it. Also marks \p Inst for /// deletion. - void finalizeLowering(Instruction *Inst, ColumnMatrixTy Matrix, + void finalizeLowering(Instruction *Inst, MatrixTy Matrix, IRBuilder<> &Builder) { Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix)); @@ -784,6 +873,48 @@ class LowerMatrixIntrinsics { } } + /// Compute Res += A * B for tile-sized matrices with left-associating + /// addition. 
+ void emitChainedMatrixMultiply(MatrixTy &Result, const MatrixTy &A, + const MatrixTy &B, bool AllowContraction, + IRBuilder<> &Builder, bool isTiled) { + const unsigned VF = std::max( + TTI.getRegisterBitWidth(true) / + Result.getElementType()->getPrimitiveSizeInBits().getFixedSize(), + 1U); + unsigned R = Result.getNumRows(); + unsigned C = Result.getNumColumns(); + unsigned M = A.getNumColumns(); + + for (unsigned J = 0; J < C; ++J) { + unsigned BlockSize = VF; + + // If Result is zero, we don't need to accumulate in the K==0 iteration. + bool isSumZero = isa(Result.getColumn(J)); + + unsigned NumOps = 0; + for (unsigned I = 0; I < R; I += BlockSize) { + // Gradually lower the vectorization factor to cover the remainder. + while (I + BlockSize > R) + BlockSize /= 2; + + Value *Sum = + isTiled ? extractVector(Result, I, J, BlockSize, Builder) : nullptr; + for (unsigned K = 0; K < M; ++K) { + Value *L = extractVector(A, I, K, BlockSize, Builder); + Value *RH = Builder.CreateExtractElement(B.getColumn(J), K); + Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat"); + Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat, + Result.getElementType()->isFloatingPointTy(), + Builder, AllowContraction, NumOps); + } + Result.setColumn(J, insertVector(Result.getColumn(J), I, Sum, Builder)); + } + + Result.addNumComputeOps(NumOps); + } + } + /// Lowers llvm.matrix.multiply. void LowerMultiply(CallInst *MatMul) { IRBuilder<> Builder(MatMul); @@ -791,61 +922,32 @@ class LowerMatrixIntrinsics { ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3)); ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4)); - const ColumnMatrixTy &Lhs = - getMatrix(MatMul->getArgOperand(0), LShape, Builder); - const ColumnMatrixTy &Rhs = - getMatrix(MatMul->getArgOperand(1), RShape, Builder); + const MatrixTy &Lhs = getMatrix(MatMul->getArgOperand(0), LShape, Builder); + const MatrixTy &Rhs = getMatrix(MatMul->getArgOperand(1), RShape, Builder); const unsigned R = LShape.NumRows; - const unsigned M = LShape.NumColumns; const unsigned C = RShape.NumColumns; - assert(M == RShape.NumRows); + assert(LShape.NumColumns == RShape.NumRows); // Initialize the output - ColumnMatrixTy Result; + MatrixTy Result; for (unsigned J = 0; J < C; ++J) Result.addColumn(UndefValue::get(VectorType::get(EltType, R))); - const unsigned VF = std::max(TTI.getRegisterBitWidth(true) / - EltType->getPrimitiveSizeInBits(), - uint64_t(1)); - bool AllowContract = AllowContractEnabled || (isa(MatMul) && MatMul->hasAllowContract()); - unsigned NumComputeOps = 0; - // Multiply columns from the first operand with scalars from the second - // operand. Then move along the K axes and accumulate the columns. With - // this the adds can be vectorized without reassociation. - for (unsigned J = 0; J < C; ++J) { - unsigned BlockSize = VF; - for (unsigned I = 0; I < R; I += BlockSize) { - // Gradually lower the vectorization factor to cover the remainder. 
- while (I + BlockSize > R) - BlockSize /= 2; - - Value *Sum = nullptr; - for (unsigned K = 0; K < M; ++K) { - Value *L = extractVector(Lhs, I, K, BlockSize, Builder); - Value *RH = Builder.CreateExtractElement(Rhs.getColumn(J), K); - Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat"); - Sum = createMulAdd(Sum, L, Splat, EltType->isFloatingPointTy(), - Builder, AllowContract, NumComputeOps); - } - Result.setColumn(J, insertVector(Result.getColumn(J), I, Sum, Builder)); - } - } - Result.addNumComputeOps(NumComputeOps); + emitChainedMatrixMultiply(Result, Lhs, Rhs, AllowContract, Builder, false); finalizeLowering(MatMul, Result, Builder); } /// Lowers llvm.matrix.transpose. void LowerTranspose(CallInst *Inst) { - ColumnMatrixTy Result; + MatrixTy Result; IRBuilder<> Builder(Inst); Value *InputVal = Inst->getArgOperand(0); VectorType *VectorTy = cast(InputVal->getType()); ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2)); - ColumnMatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder); + MatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder); for (unsigned Row = 0; Row < ArgShape.NumRows; ++Row) { // Build a single column vector for this row. First initialize it. @@ -905,11 +1007,11 @@ class LowerMatrixIntrinsics { IRBuilder<> Builder(Inst); ShapeInfo &Shape = I->second; - ColumnMatrixTy LoweredLhs = getMatrix(Lhs, Shape, Builder); - ColumnMatrixTy LoweredRhs = getMatrix(Rhs, Shape, Builder); + MatrixTy LoweredLhs = getMatrix(Lhs, Shape, Builder); + MatrixTy LoweredRhs = getMatrix(Rhs, Shape, Builder); // Add each column and store the result back into the opmapping - ColumnMatrixTy Result; + MatrixTy Result; auto BuildColumnOp = [&Builder, Inst](Value *LHS, Value *RHS) { switch (Inst->getOpcode()) { case Instruction::Add: @@ -951,7 +1053,7 @@ class LowerMatrixIntrinsics { /// Mapping from instructions to column matrixes. It is used to identify /// matrix instructions. - const MapVector &Inst2ColumnMatrix; + const MapVector &Inst2ColumnMatrix; /// Mapping from values to the leaves of all expressions that the value is /// part of. @@ -968,7 +1070,7 @@ class LowerMatrixIntrinsics { SmallPtrSet ReusedExprs; ExprLinearizer(const DataLayout &DL, - const MapVector &Inst2ColumnMatrix, + const MapVector &Inst2ColumnMatrix, const DenseMap> &Shared, const SmallSetVector &ExprsInSubprogram, Value *Leaf) @@ -1212,12 +1314,12 @@ class LowerMatrixIntrinsics { /// that multiple leaves can share sub-expressions. Shared subexpressions /// are explicitly marked as shared(). 
struct RemarkGenerator { - const MapVector &Inst2ColumnMatrix; + const MapVector &Inst2ColumnMatrix; OptimizationRemarkEmitter &ORE; Function &Func; const DataLayout &DL; - RemarkGenerator(const MapVector &Inst2ColumnMatrix, + RemarkGenerator(const MapVector &Inst2ColumnMatrix, OptimizationRemarkEmitter &ORE, Function &Func) : Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), Func(Func), DL(Func.getParent()->getDataLayout()) {} diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index c17c35b7e3232..fcd7ed195291a 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -739,9 +739,8 @@ void SCCPSolver::visitPHINode(PHINode &PN) { if (PN.getType()->isStructTy()) return (void)markOverdefined(&PN); - if (isOverdefined(getValueState(&PN))) { - return (void)markOverdefined(&PN); - } + if (getValueState(&PN).isOverdefined()) + return; // Quick exit // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant, // and slow us down a lot. Just mark them overdefined. @@ -753,38 +752,19 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // constant, and they agree with each other, the PHI becomes the identical // constant. If they are constant and don't agree, the PHI is overdefined. // If there are no executable operands, the PHI remains unknown. - Constant *OperandVal = nullptr; + bool Changed = false; for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) { LatticeVal IV = getValueState(PN.getIncomingValue(i)); - if (IV.isUnknownOrUndef()) continue; // Doesn't influence PHI node. - if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) continue; - if (isOverdefined(IV)) // PHI node becomes overdefined! - return (void)markOverdefined(&PN); - - if (!OperandVal) { // Grab the first value. - OperandVal = getConstant(IV); - continue; - } - - // There is already a reachable operand. If we conflict with it, - // then the PHI node becomes overdefined. If we agree with it, we - // can continue on. - - // Check to see if there are two different constants merging, if so, the PHI - // node is overdefined. - if (getConstant(IV) != OperandVal) - return (void)markOverdefined(&PN); + LatticeVal &Res = getValueState(&PN); + Changed |= Res.mergeIn(IV, DL); + if (Res.isOverdefined()) + break; } - - // If we exited the loop, this means that the PHI node only has constant - // arguments that agree with each other(and OperandVal is the constant) or - // OperandVal is null because there are no defined incoming arguments. If - // this is the case, the PHI remains unknown. - if (OperandVal) - markConstant(&PN, OperandVal); // Acquire operand value + if (Changed) + pushToWorkListMsg(ValueState[&PN], &PN); } void SCCPSolver::visitReturnInst(ReturnInst &I) { @@ -977,9 +957,18 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { LatticeVal V2State = getValueState(I.getOperand(1)); LatticeVal &IV = ValueState[&I]; - if (isOverdefined(IV)) + if (IV.isOverdefined()) + return; + + // If something is undef, wait for it to resolve. + if (V1State.isUnknownOrUndef() || V2State.isUnknownOrUndef()) + return; + + if (V1State.isOverdefined() && V2State.isOverdefined()) return (void)markOverdefined(&I); + // Both operands are non-integer constants or constant expressions. + // TODO: Use information from notconstant better. 
if (isConstant(V1State) && isConstant(V2State)) { Constant *C = ConstantExpr::get(I.getOpcode(), getConstant(V1State), getConstant(V2State)); @@ -989,50 +978,21 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { return (void)markConstant(IV, &I, C); } - // If something is undef, wait for it to resolve. - if (V1State.isUnknownOrUndef() || V2State.isUnknownOrUndef()) - return; + // Operands are either constant ranges, notconstant, overdefined or one of the + // operands is a constant. + ConstantRange A = ConstantRange::getFull(I.getType()->getScalarSizeInBits()); + ConstantRange B = ConstantRange::getFull(I.getType()->getScalarSizeInBits()); + if (V1State.isConstantRange()) + A = V1State.getConstantRange(); + if (V2State.isConstantRange()) + B = V2State.getConstantRange(); - // Otherwise, one of our operands is overdefined. Try to produce something - // better than overdefined with some tricks. - // If this is 0 / Y, it doesn't matter that the second operand is - // overdefined, and we can replace it with zero. - if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv) - if (isConstant(V1State) && getConstant(V1State)->isNullValue()) - return (void)markConstant(IV, &I, getConstant(V1State)); - - // If this is: - // -> AND/MUL with 0 - // -> OR with -1 - // it doesn't matter that the other operand is overdefined. - if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Mul || - I.getOpcode() == Instruction::Or) { - LatticeVal *NonOverdefVal = nullptr; - if (!isOverdefined(V1State)) - NonOverdefVal = &V1State; - - else if (!isOverdefined(V2State)) - NonOverdefVal = &V2State; - if (NonOverdefVal) { - if (!isConstant(*NonOverdefVal)) - return; - - if (I.getOpcode() == Instruction::And || - I.getOpcode() == Instruction::Mul) { - // X and 0 = 0 - // X * 0 = 0 - if (getConstant(*NonOverdefVal)->isNullValue()) - return (void)markConstant(IV, &I, getConstant(*NonOverdefVal)); - } else { - // X or -1 = -1 - if (ConstantInt *CI = getConstantInt(*NonOverdefVal)) - if (CI->isMinusOne()) - return (void)markConstant(IV, &I, CI); - } - } - } + ConstantRange R = A.binaryOp(cast(&I)->getOpcode(), B); + mergeInValue(&I, LatticeVal::getRange(R)); - markOverdefined(&I); + // TODO: Currently we do not exploit special values that produce something + // better than overdefined with an overdefined operand for vector or floating + // point types, like and <4 x i32> overdefined, zeroinitializer. } // Handle ICmpInst instruction. diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 1267226dfeb2b..d5d9dff75ef04 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -2935,21 +2935,39 @@ bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) { default: return true; case Instruction::Call: - case Instruction::Invoke: + case Instruction::Invoke: { + ImmutableCallSite CS(I); + // Can't handle inline asm. Skip it. - if (isa(ImmutableCallSite(I).getCalledValue())) - return false; - // Many arithmetic intrinsics have no issue taking a - // variable, however it's hard to distingish these from - // specials such as @llvm.frameaddress that require a constant. - if (isa(I)) + if (CS.isInlineAsm()) return false; // Constant bundle operands may need to retain their constant-ness for // correctness. 
- if (ImmutableCallSite(I).isBundleOperand(OpIdx)) + if (CS.isBundleOperand(OpIdx)) return false; - return true; + + if (OpIdx < CS.getNumArgOperands()) { + // Some variadic intrinsics require constants in the variadic arguments, + // which currently aren't markable as immarg. + if (CS.isIntrinsic() && OpIdx >= CS.getFunctionType()->getNumParams()) { + // This is known to be OK for stackmap. + return CS.getIntrinsicID() == Intrinsic::experimental_stackmap; + } + + // gcroot is a special case, since it requires a constant argument which + // isn't also required to be a simple ConstantInt. + if (CS.getIntrinsicID() == Intrinsic::gcroot) + return false; + + // Some intrinsic operands are required to be immediates. + return !CS.paramHasAttr(OpIdx, Attribute::ImmArg); + } + + // It is never allowed to replace the call argument to an intrinsic, but it + // may be possible for a call. + return !CS.isIntrinsic(); + } case Instruction::ShuffleVector: // Shufflevector masks are constant. return OpIdx != 2; diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp index 0bb2746eed2d0..616b4e8eb01c9 100644 --- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp +++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp @@ -16,7 +16,7 @@ using namespace llvm; void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, ConstantInt *CopyLen, - unsigned SrcAlign, unsigned DstAlign, + Align SrcAlign, Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile, const TargetTransformInfo &TTI) { // No need to expand zero length copies. @@ -33,8 +33,8 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, unsigned DstAS = cast(DstAddr->getType())->getAddressSpace(); Type *TypeOfCopyLen = CopyLen->getType(); - Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS, - SrcAlign, DstAlign); + Type *LoopOpType = TTI.getMemcpyLoopLoweringType( + Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value()); unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType); uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize; @@ -59,8 +59,8 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType); } - Align PartDstAlign(MinAlign(DstAlign, LoopOpSize)); - Align PartSrcAlign(MinAlign(SrcAlign, LoopOpSize)); + Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize)); + Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize)); IRBuilder<> LoopBuilder(LoopBB); PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index"); @@ -92,11 +92,12 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, SmallVector RemainingOps; TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, - SrcAS, DstAS, SrcAlign, DstAlign); + SrcAS, DstAS, SrcAlign.value(), + DstAlign.value()); for (auto OpTy : RemainingOps) { - Align PartSrcAlign(MinAlign(SrcAlign, BytesCopied)); - Align PartDstAlign(MinAlign(DstAlign, BytesCopied)); + Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied)); + Align PartDstAlign(commonAlignment(DstAlign, BytesCopied)); // Calaculate the new index unsigned OperandSize = DL.getTypeStoreSize(OpTy); @@ -131,8 +132,8 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr, - Value *CopyLen, unsigned SrcAlign, - unsigned DstAlign, bool 
SrcIsVolatile, + Value *CopyLen, Align SrcAlign, + Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile, const TargetTransformInfo &TTI) { BasicBlock *PreLoopBB = InsertBefore->getParent(); @@ -145,8 +146,8 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, unsigned SrcAS = cast(SrcAddr->getType())->getAddressSpace(); unsigned DstAS = cast(DstAddr->getType())->getAddressSpace(); - Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS, - SrcAlign, DstAlign); + Type *LoopOpType = TTI.getMemcpyLoopLoweringType( + Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value()); unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType); IRBuilder<> PLBuilder(PreLoopBB->getTerminator()); @@ -175,8 +176,8 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB); IRBuilder<> LoopBuilder(LoopBB); - Align PartSrcAlign(MinAlign(SrcAlign, LoopOpSize)); - Align PartDstAlign(MinAlign(DstAlign, LoopOpSize)); + Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize)); + Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize)); PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index"); LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB); @@ -288,8 +289,8 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, // return dst; // } static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr, - Value *DstAddr, Value *CopyLen, unsigned SrcAlign, - unsigned DstAlign, bool SrcIsVolatile, + Value *DstAddr, Value *CopyLen, Align SrcAlign, + Align DstAlign, bool SrcIsVolatile, bool DstIsVolatile) { Type *TypeOfCopyLen = CopyLen->getType(); BasicBlock *OrigBB = InsertBefore->getParent(); @@ -323,8 +324,8 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr, ExitBB->setName("memmove_done"); unsigned PartSize = DL.getTypeStoreSize(EltTy); - Align PartSrcAlign(MinAlign(SrcAlign, PartSize)); - Align PartDstAlign(MinAlign(DstAlign, PartSize)); + Align PartSrcAlign(commonAlignment(SrcAlign, PartSize)); + Align PartDstAlign(commonAlignment(DstAlign, PartSize)); // Initial comparison of n == 0 that lets us skip the loops altogether. Shared // between both backwards and forward copy clauses. 
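For illustration only (not part of the patch): the Align changes in LowerMemIntrinsics.cpp above only affect how the per-iteration alignment is derived (commonAlignment of the original pointer alignment and the loop operand size); the emitted loop shape is unchanged. A rough LLVM IR sketch of what the known-size memcpy expansion produces for a 16-byte copy, assuming TTI picks an i32 loop operand type and both pointers are at least 4-byte aligned so the partial accesses keep align 4; the function, block, and value names here are illustrative:

define void @copy16(i8* %dst, i8* %src) {
entry:
  %src.i32 = bitcast i8* %src to i32*
  %dst.i32 = bitcast i8* %dst to i32*
  br label %load-store-loop

load-store-loop:
  %loop-index = phi i64 [ 0, %entry ], [ %next, %load-store-loop ]
  %src.gep = getelementptr inbounds i32, i32* %src.i32, i64 %loop-index
  %val = load i32, i32* %src.gep, align 4          ; commonAlignment(SrcAlign, 4)
  %dst.gep = getelementptr inbounds i32, i32* %dst.i32, i64 %loop-index
  store i32 %val, i32* %dst.gep, align 4           ; commonAlignment(DstAlign, 4)
  %next = add i64 %loop-index, 1
  %done = icmp uge i64 %next, 4                    ; 16 bytes / 4-byte loop operand = 4 iterations
  br i1 %done, label %exit, label %load-store-loop

exit:
  ret void
}

For a length that is not a multiple of the loop operand size, the same helper would additionally emit straight-line residual loads and stores after the loop, each using the common alignment of the original alignment and the number of bytes already copied.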
@@ -375,7 +376,7 @@ static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr, } static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr, - Value *CopyLen, Value *SetValue, unsigned DstAlign, + Value *CopyLen, Value *SetValue, Align DstAlign, bool IsVolatile) { Type *TypeOfCopyLen = CopyLen->getType(); BasicBlock *OrigBB = InsertBefore->getParent(); @@ -399,7 +400,7 @@ static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr, OrigBB->getTerminator()->eraseFromParent(); unsigned PartSize = DL.getTypeStoreSize(SetValue->getType()); - Align PartAlign(MinAlign(DstAlign, PartSize)); + Align PartAlign(commonAlignment(DstAlign, PartSize)); IRBuilder<> LoopBuilder(LoopBB); PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); @@ -421,25 +422,27 @@ static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr, void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy, const TargetTransformInfo &TTI) { if (ConstantInt *CI = dyn_cast(Memcpy->getLength())) { - createMemCpyLoopKnownSize(/* InsertBefore */ Memcpy, - /* SrcAddr */ Memcpy->getRawSource(), - /* DstAddr */ Memcpy->getRawDest(), - /* CopyLen */ CI, - /* SrcAlign */ Memcpy->getSourceAlignment(), - /* DestAlign */ Memcpy->getDestAlignment(), - /* SrcIsVolatile */ Memcpy->isVolatile(), - /* DstIsVolatile */ Memcpy->isVolatile(), - /* TargetTransformInfo */ TTI); + createMemCpyLoopKnownSize( + /* InsertBefore */ Memcpy, + /* SrcAddr */ Memcpy->getRawSource(), + /* DstAddr */ Memcpy->getRawDest(), + /* CopyLen */ CI, + /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(), + /* DestAlign */ Memcpy->getDestAlign().valueOrOne(), + /* SrcIsVolatile */ Memcpy->isVolatile(), + /* DstIsVolatile */ Memcpy->isVolatile(), + /* TargetTransformInfo */ TTI); } else { - createMemCpyLoopUnknownSize(/* InsertBefore */ Memcpy, - /* SrcAddr */ Memcpy->getRawSource(), - /* DstAddr */ Memcpy->getRawDest(), - /* CopyLen */ Memcpy->getLength(), - /* SrcAlign */ Memcpy->getSourceAlignment(), - /* DestAlign */ Memcpy->getDestAlignment(), - /* SrcIsVolatile */ Memcpy->isVolatile(), - /* DstIsVolatile */ Memcpy->isVolatile(), - /* TargetTransfomrInfo */ TTI); + createMemCpyLoopUnknownSize( + /* InsertBefore */ Memcpy, + /* SrcAddr */ Memcpy->getRawSource(), + /* DstAddr */ Memcpy->getRawDest(), + /* CopyLen */ Memcpy->getLength(), + /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(), + /* DestAlign */ Memcpy->getDestAlign().valueOrOne(), + /* SrcIsVolatile */ Memcpy->isVolatile(), + /* DstIsVolatile */ Memcpy->isVolatile(), + /* TargetTransfomrInfo */ TTI); } } @@ -448,8 +451,8 @@ void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) { /* SrcAddr */ Memmove->getRawSource(), /* DstAddr */ Memmove->getRawDest(), /* CopyLen */ Memmove->getLength(), - /* SrcAlign */ Memmove->getSourceAlignment(), - /* DestAlign */ Memmove->getDestAlignment(), + /* SrcAlign */ Memmove->getSourceAlign().valueOrOne(), + /* DestAlign */ Memmove->getDestAlign().valueOrOne(), /* SrcIsVolatile */ Memmove->isVolatile(), /* DstIsVolatile */ Memmove->isVolatile()); } @@ -459,6 +462,6 @@ void llvm::expandMemSetAsLoop(MemSetInst *Memset) { /* DstAddr */ Memset->getRawDest(), /* CopyLen */ Memset->getLength(), /* SetValue */ Memset->getValue(), - /* Alignment */ Memset->getDestAlignment(), + /* Alignment */ Memset->getDestAlign().valueOrOne(), Memset->isVolatile()); } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 36358ef34fd08..9a39df4c35f88 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp 
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1445,6 +1445,13 @@ static bool isLifeTimeMarker(const Instruction *I) { return false; } +// TODO: Refine this. This should avoid cases like turning constant memcpy sizes +// into variables. +static bool replacingOperandWithVariableIsCheap(const Instruction *I, + int OpIdx) { + return !isa(I); +} + // All instructions in Insts belong to different blocks that all unconditionally // branch to a common successor. Analyze each instruction and return true if it // would be possible to sink them into their successor, creating one common @@ -1522,7 +1529,8 @@ static bool canSinkInstructions( return false; for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) { - if (I0->getOperand(OI)->getType()->isTokenTy()) + Value *Op = I0->getOperand(OI); + if (Op->getType()->isTokenTy()) // Don't touch any operand of token type. return false; @@ -1531,7 +1539,8 @@ static bool canSinkInstructions( return I->getOperand(OI) == I0->getOperand(OI); }; if (!all_of(Insts, SameAsI0)) { - if (!canReplaceOperandWithVariable(I0, OI)) + if ((isa(Op) && !replacingOperandWithVariableIsCheap(I0, OI)) || + !canReplaceOperandWithVariable(I0, OI)) // We can't create a PHI from this GEP. return false; // Don't create indirect calls! The called value is the final operand. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a1957ccda3a12..60cf5be692fb1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5919,7 +5919,7 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); unsigned Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, - Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps); + Group->getAlign().value(), AS, Legal->isMaskRequired(I), UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 377aa78730b04..53678bc97d7ed 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5308,8 +5308,10 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const { // of memory operations where possible. SmallVector Worklist; SmallPtrSet Visited; - if (auto *I = dyn_cast(V)) + if (auto *I = dyn_cast(V)) { Worklist.push_back(I); + Visited.insert(I); + } // Traverse the expression tree in bottom-up order looking for loads. If we // encounter an instruction we don't yet handle, we give up. @@ -5317,7 +5319,6 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const { auto FoundUnknownInst = false; while (!Worklist.empty() && !FoundUnknownInst) { auto *I = Worklist.pop_back_val(); - Visited.insert(I); // We should only be looking at scalar instructions here. If the current // instruction has a vector type, give up. 
@@ -5337,7 +5338,7 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const { isa(I) || isa(I) || isa(I)) { for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) - if (!Visited.count(J)) + if (Visited.insert(J).second) Worklist.push_back(J); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 19841edb1eeb7..f584b92866944 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -580,19 +580,10 @@ void VPlanPrinter::dump() { OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan"; if (!Plan.getName().empty()) OS << "\\n" << DOT::EscapeString(Plan.getName()); - if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) { - OS << ", where:"; - if (Plan.BackedgeTakenCount) { - OS << "\\n"; - Plan.BackedgeTakenCount->print(OS, SlotTracker); - OS << " := BackedgeTakenCount"; - } - for (auto Entry : Plan.Value2VPValue) { - OS << "\\n"; - Entry.second->print(OS, SlotTracker); - OS << DOT::EscapeString(" := "); - Entry.first->printAsOperand(OS, false); - } + if (Plan.BackedgeTakenCount) { + OS << ", where:\\n"; + Plan.BackedgeTakenCount->print(OS, SlotTracker); + OS << " := BackedgeTakenCount"; } OS << "\"]\n"; OS << "node [shape=rect, fontname=Courier, fontsize=30]\n"; @@ -853,7 +844,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, auto NewIGIter = Old2New.find(IG); if (NewIGIter == Old2New.end()) Old2New[IG] = new InterleaveGroup( - IG->getFactor(), IG->isReverse(), Align(IG->getAlignment())); + IG->getFactor(), IG->isReverse(), IG->getAlign()); if (Inst == IG->getInsertPos()) Old2New[IG]->setInsertPos(VPInst); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 17218e469af7f..044d2ffc3fc3e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1488,7 +1488,7 @@ class VPlan { void addVPValue(Value *V) { assert(V && "Trying to add a null Value to VPlan"); assert(!Value2VPValue.count(V) && "Value already exists in VPlan"); - Value2VPValue[V] = new VPValue(); + Value2VPValue[V] = new VPValue(V); } VPValue *getVPValue(Value *V) { diff --git a/llvm/test/Analysis/CostModel/X86/reduce-add.ll b/llvm/test/Analysis/CostModel/X86/reduce-add.ll index b5729eac4bc41..0b0e78059bc65 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-add.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-add.ll @@ -14,18 +14,26 @@ define i32 @reduce_i64(i32 %arg) { ; SSE-LABEL: 'reduce_i64' ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> 
undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX-LABEL: 'reduce_i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-LABEL: 'reduce_i64' +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i64' +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) @@ -38,9 +46,9 @@ define i32 @reduce_i64(i32 %arg) { ; SLM-LABEL: 'reduce_i64' ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 20 for 
instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i64 @llvm.experimental.vector.reduce.add.v1i64(<1 x i64> undef) @@ -55,18 +63,26 @@ define i32 @reduce_i32(i32 %arg) { ; SSE-LABEL: 'reduce_i32' ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX-LABEL: 'reduce_i32' -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-LABEL: 'reduce_i32' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated 
cost of 9 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i32' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' ; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) @@ -79,9 +95,9 @@ define i32 @reduce_i32(i32 %arg) { ; SLM-LABEL: 'reduce_i32' ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.add.v32i32(<32 x i32> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i32 @llvm.experimental.vector.reduce.add.v2i32(<2 x i32> undef) @@ -97,27 +113,36 @@ define i32 @reduce_i16(i32 %arg) { ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 
@llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX-LABEL: 'reduce_i16' -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-LABEL: 'reduce_i16' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i16' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found 
an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' @@ -134,17 +159,17 @@ define i32 @reduce_i16(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'reduce_i16' ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.add.v4i16(<4 x i16> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) -; 
SLM-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.add.v32i16(<32 x i16> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.add.v64i16(<64 x i16> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.add.v2i16(<2 x i16> undef) @@ -162,20 +187,30 @@ define i32 @reduce_i8(i32 %arg) { ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX-LABEL: 'reduce_i8' -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX1-LABEL: 'reduce_i8' +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX1-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; +; AVX2-LABEL: 'reduce_i8' +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) @@ -183,8 +218,8 @@ define i32 @reduce_i8(i32 %arg) { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' @@ -193,8 +228,8 @@ define i32 @reduce_i8(i32 %arg) { ; AVX512BW-NEXT: Cost Model: 
Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' @@ -203,8 +238,8 @@ define i32 @reduce_i8(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; SLM-LABEL: 'reduce_i8' @@ -212,9 +247,9 @@ define i32 @reduce_i8(i32 %arg) { ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) -; SLM-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.add.v32i8(<32 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V64 = call i8 
@llvm.experimental.vector.reduce.add.v64i8(<64 x i8> undef) +; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.add.v128i8(<128 x i8> undef) ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.add.v2i8(<2 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-and.ll b/llvm/test/Analysis/CostModel/X86/reduce-and.ll index 455f7326f6b79..0943984167e12 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-and.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-and.ll @@ -17,21 +17,13 @@ define i32 @reduce_i64(i32 %arg) { ; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i64' +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.and.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.and.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.and.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.and.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 
for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.and.v1i64(<1 x i64> undef) @@ -58,21 +50,13 @@ define i32 @reduce_i32(i32 %arg) { ; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.and.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.and.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.and.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.and.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.and.v2i32(<2 x i32> undef) @@ -91,58 +75,31 @@ define i32 @reduce_i32(i32 %arg) { } define i32 @reduce_i16(i32 %arg) { -; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; 
SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for 
instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i16' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an 
estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' @@ -158,9 +115,9 @@ define i32 @reduce_i16(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.and.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.and.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.and.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.and.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.and.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.and.v2i16(<2 x i16> undef) @@ -173,64 +130,34 @@ define i32 @reduce_i16(i32 %arg) { } define i32 @reduce_i8(i32 %arg) { -; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 
@llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost 
of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i8' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 
14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' @@ -238,9 +165,9 @@ define i32 @reduce_i8(i32 %arg) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' @@ -248,9 +175,9 @@ define i32 @reduce_i8(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 
@llvm.experimental.vector.reduce.and.v4i8(<4 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.and.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.and.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.and.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.and.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.and.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.and.v2i8(<2 x i8> undef) @@ -270,9 +197,9 @@ define i32 @reduce_i1(i32 %arg) { ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' @@ -282,8 +209,8 @@ define i32 @reduce_i1(i32 %arg) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; 
AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' @@ -293,8 +220,8 @@ define i32 @reduce_i1(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' @@ -303,9 +230,9 @@ define i32 @reduce_i1(i32 %arg) { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' @@ -316,7 +243,7 @@ define i32 @reduce_i1(i32 %arg) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for 
instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' @@ -325,9 +252,9 @@ define i32 @reduce_i1(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.and.v4i1(<4 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.and.v8i1(<8 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.and.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.and.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.and.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.and.v128i1(<128 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i1 @llvm.experimental.vector.reduce.and.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll index 42c9b5cedf4a0..bbf57578e6948 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-mul.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-mul.ll @@ -20,9 +20,9 @@ define i32 @reduce_i64(i32 %arg) { ; AVX1-LABEL: 'reduce_i64' ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.mul.v1i64(<1 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.mul.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 59 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.mul.v4i64(<4 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V8 = 
call i64 @llvm.experimental.vector.reduce.mul.v8i64(<8 x i64> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.mul.v16i64(<16 x i64> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i64' @@ -93,9 +93,9 @@ define i32 @reduce_i32(i32 %arg) { ; AVX1-LABEL: 'reduce_i32' ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.mul.v2i32(<2 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.mul.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.mul.v8i32(<8 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.mul.v16i32(<16 x i32> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.mul.v32i32(<32 x i32> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i32' @@ -123,58 +123,40 @@ define i32 @reduce_i32(i32 %arg) { } define i32 @reduce_i16(i32 %arg) { -; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 
@llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i16' ; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX1-NEXT: 
Cost Model: Found an estimated cost of 21 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i16' ; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' @@ -190,9 +172,9 @@ define i32 @reduce_i16(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.mul.v4i16(<4 x i16> 
undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.mul.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.mul.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.mul.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.mul.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.mul.v2i16(<2 x i16> undef) @@ -205,44 +187,24 @@ define i32 @reduce_i16(i32 %arg) { } define i32 @reduce_i8(i32 %arg) { -; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 67 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 101 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 173 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> 
undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i8' ; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 171 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 197 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 249 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 
@llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i8' @@ -250,9 +212,9 @@ define i32 @reduce_i8(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 106 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 123 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 157 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' @@ -260,9 +222,9 @@ define i32 @reduce_i8(i32 %arg) { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' @@ -270,9 +232,9 @@ define i32 @reduce_i8(i32 
%arg) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 115 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 126 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' @@ -280,9 +242,9 @@ define i32 @reduce_i8(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.mul.v4i8(<4 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.mul.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.mul.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 99 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 125 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.mul.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.mul.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 70 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.mul.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.mul.v2i8(<2 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-or.ll b/llvm/test/Analysis/CostModel/X86/reduce-or.ll index ee05562dc241e..8bdf419c13ba6 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-or.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-or.ll @@ -17,21 +17,13 @@ define i32 @reduce_i64(i32 %arg) { ; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i64' +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.or.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.or.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.or.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.or.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.or.v1i64(<1 x i64> undef) @@ -58,21 +50,13 @@ define i32 @reduce_i32(i32 %arg) { ; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) -; AVX1-NEXT: 
Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.or.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.or.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.or.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.or.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.or.v2i32(<2 x i32> undef) @@ -91,58 +75,31 @@ define i32 @reduce_i32(i32 %arg) { } define i32 @reduce_i16(i32 %arg) { -; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: 
Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX1-LABEL: 'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 
@llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i16' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost 
Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' @@ -158,9 +115,9 @@ define i32 @reduce_i16(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.or.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.or.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.or.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.or.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.or.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 @llvm.experimental.vector.reduce.or.v2i16(<2 x i16> undef) @@ -173,64 +130,34 @@ define i32 @reduce_i16(i32 %arg) { } define i32 @reduce_i8(i32 %arg) { -; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost 
Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost 
Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i8' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost 
Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' @@ -238,9 +165,9 @@ define i32 @reduce_i8(i32 %arg) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' @@ -248,9 +175,9 @@ define i32 @reduce_i8(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.or.v4i8(<4 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.or.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.or.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.or.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %V64 = call i8 @llvm.experimental.vector.reduce.or.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.or.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.or.v2i8(<2 x i8> undef) @@ -270,9 +197,9 @@ define i32 @reduce_i1(i32 %arg) { ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX1-LABEL: 'reduce_i1' @@ -282,8 +209,8 @@ define i32 @reduce_i1(i32 %arg) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX2-LABEL: 'reduce_i1' @@ -293,8 +220,8 @@ define i32 @reduce_i1(i32 %arg) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; 
AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i1' @@ -303,9 +230,9 @@ define i32 @reduce_i1(i32 %arg) { ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i1' @@ -316,7 +243,7 @@ define i32 @reduce_i1(i32 %arg) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i1' @@ -325,9 +252,9 @@ define i32 @reduce_i1(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i1 @llvm.experimental.vector.reduce.or.v4i1(<4 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i1 @llvm.experimental.vector.reduce.or.v8i1(<8 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 
9 for instruction: %V16 = call i1 @llvm.experimental.vector.reduce.or.v16i1(<16 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 72 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i1 @llvm.experimental.vector.reduce.or.v32i1(<32 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i1 @llvm.experimental.vector.reduce.or.v64i1(<64 x i1> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i1 @llvm.experimental.vector.reduce.or.v128i1(<128 x i1> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V1 = call i1 @llvm.experimental.vector.reduce.or.v1i1(<1 x i1> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll index df927b436ac13..679cd3368b50d 100644 --- a/llvm/test/Analysis/CostModel/X86/reduce-xor.ll +++ b/llvm/test/Analysis/CostModel/X86/reduce-xor.ll @@ -17,21 +17,13 @@ define i32 @reduce_i64(i32 %arg) { ; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX1-LABEL: 'reduce_i64' -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX2-LABEL: 'reduce_i64' -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i64' +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 
@llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i64 @llvm.experimental.vector.reduce.xor.v2i64(<2 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i64 @llvm.experimental.vector.reduce.xor.v4i64(<4 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V8 = call i64 @llvm.experimental.vector.reduce.xor.v8i64(<8 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i64 @llvm.experimental.vector.reduce.xor.v16i64(<16 x i64> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i64' ; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V1 = call i64 @llvm.experimental.vector.reduce.xor.v1i64(<1 x i64> undef) @@ -58,21 +50,13 @@ define i32 @reduce_i32(i32 %arg) { ; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX1-LABEL: 'reduce_i32' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX2-LABEL: 'reduce_i32' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i32' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i32 @llvm.experimental.vector.reduce.xor.v4i32(<4 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i32 @llvm.experimental.vector.reduce.xor.v8i32(<8 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i32 @llvm.experimental.vector.reduce.xor.v16i32(<16 x i32> undef) +; AVX-NEXT: Cost 
Model: Found an estimated cost of 10 for instruction: %V32 = call i32 @llvm.experimental.vector.reduce.xor.v32i32(<32 x i32> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512-LABEL: 'reduce_i32' ; AVX512-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i32 @llvm.experimental.vector.reduce.xor.v2i32(<2 x i32> undef) @@ -91,58 +75,31 @@ define i32 @reduce_i32(i32 %arg) { } define i32 @reduce_i16(i32 %arg) { -; SSE2-LABEL: 'reduce_i16' -; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i16' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i16' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; AVX1-LABEL: 
'reduce_i16' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 37 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i16' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX2-LABEL: 'reduce_i16' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i16' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 
@llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i16' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i16' @@ -158,9 +115,9 @@ define i32 @reduce_i16(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i16 @llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i16 @llvm.experimental.vector.reduce.xor.v4i16(<4 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i16 @llvm.experimental.vector.reduce.xor.v8i16(<8 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i16 @llvm.experimental.vector.reduce.xor.v16i16(<16 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i16 @llvm.experimental.vector.reduce.xor.v32i16(<32 x i16> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i16 @llvm.experimental.vector.reduce.xor.v64i16(<64 x i16> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i16 
@llvm.experimental.vector.reduce.xor.v2i16(<2 x i16> undef) @@ -173,64 +130,34 @@ define i32 @reduce_i16(i32 %arg) { } define i32 @reduce_i8(i32 %arg) { -; SSE2-LABEL: 'reduce_i8' -; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 52 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) -; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSSE3-LABEL: 'reduce_i8' -; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) -; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef -; -; SSE42-LABEL: 'reduce_i8' -; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; SSE42-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) -; SSE42-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret i32 undef -; -; AVX1-LABEL: 'reduce_i8' -; AVX1-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) -; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; SSE-LABEL: 'reduce_i8' +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; -; AVX2-LABEL: 'reduce_i8' -; AVX2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) -; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef +; AVX-LABEL: 'reduce_i8' +; AVX-NEXT: Cost Model: Found an estimated cost of 3 for 
instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512F-LABEL: 'reduce_i8' ; AVX512F-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX512F-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) ; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512BW-LABEL: 'reduce_i8' @@ -238,9 +165,9 @@ define i32 @reduce_i8(i32 %arg) { ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 55 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX512BW-NEXT: Cost Model: Found an estimated cost of 
56 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX512BW-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) ; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; ; AVX512DQ-LABEL: 'reduce_i8' @@ -248,9 +175,9 @@ define i32 @reduce_i8(i32 %arg) { ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4 = call i8 @llvm.experimental.vector.reduce.xor.v4i8(<4 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8 = call i8 @llvm.experimental.vector.reduce.xor.v8i8(<8 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16 = call i8 @llvm.experimental.vector.reduce.xor.v16i8(<16 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) -; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V32 = call i8 @llvm.experimental.vector.reduce.xor.v32i8(<32 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V64 = call i8 @llvm.experimental.vector.reduce.xor.v64i8(<64 x i8> undef) +; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V128 = call i8 @llvm.experimental.vector.reduce.xor.v128i8(<128 x i8> undef) ; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef ; %V2 = call i8 @llvm.experimental.vector.reduce.xor.v2i8(<2 x i8> undef) diff --git a/llvm/test/Analysis/CostModel/X86/reduction.ll b/llvm/test/Analysis/CostModel/X86/reduction.ll index ac373020532be..1812a074f86bb 100644 --- a/llvm/test/Analysis/CostModel/X86/reduction.ll +++ b/llvm/test/Analysis/CostModel/X86/reduction.ll @@ -69,7 +69,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX1-LABEL: 'reduction_cost_int' @@ -99,7 +99,7 @@ define fastcc i32 @reduction_cost_int(<8 x i32> %rdx) { ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.2 = add <8 x i32> %bin.rdx, %rdx.shuf.2 ; SLM-NEXT: Cost Model: Found an 
estimated cost of 4 for instruction: %rdx.shuf.3 = shufflevector <8 x i32> %bin.rdx.2, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.3 = add <8 x i32> %bin.rdx.2, %rdx.shuf.3 -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx.3, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf = shufflevector <8 x i32> %rdx, <8 x i32> undef, @@ -127,7 +127,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -138,7 +138,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -149,7 +149,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -160,7 +160,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, 
float %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -171,7 +171,7 @@ define fastcc float @pairwise_hadd(<4 x float> %rdx, float %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -199,7 +199,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -210,7 +210,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; 
SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -221,7 +221,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -232,7 +232,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -243,7 +243,7 @@ define fastcc float @pairwise_hadd_assoc(<4 x float> %rdx, float %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -270,7 +270,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 
-; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -280,7 +280,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r2 = fadd float %r, %f1 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -290,7 +290,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -300,7 +300,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -310,7 +310,7 @@ define fastcc float @pairwise_hadd_skip_first(<4 x float> %rdx, float %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = 
shufflevector <4 x float> %bin.rdx.0, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx.1 = fadd <4 x float> %bin.rdx.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx.1, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r2 = fadd float %r, %f1 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r2 ; @@ -438,7 +438,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1 ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; AVX1-LABEL: 'no_pairwise_reduction4double' @@ -462,7 +462,7 @@ define fastcc double @no_pairwise_reduction4double(<4 x double> %rdx, double %f1 ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <4 x double> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; %rdx.shuf = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> @@ -482,7 +482,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'no_pairwise_reduction8float' @@ -492,7 +492,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = 
shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'no_pairwise_reduction8float' @@ -502,7 +502,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX1-LABEL: 'no_pairwise_reduction8float' @@ -532,7 +532,7 @@ define fastcc float @no_pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = fadd <8 x float> %bin.rdx4, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x float> %bin.rdx, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <8 x float> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x float> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf3 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> @@ -604,7 +604,7 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; AVX1-LABEL: 'no_pairwise_reduction4i64' @@ -628,7 +628,7 @@ define fastcc i64 @no_pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %bin.rdx = add <4 x i64> %rdx, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf7 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 8 for 
instruction: %bin.rdx8 = add <4 x i64> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; %rdx.shuf = shufflevector <4 x i64> %rdx, <4 x i64> undef, <4 x i32> @@ -710,7 +710,7 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX1-LABEL: 'no_pairwise_reduction8i32' @@ -740,7 +740,7 @@ define fastcc i32 @no_pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx = add <8 x i32> %bin.rdx4, %rdx.shuf ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf7 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = add <8 x i32> %bin.rdx, %rdx.shuf7 -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <8 x i32> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf3 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> @@ -759,14 +759,14 @@ define fastcc double @pairwise_reduction2double(<2 x double> %rdx, double %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'pairwise_reduction2double' ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x double> %rdx, <2 x double> undef, <2 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <2 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = 
extractelement <2 x double> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'pairwise_reduction2double' @@ -806,7 +806,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'pairwise_reduction4float' @@ -816,7 +816,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'pairwise_reduction4float' @@ -826,7 +826,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX-LABEL: 'pairwise_reduction4float' @@ -836,7 +836,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost 
Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SLM-LABEL: 'pairwise_reduction4float' @@ -846,7 +846,7 @@ define fastcc float @pairwise_reduction4float(<4 x float> %rdx, float %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x float> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef, <4 x i32> @@ -868,7 +868,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSSE3-LABEL: 'pairwise_reduction4double' @@ -878,7 +878,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; SSE42-LABEL: 'pairwise_reduction4double' @@ -888,7 +888,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SSE42-NEXT: Cost Model: Found an 
estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; AVX1-LABEL: 'pairwise_reduction4double' @@ -898,7 +898,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; ; AVX2-LABEL: 'pairwise_reduction4double' @@ -918,7 +918,7 @@ define fastcc double @pairwise_reduction4double(<4 x double> %rdx, double %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x double> %bin.rdx, <4 x double> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = fadd <4 x double> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x double> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret double %r ; %rdx.shuf.0.0 = shufflevector <4 x double> %rdx, <4 x double> undef, <4 x i32> @@ -943,7 +943,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSSE3-LABEL: 'pairwise_reduction8float' @@ -956,7 +956,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: 
%rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SSE42-LABEL: 'pairwise_reduction8float' @@ -969,7 +969,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX1-LABEL: 'pairwise_reduction8float' @@ -982,7 +982,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; AVX2-LABEL: 'pairwise_reduction8float' @@ -995,7 +995,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; ; SLM-LABEL: 'pairwise_reduction8float' @@ -1008,7 +1008,7 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; 
SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x float> %bin.rdx8, <8 x float> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = fadd <8 x float> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x float> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret float %r ; %rdx.shuf.0.0 = shufflevector <8 x float> %rdx, <8 x float> undef, <8 x i32> @@ -1026,25 +1026,18 @@ define fastcc float @pairwise_reduction8float(<8 x float> %rdx, float %f1) { } define fastcc i64 @pairwise_reduction2i64(<2 x i64> %rdx, i64 %f1) { -; SSE-LABEL: 'pairwise_reduction2i64' -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; SSE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 -; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r -; -; AVX-LABEL: 'pairwise_reduction2i64' -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 -; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r +; CHECK-LABEL: 'pairwise_reduction2i64' +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; SLM-LABEL: 'pairwise_reduction2i64' ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <2 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <2 x i64> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret i64 %r ; %rdx.shuf.1.0 = shufflevector <2 x i64> %rdx, <2 x i64> undef, <2 x i32> @@ -1063,7 +1056,7 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 -; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; SLM-LABEL: 'pairwise_reduction4i32' @@ -1073,7 +1066,7 @@ define fastcc i32 @pairwise_reduction4i32(<4 x i32> %rdx, i32 %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i32> %rdx.shuf.1.0, %rdx.shuf.1.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i32> %bin.rdx8, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf.0.0 = shufflevector <4 x i32> %rdx, <4 x i32> undef, <4 x i32> @@ -1105,7 +1098,7 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %r ; ; AVX2-LABEL: 'pairwise_reduction4i64' @@ -1115,7 +1108,7 @@ define fastcc i64 @pairwise_reduction4i64(<4 x i64> %rdx, i64 %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.1.0 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.1.1 = shufflevector <4 x i64> %bin.rdx, <4 x i64> undef, <4 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx8 = add <4 x i64> %rdx.shuf.1.0, %rdx.shuf.1.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <4 x i64> %bin.rdx8, i32 0 ; AVX2-NEXT: Cost Model: 
Found an estimated cost of 0 for instruction: ret i64 %r ; ; SLM-LABEL: 'pairwise_reduction4i64' @@ -1150,7 +1143,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SSE2-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SSSE3-LABEL: 'pairwise_reduction8i16' @@ -1163,7 +1156,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSSE3-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSSE3-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SSSE3-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; SSE42-LABEL: 'pairwise_reduction8i16' @@ -1176,7 +1169,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SSE42-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE42-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SSE42-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; ; AVX-LABEL: 'pairwise_reduction8i16' @@ -1189,7 +1182,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; AVX-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; AVX-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; AVX-NEXT: Cost Model: Found an estimated cost of 0 for 
instruction: ret i16 %r ; ; SLM-LABEL: 'pairwise_reduction8i16' @@ -1202,7 +1195,7 @@ define fastcc i16 @pairwise_reduction8i16(<8 x i16> %rdx, i16 %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i16> %bin.rdx8, <8 x i16> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i16> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i16> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %r ; %rdx.shuf.0.0 = shufflevector <8 x i16> %rdx, <8 x i16> undef, <8 x i32> @@ -1230,7 +1223,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SSE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SSE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; SSE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX1-LABEL: 'pairwise_reduction8i32' @@ -1243,7 +1236,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX1-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; AVX1-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; AVX2-LABEL: 'pairwise_reduction8i32' @@ -1256,7 +1249,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; AVX2-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; AVX2-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; ; 
SLM-LABEL: 'pairwise_reduction8i32' @@ -1269,7 +1262,7 @@ define fastcc i32 @pairwise_reduction8i32(<8 x i32> %rdx, i32 %f1) { ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %rdx.shuf.2.0 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %rdx.shuf.2.1 = shufflevector <8 x i32> %bin.rdx8, <8 x i32> undef, <8 x i32> ; SLM-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %bin.rdx9 = add <8 x i32> %rdx.shuf.2.0, %rdx.shuf.2.1 -; SLM-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 +; SLM-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %r = extractelement <8 x i32> %bin.rdx9, i32 0 ; SLM-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %r ; %rdx.shuf.0.0 = shufflevector <8 x i32> %rdx, <8 x i32> undef, <8 x i32> diff --git a/llvm/test/Analysis/ValueTracking/known-bits-from-operator-constexpr.ll b/llvm/test/Analysis/ValueTracking/known-bits-from-operator-constexpr.ll new file mode 100644 index 0000000000000..cff06ae074128 --- /dev/null +++ b/llvm/test/Analysis/ValueTracking/known-bits-from-operator-constexpr.ll @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instsimplify -S | FileCheck %s + +; Reproducer for a crash in computeKnownBitsFromOperator due to blindly +; casting from llvm::Operator to ExtractElementInst. That does not work +; if the Operator is a ConstantExpr. +@g = global [21 x i32] zeroinitializer +define i32 @test1(i32 %a) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: [[T:%.*]] = sub i32 [[A:%.*]], extractelement (<4 x i32> ptrtoint (<4 x i32*> getelementptr inbounds ([21 x i32], [21 x i32]* @g, <4 x i32> zeroinitializer, <4 x i32> ) to <4 x i32>), i32 3) +; CHECK-NEXT: ret i32 [[T]] +; + %t = sub i32 %a, extractelement (<4 x i32> ptrtoint (<4 x i32 *> getelementptr inbounds ([21 x i32], [21 x i32] * @g, <4 x i32> zeroinitializer, <4 x i32> ) to <4 x i32>), i32 3) + ret i32 %t +} diff --git a/llvm/test/Analysis/ValueTracking/knownnonzero-shift.ll b/llvm/test/Analysis/ValueTracking/knownnonzero-shift.ll index e59d19cc2e268..c91e531578404 100644 --- a/llvm/test/Analysis/ValueTracking/knownnonzero-shift.ll +++ b/llvm/test/Analysis/ValueTracking/knownnonzero-shift.ll @@ -1,12 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -instsimplify -S < %s | FileCheck %s -; CHECK-LABEL: @test define i1 @test(i8 %p, i8* %pq) { +; CHECK-LABEL: @test( +; CHECK-NEXT: ret i1 true +; %q = load i8, i8* %pq, !range !0 ; %q is known nonzero; no known bits %1 = shl i8 %p, %q ; because %q is nonzero, %1[0] is known to be zero. %2 = and i8 %1, 1 %x = icmp eq i8 %2, 0 - ; CHECK: ret i1 true ret i1 %x } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 8ee9342e5c622..aa0ef61094616 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -314,6 +314,18 @@ # DEBUG-NEXT: G_SMULH (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_UADDSAT (opcode 117): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SADDSAT (opcode 118): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_USUBSAT (opcode 119): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SSUBSAT (opcode 120): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll b/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll index 88507680af74f..0210a9a89ca4e 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/localizer-arm64-tti.ll @@ -59,3 +59,46 @@ if.end: ret i32 0 } +@tls_gv = common thread_local global i32 0, align 4 + +; This test checks that we don't try to localize TLS variables on Darwin. +; If the user happens to be inside a call sequence, we could end up rematerializing +; below a physreg write, clobbering it (TLS accesses on Darwin need a function call). +; For now, we check we don't localize at all. We could in theory make sure that +; we don't localize into the middle of a call sequence instead. +define i32 @darwin_tls() { + ; CHECK-LABEL: name: darwin_tls + ; CHECK: bb.1.entry: + ; CHECK: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK: [[GV:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @tls_gv + ; CHECK: [[GV1:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2 + ; CHECK: [[C:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0 + ; CHECK: [[GV2:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var1 + ; CHECK: [[LOAD:%[0-9]+]]:gpr(s32) = G_LOAD [[GV2]](p0) :: (dereferenceable load 4 from @var1) + ; CHECK: [[C1:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[ICMP:%[0-9]+]]:gpr(s32) = G_ICMP intpred(ne), [[LOAD]](s32), [[C1]] + ; CHECK: [[TRUNC:%[0-9]+]]:gpr(s1) = G_TRUNC [[ICMP]](s32) + ; CHECK: G_BRCOND [[TRUNC]](s1), %bb.3 + ; CHECK: bb.2.if.then: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: [[LOAD1:%[0-9]+]]:gpr(s32) = G_LOAD [[GV]](p0) :: (dereferenceable load 4 from @tls_gv) + ; CHECK: [[GV3:%[0-9]+]]:gpr(p0) = G_GLOBAL_VALUE @var2 + ; CHECK: G_STORE [[LOAD1]](s32), [[GV3]](p0) :: (store 4 into @var2) + ; CHECK: bb.3.if.end: + ; CHECK: [[C2:%[0-9]+]]:gpr(s32) = G_CONSTANT i32 0 + ; CHECK: $w0 = COPY [[C2]](s32) + ; CHECK: RET_ReallyLR implicit $w0 +entry: + %0 = load i32, i32* @var1, align 4 + %cmp = icmp eq i32 %0, 1 + br i1 %cmp, label %if.then, label %if.end + +if.then: + %tls = load i32, i32* @tls_gv, align 4 + store i32 %tls, i32* @var2, align 4 + br label %if.end + +if.end: + ret i32 0 +} + diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-select.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-select.mir new file mode 100644 index 0000000000000..646d7f8eea3fa --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-select.mir @@ -0,0 +1,95 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck 
%s + +name: self +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w0, $w1 + ; Optimize (cond ? %a : %a) -> %a + ; CHECK-LABEL: name: self + ; CHECK: liveins: $w0, $w1 + ; CHECK: %a:_(s32) = COPY $w0 + ; CHECK: $w0 = COPY %a(s32) + ; CHECK: RET_ReallyLR implicit $w0 + %a:_(s32) = COPY $w0 + %cond_wide:gpr(s32) = COPY $w1 + %cond:gpr(s1) = G_TRUNC %cond_wide(s32) + %select:_(s32) = G_SELECT %cond(s1), %a, %a + $w0 = COPY %select(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: self_with_copy +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w0, $w1 + ; Optimize (cond ? %a : %b) -> %a + ; + ; This shows that we are looking through copies correctly and deduce that + ; %b is a copy from %a. + ; + ; CHECK-LABEL: name: self_with_copy + ; CHECK: liveins: $w0, $w1 + ; CHECK: %a:_(s32) = COPY $w0 + ; CHECK: $w0 = COPY %a(s32) + ; CHECK: RET_ReallyLR implicit $w0 + %a:_(s32) = COPY $w0 + %b:_(s32) = COPY %a + %cond_wide:gpr(s32) = COPY $w1 + %cond:gpr(s1) = G_TRUNC %cond_wide(s32) + %select:_(s32) = G_SELECT %cond(s1), %a, %b + $w0 = COPY %select(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: self_with_equivalent +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w0, $w1 + ; Optimize (cond ? %a : %b) -> %a + ; + ; This shows that we can detect when %a == %b, even though they define + ; different virtual registers. + ; + ; CHECK-LABEL: name: self_with_equivalent + ; CHECK: liveins: $w0, $w1 + ; CHECK: %a:_(s32) = COPY $w0 + ; CHECK: $w0 = COPY %a(s32) + ; CHECK: RET_ReallyLR implicit $w0 + %a:_(s32) = COPY $w0 + %b:_(s32) = COPY $w0 + %cond_wide:gpr(s32) = COPY $w1 + %cond:gpr(s1) = G_TRUNC %cond_wide(s32) + %select:_(s32) = G_SELECT %cond(s1), %a, %b + $w0 = COPY %select(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: self_not_equivalent +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w0, $w1 + ; In this case, the copies are not equivalent, so there is no optimization. + ; CHECK-LABEL: name: self_not_equivalent + ; CHECK: liveins: $w0, $w1 + ; CHECK: %a:_(s32) = COPY $w0 + ; CHECK: %b:_(s32) = COPY $w1 + ; CHECK: %cond_wide:gpr(s32) = COPY $w1 + ; CHECK: %cond:gpr(s1) = G_TRUNC %cond_wide(s32) + ; CHECK: %select:_(s32) = G_SELECT %cond(s1), %a, %b + ; CHECK: $w0 = COPY %select(s32) + ; CHECK: RET_ReallyLR implicit $w0 + %a:_(s32) = COPY $w0 + %b:_(s32) = COPY $w1 + %cond_wide:gpr(s32) = COPY $w1 + %cond:gpr(s1) = G_TRUNC %cond_wide(s32) + %select:_(s32) = G_SELECT %cond(s1), %a, %b + $w0 = COPY %select(s32) + RET_ReallyLR implicit $w0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-undef.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-undef.mir index 2c5f12ac9d8dd..f1e14a32ee222 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-undef.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-undef.mir @@ -166,3 +166,61 @@ body: | RET_ReallyLR implicit $w0 ... +--- +name: shufflevector_undef_ops_to_undef +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + ; CHECK-LABEL: name: shufflevector_undef_ops_to_undef + ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF + ; CHECK: $d0 = COPY [[DEF]](<2 x s32>) + ; CHECK: RET_ReallyLR implicit $d0 + %1:_(<2 x s32>) = G_IMPLICIT_DEF + %2:_(<2 x s32>) = G_IMPLICIT_DEF + %0:_(<2 x s32>) = G_SHUFFLE_VECTOR %1(<2 x s32>), %2(<2 x s32>), shufflemask(0, 1) + $d0 = COPY %0(<2 x s32>) + RET_ReallyLR implicit $d0 + +... 
+--- +name: shufflevector_undef_mask_to_undef +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + liveins: $d0, $d1 + ; CHECK-LABEL: name: shufflevector_undef_mask_to_undef + ; CHECK: liveins: $d0, $d1 + ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF + ; CHECK: $d0 = COPY [[DEF]](<2 x s32>) + ; CHECK: RET_ReallyLR implicit $d0 + %0:_(<2 x s32>) = COPY $d0 + %1:_(<2 x s32>) = COPY $d1 + %2:_(<2 x s32>) = G_SHUFFLE_VECTOR %0(<2 x s32>), %1, shufflemask(undef, undef) + $d0 = COPY %2(<2 x s32>) + RET_ReallyLR implicit $d0 + +... +--- +name: shufflevector_not_all_ops_undef +alignment: 4 +tracksRegLiveness: true +body: | + bb.0: + liveins: $d0 + ; Show that we don't do the combine when one of the vectors is not a + ; G_IMPLICIT_DEF. + ; + ; CHECK-LABEL: name: shufflevector_not_all_ops_undef + ; CHECK: liveins: $d0 + ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 + ; CHECK: [[SHUF:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[DEF]](<2 x s32>), [[COPY]], shufflemask(0, 1) + ; CHECK: $d0 = COPY [[SHUF]](<2 x s32>) + ; CHECK: RET_ReallyLR implicit $d0 + %1:_(<2 x s32>) = G_IMPLICIT_DEF + %2:_(<2 x s32>) = COPY $d0 + %0:_(<2 x s32>) = G_SHUFFLE_VECTOR %1(<2 x s32>), %2(<2 x s32>), shufflemask(0, 1) + $d0 = COPY %0(<2 x s32>) + RET_ReallyLR implicit $d0 diff --git a/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll b/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll new file mode 100644 index 0000000000000..d2a55f3e95bd4 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/dag-combine-trunc-build-vec.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64 < %s | FileCheck %s + +; This test is reduced from https://github.com/android/ndk/issues/1207 for an +; issue with the following DAGCombine operation: + truncate(build_vector(x,y)) -> build_vector(truncate(x),truncate(y)) +; The combine should avoid creating illegal types if types have already been +; legalized. + +define void @no_combine(i32 %p) local_unnamed_addr { +; CHECK-LABEL: no_combine: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.4s, w0 +; CHECK-NEXT: movi v1.4h, #4 +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: xtn v1.8b, v0.8h +; CHECK-NEXT: xtn2 v1.16b, v0.8h +; CHECK-NEXT: str q1, [x8] +; CHECK-NEXT: ret + +; The two shufflevector operations are needed to force the DAGCombine to happen +; after type legalization and before operation legalization. Removing either +; makes the combine happen before type legalization and the issue no longer +; reproduces. + %1 = insertelement <16 x i32> undef, i32 %p, i32 0 + %2 = shufflevector <16 x i32> %1, <16 x i32> undef, <16 x i32> + %3 = shufflevector <16 x i32> %2, <16 x i32> , <16 x i32> + %4 = trunc <16 x i32> %3 to <16 x i8> + %5 = bitcast i8* undef to <16 x i8>* + store <16 x i8> %4, <16 x i8>* %5, align 1 + ret void +} + +; Test case to ensure that the combine is done before type legalization.
+define void @do_combine(i32 %p) local_unnamed_addr { +; CHECK-LABEL: do_combine: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.16b, w0 +; CHECK-NEXT: str q0, [x8] +; CHECK-NEXT: ret + %1 = insertelement <16 x i32> undef, i32 %p, i32 0 + %2 = shufflevector <16 x i32> %1, <16 x i32> undef, <16 x i32> + %3 = trunc <16 x i32> %2 to <16 x i8> + %4 = bitcast i8* undef to <16 x i8>* + store <16 x i8> %3, <16 x i8>* %4, align 1 + ret void +} diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll index 78251707a0105..2de5668d1b9ea 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-scaled-offset.ll @@ -1,200 +1,200 @@ ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s ; PRFB , , [, .S, ] -> 32-bit scaled offset -define void @llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx4vi32: +define void @llvm_aarch64_sve_prfb_gather_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_scaled_uxtw_nx4vi32: ; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.s, uxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfb.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx4vi32: +define void @llvm_aarch64_sve_prfb_gather_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_scaled_sxtw_nx4vi32: ; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.s, sxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfb.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFB , , [, .D, ] -> 32-bit unpacked scaled offset -define void @llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_uxtw_nx2vi64: +define void @llvm_aarch64_sve_prfb_gather_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_scaled_uxtw_nx2vi64: ; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d, uxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfb.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_sxtw_nx2vi64: +define void @llvm_aarch64_sve_prfb_gather_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_scaled_sxtw_nx2vi64: ; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d, sxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfb.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFB , , [, .D] -> 64-bit scaled offset -define void @llvm_aarch64_sve_gather_prfb_scaled_nx2vi64( %Pg, i8* %base, %offset) 
nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_scaled_nx2vi64: +define void @llvm_aarch64_sve_prfb_gather_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_scaled_nx2vi64: ; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfb.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PRFH , , [, .S, ] -> 32-bit scaled offset -define void @llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx4vi32: +define void @llvm_aarch64_sve_prfh_gather_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_scaled_uxtw_nx4vi32: ; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.s, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfh.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx4vi32: +define void @llvm_aarch64_sve_prfh_gather_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_scaled_sxtw_nx4vi32: ; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.s, sxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfh.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFH , , [, .D, #1] -> 32-bit unpacked scaled offset -define void @llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_uxtw_nx2vi64: +define void @llvm_aarch64_sve_prfh_gather_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_scaled_uxtw_nx2vi64: ; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfh.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_sxtw_nx2vi64: +define void @llvm_aarch64_sve_prfh_gather_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_scaled_sxtw_nx2vi64: ; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, sxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfh.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFH , , [, .D] -> 64-bit scaled offset -define void @llvm_aarch64_sve_gather_prfh_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_scaled_nx2vi64: +define void @llvm_aarch64_sve_prfh_gather_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_scaled_nx2vi64: ; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, lsl #1] ; CHECK-NEXT: ret - call void 
@llvm.aarch64.sve.gather.prfh.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfh.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PRFW , , [, .S, ] -> 32-bit scaled offset -define void @llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx4vi32: +define void @llvm_aarch64_sve_prfw_gather_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_scaled_uxtw_nx4vi32: ; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.s, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfw.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx4vi32: +define void @llvm_aarch64_sve_prfw_gather_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_scaled_sxtw_nx4vi32: ; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.s, sxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfw.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFW , , [, .D, #2] -> 32-bit unpacked scaled offset -define void @llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_uxtw_nx2vi64: +define void @llvm_aarch64_sve_prfw_gather_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_scaled_uxtw_nx2vi64: ; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfw.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_sxtw_nx2vi64: +define void @llvm_aarch64_sve_prfw_gather_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_scaled_sxtw_nx2vi64: ; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, sxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfw.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFW , , [, .D] -> 64-bit scaled offset -define void @llvm_aarch64_sve_gather_prfw_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_scaled_nx2vi64: +define void @llvm_aarch64_sve_prfw_gather_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_scaled_nx2vi64: ; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, lsl #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfw.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PRFD , , [, .S, ] -> 32-bit scaled offset -define void 
@llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx4vi32: +define void @llvm_aarch64_sve_prfd_gather_scaled_uxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_scaled_uxtw_nx4vi32: ; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.s, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfd.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx4vi32: +define void @llvm_aarch64_sve_prfd_gather_scaled_sxtw_nx4vi32( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_scaled_sxtw_nx4vi32: ; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.s, sxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfd.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFD , , [, .D, #3] -> 32-bit unpacked scaled offset -define void @llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_uxtw_nx2vi64: +define void @llvm_aarch64_sve_prfd_gather_scaled_uxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_scaled_uxtw_nx2vi64: ; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfd.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_sxtw_nx2vi64: +define void @llvm_aarch64_sve_prfd_gather_scaled_sxtw_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_scaled_sxtw_nx2vi64: ; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, sxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) + call void @llvm.aarch64.sve.prfd.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 1) ret void } ; PRFD , , [, .D] -> 64-bit scaled offset -define void @llvm_aarch64_sve_gather_prfd_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_scaled_nx2vi64: +define void @llvm_aarch64_sve_prfd_gather_scaled_nx2vi64( %Pg, i8* %base, %offset) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_scaled_nx2vi64: ; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, lsl #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) - ret void - } - -declare void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfb.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfb.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfb.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void 
@llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) + call void @llvm.aarch64.sve.prfd.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 1) + ret void + } + +declare void @llvm.aarch64.sve.prfb.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfb.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfb.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfb.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfb.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.scaled.uxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.scaled.sxtw.nx4vi32( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.scaled.uxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.scaled.sxtw.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop) +declare void 
@llvm.aarch64.sve.prfd.gather.scaled.nx2vi64( %Pg, i8* %base, %offset, i32 %prfop)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll
index 481302ce59720..8be10be55f278 100644
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-imm-offset.ll
@@ -1,82 +1,82 @@
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s
; PRFB , , [.S{, #}] -> 32-bit element
-define void @llvm_aarch64_sve_gather_prfb_nx4vi32( %bases, %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32:
+define void @llvm_aarch64_sve_prfb_gather_nx4vi32( %bases, %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx4vi32:
; CHECK-NEXT: prfb pldl1strm, p0, [z0.s, #7]
; CHECK-NEXT: ret
- call void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 7, i32 1)
+ call void @llvm.aarch64.sve.prfb.gather.nx4vi32( %Pg, %bases, i64 7, i32 1)
ret void
}
; PRFB , , [.D{, #}] -> 64-bit element
-define void @llvm_aarch64_sve_gather_prfb_nx2vi64( %bases, %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64:
+define void @llvm_aarch64_sve_prfb_gather_nx2vi64( %bases, %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx2vi64:
; CHECK-NEXT: prfb pldl1strm, p0, [z0.d, #7]
; CHECK-NEXT: ret
- call void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 7, i32 1)
+ call void @llvm.aarch64.sve.prfb.gather.nx2vi64( %Pg, %bases, i64 7, i32 1)
ret void
}
; PRFH , , [.S{, #}] -> 32-bit element
-define void @llvm_aarch64_sve_gather_prfh_nx4vi32( %bases, %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32:
+define void @llvm_aarch64_sve_prfh_gather_nx4vi32( %bases, %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx4vi32:
; CHECK-NEXT: prfh pldl1strm, p0, [z0.s, #6]
; CHECK-NEXT: ret
- call void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 6, i32 1)
+ call void @llvm.aarch64.sve.prfh.gather.nx4vi32( %Pg, %bases, i64 6, i32 1)
ret void
}
; PRFH , , [.D{, #}] -> 64-bit element
-define void @llvm_aarch64_sve_gather_prfh_nx2vi64( %bases, %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64:
+define void @llvm_aarch64_sve_prfh_gather_nx2vi64( %bases, %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx2vi64:
; CHECK-NEXT: prfh pldl1strm, p0, [z0.d, #6]
; CHECK-NEXT: ret
- call void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 6, i32 1)
+ call void @llvm.aarch64.sve.prfh.gather.nx2vi64( %Pg, %bases, i64 6, i32 1)
ret void
}
; PRFW , , [.S{, #}] -> 32-bit element
-define void @llvm_aarch64_sve_gather_prfw_nx4vi32( %bases, %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32:
+define void @llvm_aarch64_sve_prfw_gather_nx4vi32( %bases, %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx4vi32:
; CHECK-NEXT: prfw pldl1strm, p0, [z0.s, #12]
; CHECK-NEXT: ret
- call void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 12, i32 1)
+ call void @llvm.aarch64.sve.prfw.gather.nx4vi32( %Pg, %bases, i64 12, i32 1)
ret void
}
; PRFW , , [.D{, #}] -> 64-bit element
-define void @llvm_aarch64_sve_gather_prfw_nx2vi64( %bases, %Pg) nounwind {
-; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64:
+define void @llvm_aarch64_sve_prfw_gather_nx2vi64( %bases, %Pg) nounwind {
+; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx2vi64:
; CHECK-NEXT:
prfw pldl1strm, p0, [z0.d, #12] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 12, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx2vi64( %Pg, %bases, i64 12, i32 1) ret void } ; PRFD , , [.S{, #}] -> 32-bit element -define void @llvm_aarch64_sve_gather_prfd_nx4vi32( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32: +define void @llvm_aarch64_sve_prfd_gather_nx4vi32( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx4vi32: ; CHECK-NEXT: prfd pldl1strm, p0, [z0.s, #16] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 16, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx4vi32( %Pg, %bases, i64 16, i32 1) ret void } ; PRFD , , [.D{, #}] -> 64-bit element -define void @llvm_aarch64_sve_gather_prfd_nx2vi64( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64: +define void @llvm_aarch64_sve_prfd_gather_nx2vi64( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx2vi64: ; CHECK-NEXT: prfd pldl1strm, p0, [z0.d, #16] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 16, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx2vi64( %Pg, %bases, i64 16, i32 1) ret void } -declare void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfb.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfb.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll index 4b0b42eb73b98..ca027edfd5def 100644 --- a/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-gather-prefetches-vect-base-invalid-imm-offset.ll @@ -1,286 +1,286 @@ ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s ; PRFB , , [.S{, #}] -> 32-bit element, imm = 0, 1, ..., 31 -define void @llvm_aarch64_sve_gather_prfb_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32_runtime_offset: +define void @llvm_aarch64_sve_prfb_gather_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) 
nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx4vi32_runtime_offset: ; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.s, uxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfb.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfb_gather_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx4vi32_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #32 ; CHECK-NEXT: prfb pldl1strm, p0, [x[[N]], z0.s, uxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 32, i32 1) + call void @llvm.aarch64.sve.prfb.gather.nx4vi32( %Pg, %bases, i64 32, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx4vi32_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfb_gather_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx4vi32_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfb pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfb.gather.nx4vi32( %Pg, %bases, i64 -1, i32 1) ret void } ; PRFB , , [.D{, #}] -> 64-bit element, imm = 0, 1, ..., 31 -define void @llvm_aarch64_sve_gather_prfb_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64_runtime_offset: +define void @llvm_aarch64_sve_prfb_gather_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx2vi64_runtime_offset: ; CHECK-NEXT: prfb pldl1strm, p0, [x0, z0.d, uxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfb.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfb_gather_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx2vi64_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #32 ; CHECK-NEXT: prfb pldl1strm, p0, [x[[N]], z0.d, uxtw] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 32, i32 1) + call void @llvm.aarch64.sve.prfb.gather.nx2vi64( %Pg, %bases, i64 32, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfb_nx2vi64_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfb_gather_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfb_gather_nx2vi64_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfb pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw] ; CHECK-NEXT: 
ret - call void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfb.gather.nx2vi64( %Pg, %bases, i64 -1, i32 1) ret void } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PRFH , , [.S{, #}] -> 32-bit element, imm = 0, 2, ..., 62 -define void @llvm_aarch64_sve_gather_prfh_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_runtime_offset: +define void @llvm_aarch64_sve_prfh_gather_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx4vi32_runtime_offset: ; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.s, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfh_gather_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx4vi32_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #63 ; CHECK-NEXT: prfh pldl1strm, p0, [x[[N]], z0.s, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 63, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx4vi32( %Pg, %bases, i64 63, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfh_gather_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx4vi32_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx4vi32( %Pg, %bases, i64 -1, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_2( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_2: +define void @llvm_aarch64_sve_prfh_gather_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_2( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_2: ; CHECK-NEXT: mov w[[N:[0-9]+]], #33 ; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 33, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx4vi32( %Pg, %bases, i64 33, i32 1) ret void } ; PRFH , , [.D{, #}] -> 64-bit element, imm = 0, 2, ..., 62 -define void @llvm_aarch64_sve_gather_prfh_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_runtime_offset: +define void @llvm_aarch64_sve_prfh_gather_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx2vi64_runtime_offset: ; CHECK-NEXT: prfh pldl1strm, p0, [x0, z0.d, uxtw #1] ; CHECK-NEXT: ret - call void 
@llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfh_gather_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx2vi64_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #63 ; CHECK-NEXT: prfh pldl1strm, p0, [x[[N]], z0.d, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 63, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx2vi64( %Pg, %bases, i64 63, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfh_gather_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx2vi64_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx2vi64( %Pg, %bases, i64 -1, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_2( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfh_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_2: +define void @llvm_aarch64_sve_prfh_gather_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_2( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfh_gather_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_2: ; CHECK-NEXT: mov w[[N:[0-9]+]], #33 ; CHECK-NEXT: prfh pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #1] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 33, i32 1) + call void @llvm.aarch64.sve.prfh.gather.nx2vi64( %Pg, %bases, i64 33, i32 1) ret void } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PRFW , , [.S{, #}] -> 32-bit element, imm = 0, 4, ..., 124 -define void @llvm_aarch64_sve_gather_prfw_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_runtime_offset: +define void @llvm_aarch64_sve_prfw_gather_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx4vi32_runtime_offset: ; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.s, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfw_gather_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx4vi32_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #125 ; CHECK-NEXT: prfw pldl1strm, p0, [x[[N]], z0.s, uxtw #2] ; CHECK-NEXT: ret - 
call void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 125, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx4vi32( %Pg, %bases, i64 125, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfw_gather_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx4vi32_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx4vi32( %Pg, %bases, i64 -1, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_4( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_4: +define void @llvm_aarch64_sve_prfw_gather_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_4( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_4: ; CHECK-NEXT: mov w[[N:[0-9]+]], #33 ; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 33, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx4vi32( %Pg, %bases, i64 33, i32 1) ret void } ; PRFW , , [.D{, #}] -> 64-bit element, imm = 0, 4, ..., 124 -define void @llvm_aarch64_sve_gather_prfw_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_runtime_offset: +define void @llvm_aarch64_sve_prfw_gather_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx2vi64_runtime_offset: ; CHECK-NEXT: prfw pldl1strm, p0, [x0, z0.d, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfw_gather_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx2vi64_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #125 ; CHECK-NEXT: prfw pldl1strm, p0, [x[[N]], z0.d, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 125, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx2vi64( %Pg, %bases, i64 125, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfw_gather_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx2vi64_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 
-1, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx2vi64( %Pg, %bases, i64 -1, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_4( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfw_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_4: +define void @llvm_aarch64_sve_prfw_gather_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_4( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfw_gather_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_4: ; CHECK-NEXT: mov w[[N:[0-9]+]], #33 ; CHECK-NEXT: prfw pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #2] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 33, i32 1) + call void @llvm.aarch64.sve.prfw.gather.nx2vi64( %Pg, %bases, i64 33, i32 1) ret void } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PRFD , , [.S{, #}] -> 32-bit element, imm = 0, 8, ..., 248 -define void @llvm_aarch64_sve_gather_prfd_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_runtime_offset: +define void @llvm_aarch64_sve_prfd_gather_nx4vi32_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx4vi32_runtime_offset: ; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.s, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfd_gather_nx4vi32_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx4vi32_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #125 ; CHECK-NEXT: prfd pldl1strm, p0, [x[[N]], z0.s, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 125, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx4vi32( %Pg, %bases, i64 125, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfd_gather_nx4vi32_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx4vi32_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx4vi32( %Pg, %bases, i64 -1, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_8( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_8: +define void @llvm_aarch64_sve_prfd_gather_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_8( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx4vi32_invalid_immediate_offset_inbound_not_multiple_of_8: ; CHECK-NEXT: mov w[[N:[0-9]+]], #33 ; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.s, uxtw #3] ; CHECK-NEXT: ret 
- call void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, i64 33, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx4vi32( %Pg, %bases, i64 33, i32 1) ret void } ; PRFD , , [.D{, #}] -> 64-bit element, imm = 0, 4, ..., 248 -define void @llvm_aarch64_sve_gather_prfd_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_runtime_offset: +define void @llvm_aarch64_sve_prfd_gather_nx2vi64_runtime_offset( %bases, i64 %imm, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx2vi64_runtime_offset: ; CHECK-NEXT: prfd pldl1strm, p0, [x0, z0.d, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 %imm, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_upper_bound: +define void @llvm_aarch64_sve_prfd_gather_nx2vi64_invalid_immediate_offset_upper_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx2vi64_invalid_immediate_offset_upper_bound: ; CHECK-NEXT: mov w[[N:[0-9]+]], #125 ; CHECK-NEXT: prfd pldl1strm, p0, [x[[N]], z0.d, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 125, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx2vi64( %Pg, %bases, i64 125, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_lower_bound: +define void @llvm_aarch64_sve_prfd_gather_nx2vi64_invalid_immediate_offset_lower_bound( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx2vi64_invalid_immediate_offset_lower_bound: ; CHECK-NEXT: mov x[[N:[0-9]+]], #-1 ; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 -1, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx2vi64( %Pg, %bases, i64 -1, i32 1) ret void } -define void @llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_8( %bases, %Pg) nounwind { -; CHECK-LABEL: llvm_aarch64_sve_gather_prfd_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_8: +define void @llvm_aarch64_sve_prfd_gather_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_8( %bases, %Pg) nounwind { +; CHECK-LABEL: llvm_aarch64_sve_prfd_gather_nx2vi64_invalid_immediate_offset_inbound_not_multiple_of_8: ; CHECK-NEXT: mov w[[N:[0-9]+]], #33 ; CHECK-NEXT: prfd pldl1strm, p0, [x[[N:[0-9]+]], z0.d, uxtw #3] ; CHECK-NEXT: ret - call void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 33, i32 1) + call void @llvm.aarch64.sve.prfd.gather.nx2vi64( %Pg, %bases, i64 33, i32 1) ret void } -declare void @llvm.aarch64.sve.gather.prfb.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfb.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfh.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfw.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.nx4vi32( %Pg, %bases, 
i64 %imm, i32 %prfop) -declare void @llvm.aarch64.sve.gather.prfd.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfb.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfb.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfh.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfw.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.nx4vi32( %Pg, %bases, i64 %imm, i32 %prfop) +declare void @llvm.aarch64.sve.prfd.gather.nx2vi64( %Pg, %bases, i64 %imm, i32 %prfop) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst1.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst1.ll new file mode 100644 index 0000000000000..1501647b0883c --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ldst1.ll @@ -0,0 +1,182 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; LD1B +; + +define @ld1b_i8( %pred, i8* %addr) { +; CHECK-LABEL: ld1b_i8: +; CHECK: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld1.nxv16i8( %pred, + i8* %addr) + ret %res +} + +; +; LD1H +; + +define @ld1h_i16( %pred, i16* %addr) { +; CHECK-LABEL: ld1h_i16: +; CHECK: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld1.nxv8i16( %pred, + i16* %addr) + ret %res +} + +define @ld1h_f16( %pred, half* %addr) { +; CHECK-LABEL: ld1h_f16: +; CHECK: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld1.nxv8f16( %pred, + half* %addr) + ret %res +} + +; +; LD1W +; + +define @ld1w_i32( %pred, i32* %addr) { +; CHECK-LABEL: ld1w_i32: +; CHECK: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld1.nxv4i32( %pred, + i32* %addr) + ret %res +} + +define @ld1w_f32( %pred, float* %addr) { +; CHECK-LABEL: ld1w_f32: +; CHECK: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld1.nxv4f32( %pred, + float* %addr) + ret %res +} + +; +; LD1D +; + +define @ld1d_i64( %pred, i64* %addr) { +; CHECK-LABEL: ld1d_i64: +; CHECK: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld1.nxv2i64( %pred, + i64* %addr) + ret %res +} + +define @ld1d_f64( %pred, double* %addr) { +; CHECK-LABEL: ld1d_f64: +; CHECK: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sve.ld1.nxv2f64( %pred, + double* %addr) + ret %res +} + +; +; ST1B +; + +define void @st1b_i8( %data, %pred, i8* %addr) { +; CHECK-LABEL: st1b_i8: +; CHECK: st1b { z0.b }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.nxv16i8( %data, + %pred, + i8* %addr) + ret void +} + +; +; ST1H +; + +define void @st1h_i16( %data, %pred, i16* %addr) { +; CHECK-LABEL: st1h_i16: +; CHECK: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.nxv8i16( %data, + %pred, + i16* %addr) + ret void +} + +define void @st1h_f16( %data, %pred, half* %addr) { +; CHECK-LABEL: st1h_f16: +; CHECK: st1h { z0.h }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.nxv8f16( %data, + %pred, + half* %addr) + ret void +} + +; +; ST1W +; + +define void @st1w_i32( %data, %pred, i32* %addr) { +; CHECK-LABEL: st1w_i32: +; CHECK: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.nxv4i32( %data, + %pred, + i32* %addr) + ret 
void +} + +define void @st1w_f32( %data, %pred, float* %addr) { +; CHECK-LABEL: st1w_f32: +; CHECK: st1w { z0.s }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.nxv4f32( %data, + %pred, + float* %addr) + ret void +} + +; +; ST1D +; + +define void @st1d_i64( %data, %pred, i64* %addr) { +; CHECK-LABEL: st1d_i64: +; CHECK: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.nxv2i64( %data, + %pred, + i64* %addr) + ret void +} + +define void @st1d_f64( %data, %pred, double* %addr) { +; CHECK-LABEL: st1d_f64: +; CHECK: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.nxv2f64( %data, + %pred, + double* %addr) + ret void +} + +declare @llvm.aarch64.sve.ld1.nxv16i8(, i8*) +declare @llvm.aarch64.sve.ld1.nxv8i16(, i16*) +declare @llvm.aarch64.sve.ld1.nxv4i32(, i32*) +declare @llvm.aarch64.sve.ld1.nxv2i64(, i64*) +declare @llvm.aarch64.sve.ld1.nxv8f16(, half*) +declare @llvm.aarch64.sve.ld1.nxv4f32(, float*) +declare @llvm.aarch64.sve.ld1.nxv2f64(, double*) + +declare void @llvm.aarch64.sve.st1.nxv16i8(, , i8*) +declare void @llvm.aarch64.sve.st1.nxv8i16(, , i16*) +declare void @llvm.aarch64.sve.st1.nxv4i32(, , i32*) +declare void @llvm.aarch64.sve.st1.nxv2i64(, , i64*) +declare void @llvm.aarch64.sve.st1.nxv8f16(, , half*) +declare void @llvm.aarch64.sve.st1.nxv4f32(, , float*) +declare void @llvm.aarch64.sve.st1.nxv2f64(, , double*) diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll new file mode 100644 index 0000000000000..7f5105da675e1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-shifts-merging.ll @@ -0,0 +1,340 @@ +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s + +; +; ASR +; + +define @asr_i8( %pg, %a, %b) { +; CHECK-LABEL: asr_i8: +; CHECK: movprfx z0.b, p0/z, z0.b +; CHECK-NEXT: asr z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asr.nxv16i8( %pg, + %a_z, + %b) + ret %out +} + +define @asr_i16( %pg, %a, %b) { +; CHECK-LABEL: asr_i16: +; CHECK: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: asr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asr.nxv8i16( %pg, + %a_z, + %b) + ret %out +} + +define @asr_i32( %pg, %a, %b) { +; CHECK-LABEL: asr_i32: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: asr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asr.nxv4i32( %pg, + %a_z, + %b) + ret %out +} + +define @asr_i64( %pg, %a, %b) { +; CHECK-LABEL: asr_i64: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: asr z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asr.nxv2i64( %pg, + %a_z, + %b) + ret %out +} + +define @asr_wide_i8( %pg, %a, %b) { +; CHECK-LABEL: asr_wide_i8: +; CHECK-NOT: movprfx +; CHECK: asr z0.b, p0/m, z0.b, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asr.wide.nxv16i8( %pg, + %a_z, + %b) + ret %out +} + +define @asr_wide_i16( %pg, %a, %b) { +; CHECK-LABEL: asr_wide_i16: +; CHECK-NOT: movprfx +; CHECK: asr z0.h, p0/m, z0.h, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asr.wide.nxv8i16( %pg, + %a_z, + %b) + ret %out +} + +define @asr_wide_i32( %pg, %a, %b) { +; CHECK-LABEL: asr_wide_i32: +; CHECK-NOT: movprfx +; CHECK: asr z0.s, p0/m, z0.s, z1.d + %a_z = select %pg, %a, 
zeroinitializer + %out = call @llvm.aarch64.sve.asr.wide.nxv4i32( %pg, + %a_z, + %b) + ret %out +} + +; +; ASRD +; + +define @asrd_i8( %pg, %a) { +; CHECK-LABEL: asrd_i8: +; CHECK: movprfx z0.b, p0/z, z0.b +; CHECK-NEXT: asrd z0.b, p0/m, z0.b, #1 +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asrd.nxv16i8( %pg, + %a_z, + i32 1) + ret %out +} + +define @asrd_i16( %pg, %a) { +; CHECK-LABEL: asrd_i16: +; CHECK: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: asrd z0.h, p0/m, z0.h, #2 +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asrd.nxv8i16( %pg, + %a_z, + i32 2) + ret %out +} + +define @asrd_i32( %pg, %a) { +; CHECK-LABEL: asrd_i32: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #31 +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asrd.nxv4i32( %pg, + %a_z, + i32 31) + ret %out +} + +define @asrd_i64( %pg, %a) { +; CHECK-LABEL: asrd_i64: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: asrd z0.d, p0/m, z0.d, #64 +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.asrd.nxv2i64( %pg, + %a_z, + i32 64) + ret %out +} + +; +; LSL +; + +define @lsl_i8( %pg, %a, %b) { +; CHECK-LABEL: lsl_i8: +; CHECK: movprfx z0.b, p0/z, z0.b +; CHECK-NEXT: lsl z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsl.nxv16i8( %pg, + %a_z, + %b) + ret %out +} + +define @lsl_i16( %pg, %a, %b) { +; CHECK-LABEL: lsl_i16: +; CHECK: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: lsl z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsl.nxv8i16( %pg, + %a_z, + %b) + ret %out +} + +define @lsl_i32( %pg, %a, %b) { +; CHECK-LABEL: lsl_i32: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: lsl z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsl.nxv4i32( %pg, + %a_z, + %b) + ret %out +} + +define @lsl_i64( %pg, %a, %b) { +; CHECK-LABEL: lsl_i64: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: lsl z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsl.nxv2i64( %pg, + %a_z, + %b) + ret %out +} + +define @lsl_wide_i8( %pg, %a, %b) { +; CHECK-LABEL: lsl_wide_i8: +; CHECK-NOT: movprfx +; CHECK: lsl z0.b, p0/m, z0.b, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsl.wide.nxv16i8( %pg, + %a_z, + %b) + ret %out +} + +define @lsl_wide_i16( %pg, %a, %b) { +; CHECK-LABEL: lsl_wide_i16: +; CHECK-NOT: movprfx +; CHECK: lsl z0.h, p0/m, z0.h, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsl.wide.nxv8i16( %pg, + %a_z, + %b) + ret %out +} + +define @lsl_wide_i32( %pg, %a, %b) { +; CHECK-LABEL: lsl_wide_i32: +; CHECK-NOT: movprfx +; CHECK: lsl z0.s, p0/m, z0.s, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsl.wide.nxv4i32( %pg, + %a_z, + %b) + ret %out +} + +; +; LSR +; + +define @lsr_i8( %pg, %a, %b) { +; CHECK-LABEL: lsr_i8: +; CHECK: movprfx z0.b, p0/z, z0.b +; CHECK-NEXT: lsr z0.b, p0/m, z0.b, z1.b +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsr.nxv16i8( %pg, + %a_z, + %b) + ret %out +} + +define @lsr_i16( %pg, %a, %b) { +; CHECK-LABEL: lsr_i16: +; CHECK: movprfx z0.h, p0/z, z0.h +; CHECK-NEXT: lsr z0.h, p0/m, z0.h, z1.h +; CHECK-NEXT: ret + %a_z = select 
%pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsr.nxv8i16( %pg, + %a_z, + %b) + ret %out +} + +define @lsr_i32( %pg, %a, %b) { +; CHECK-LABEL: lsr_i32: +; CHECK: movprfx z0.s, p0/z, z0.s +; CHECK-NEXT: lsr z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsr.nxv4i32( %pg, + %a_z, + %b) + ret %out +} + +define @lsr_i64( %pg, %a, %b) { +; CHECK-LABEL: lsr_i64: +; CHECK: movprfx z0.d, p0/z, z0.d +; CHECK-NEXT: lsr z0.d, p0/m, z0.d, z1.d +; CHECK-NEXT: ret + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsr.nxv2i64( %pg, + %a_z, + %b) + ret %out +} + +define @lsr_wide_i8( %pg, %a, %b) { +; CHECK-LABEL: lsr_wide_i8: +; CHECK-NOT: movprfx +; CHECK: lsr z0.b, p0/m, z0.b, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsr.wide.nxv16i8( %pg, + %a_z, + %b) + ret %out +} + +define @lsr_wide_i16( %pg, %a, %b) { +; CHECK-LABEL: lsr_wide_i16: +; CHECK-NOT: movprfx +; CHECK: lsr z0.h, p0/m, z0.h, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsr.wide.nxv8i16( %pg, + %a_z, + %b) + ret %out +} + +define @lsr_wide_i32( %pg, %a, %b) { +; CHECK-LABEL: lsr_wide_i32: +; CHECK-NOT: movprfx +; CHECK: lsr z0.s, p0/m, z0.s, z1.d + %a_z = select %pg, %a, zeroinitializer + %out = call @llvm.aarch64.sve.lsr.wide.nxv4i32( %pg, + %a_z, + %b) + ret %out +} + +declare @llvm.aarch64.sve.asr.nxv16i8(, , ) +declare @llvm.aarch64.sve.asr.nxv8i16(, , ) +declare @llvm.aarch64.sve.asr.nxv4i32(, , ) +declare @llvm.aarch64.sve.asr.nxv2i64(, , ) + +declare @llvm.aarch64.sve.asr.wide.nxv16i8(, , ) +declare @llvm.aarch64.sve.asr.wide.nxv8i16(, , ) +declare @llvm.aarch64.sve.asr.wide.nxv4i32(, , ) + +declare @llvm.aarch64.sve.asrd.nxv16i8(, , i32) +declare @llvm.aarch64.sve.asrd.nxv8i16(, , i32) +declare @llvm.aarch64.sve.asrd.nxv4i32(, , i32) +declare @llvm.aarch64.sve.asrd.nxv2i64(, , i32) + +declare @llvm.aarch64.sve.lsl.nxv16i8(, , ) +declare @llvm.aarch64.sve.lsl.nxv8i16(, , ) +declare @llvm.aarch64.sve.lsl.nxv4i32(, , ) +declare @llvm.aarch64.sve.lsl.nxv2i64(, , ) + +declare @llvm.aarch64.sve.lsl.wide.nxv16i8(, , ) +declare @llvm.aarch64.sve.lsl.wide.nxv8i16(, , ) +declare @llvm.aarch64.sve.lsl.wide.nxv4i32(, , ) + +declare @llvm.aarch64.sve.lsr.nxv16i8(, , ) +declare @llvm.aarch64.sve.lsr.nxv8i16(, , ) +declare @llvm.aarch64.sve.lsr.nxv4i32(, , ) +declare @llvm.aarch64.sve.lsr.nxv2i64(, , ) + +declare @llvm.aarch64.sve.lsr.wide.nxv16i8(, , ) +declare @llvm.aarch64.sve.lsr.wide.nxv8i16(, , ) +declare @llvm.aarch64.sve.lsr.wide.nxv4i32(, , ) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll index 34aca1e80f591..3f18877acd945 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll @@ -165,7 +165,7 @@ define void @constrained_if_register_class() { ; CHECK-NEXT: s_cbranch_execz BB4_5 ; CHECK-NEXT: ; %bb.4: ; %bb11 ; CHECK-NEXT: v_mov_b32_e32 v0, 4.0 -; CHECK-NEXT: buffer_store_dword v0, v0, s[0:3], s33 offen +; CHECK-NEXT: buffer_store_dword v0, v0, s[0:3], 0 offen ; CHECK-NEXT: BB4_5: ; %Flow ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: BB4_6: ; %bb12 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll new file mode 100644 index 0000000000000..d8fbcfaac2009 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -0,0 +1,612 @@ 
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s + +define float @v_fma_f32(float %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fma = call float @llvm.fma.f32(float %x, float %y, float %z) + ret float %fma +} + +define <2 x float> @v_fma_v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z) { +; GFX6-LABEL: v_fma_v2f32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v2f32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %x, <2 x float> %y, <2 x float> %z) + ret <2 x float> %fma +} + +define half @v_fma_f16(half %x, half %y, half %z) { +; GFX6-LABEL: v_fma_f16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_fma_f32 v0, v0, v1, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fma = call half @llvm.fma.f16(half %x, half %y, half %z) + ret half %fma +} + +define <2 x half> @v_fma_v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) { +; GFX6-LABEL: v_fma_v2f16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v2f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 +; 
GFX8-NEXT: v_fma_f16 v1, v3, v4, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> %y, <2 x half> %z) + ret <2 x half> %fma +} + +define <2 x half> @v_fma_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) { +; GFX6-LABEL: v_fma_v2f16_fneg_lhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v2f16_fneg_lhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v2f16_fneg_lhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %x.fneg = fneg <2 x half> %x + %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x.fneg, <2 x half> %y, <2 x half> %z) + ret <2 x half> %fma +} + +define <2 x half> @v_fma_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) { +; GFX6-LABEL: v_fma_v2f16_fneg_rhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_fma_f32 v0, v0, v2, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_fma_f32 v1, v1, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v2f16_fneg_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v2f16_fneg_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %y.fneg = fneg <2 x half> %y + %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> %y.fneg, <2 x half> %z) + ret <2 x half> %fma +} + +define <2 x half> @v_fma_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y, <2 x half> %z) { +; GFX6-LABEL: v_fma_v2f16_fneg_lhs_rhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_mov_b32_e32 v6, 0xffff +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_and_b32_e32 v0, v0, v6 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, v2, v6 +; GFX6-NEXT: s_mov_b32 s4, 0x80008000 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX6-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_fma_f32 v0, v0, v1, v4 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_fma_f32 v1, v2, v3, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v2f16_fneg_lhs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s4, 0x80008000 +; GFX8-NEXT: v_xor_b32_e32 v0, s4, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, s4, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX8-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX8-NEXT: v_fma_f16 v1, v3, v4, v5 +; GFX8-NEXT: v_mov_b32_e32 v2, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v2f16_fneg_lhs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v1, v2 neg_lo:[1,1,0] neg_hi:[1,1,0] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %x.fneg = fneg <2 x half> %x + %y.fneg = fneg <2 x half> %y + %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x.fneg, <2 x half> %y.fneg, <2 x half> %z) + ret <2 x half> %fma +} + +; FIXME: +; define <3 x half> @v_fma_v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) { +; %fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z) +; ret <3 x half> %fma +; } + +define <4 x half> @v_fma_v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) { +; GFX6-LABEL: v_fma_v4f16: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4 +; GFX6-NEXT: v_cvt_f32_f16_e32 v8, v8 +; GFX6-NEXT: 
v_cvt_f32_f16_e32 v1, v1 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX6-NEXT: v_cvt_f32_f16_e32 v9, v9 +; GFX6-NEXT: v_fma_f32 v0, v0, v4, v8 +; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v6 +; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v7 +; GFX6-NEXT: v_fma_f32 v1, v1, v5, v9 +; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v10 +; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v11 +; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX6-NEXT: v_fma_f32 v2, v2, v4, v5 +; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX6-NEXT: v_fma_f32 v3, v3, v6, v7 +; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v4f16: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX8-NEXT: v_fma_f16 v0, v0, v2, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v5 +; GFX8-NEXT: v_fma_f16 v2, v6, v8, v10 +; GFX8-NEXT: v_mov_b32_e32 v4, 16 +; GFX8-NEXT: v_fma_f16 v1, v1, v3, v5 +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_fma_f16 v3, v7, v9, v11 +; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fma = call <4 x half> @llvm.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z) + ret <4 x half> %fma +} + +define double @v_fma_f64(double %x, double %y, double %z) { +; GFX6-LABEL: v_fma_f64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fma = call double @llvm.fma.f64(double %x, double %y, double %z) + ret double %fma +} + +define double @v_fma_f64_fneg_all(double %x, double %y, double %z) { +; GFX6-LABEL: v_fma_f64_fneg_all: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[2:3], -v[4:5] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f64_fneg_all: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[2:3], -v[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f64_fneg_all: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f64 v[0:1], -v[0:1], -v[2:3], -v[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %neg.x = fneg double %x + %neg.y = fneg double %y + %neg.z = fneg double %z + %fma = call double 
@llvm.fma.f64(double %neg.x, double %neg.y, double %neg.z) + ret double %fma +} + +define <2 x double> @v_fma_v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) { +; GFX6-LABEL: v_fma_v2f64: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] +; GFX6-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_v2f64: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] +; GFX8-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_v2f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[8:9] +; GFX9-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[10:11] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %x, <2 x double> %y, <2 x double> %z) + ret <2 x double> %fma +} + +define float @v_fma_f32_fabs_lhs(float %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32_fabs_lhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, |v0|, v1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f32_fabs_lhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, |v0|, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f32_fabs_lhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, |v0|, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call float @llvm.fabs.f32(float %x) + %fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z) + ret float %fma +} + +define float @v_fma_f32_fabs_rhs(float %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32_fabs_rhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, v0, |v1|, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f32_fabs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, v0, |v1|, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f32_fabs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, v0, |v1|, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fabs.y = call float @llvm.fabs.f32(float %y) + %fma = call float @llvm.fma.f32(float %x, float %fabs.y, float %z) + ret float %fma +} + +define float @v_fma_f32_fabs_lhs_rhs(float %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32_fabs_lhs_rhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, |v0|, |v1|, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f32_fabs_lhs_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, |v0|, |v1|, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f32_fabs_lhs_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, |v0|, |v1|, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call float @llvm.fabs.f32(float %x) + %fabs.y = call float @llvm.fabs.f32(float %y) + %fma = call float @llvm.fma.f32(float %fabs.x, float %fabs.y, float %z) + ret float %fma +} + +define amdgpu_ps float @v_fma_f32_sgpr_vgpr_vgpr(float inreg %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32_sgpr_vgpr_vgpr: +; GFX6: ; %bb.0: +; GFX6-NEXT: 
v_fma_f32 v0, s0, v0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fma_f32_sgpr_vgpr_vgpr: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_fma_f32 v0, s0, v0, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fma_f32_sgpr_vgpr_vgpr: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1 +; GFX9-NEXT: ; return to shader part epilog + %fma = call float @llvm.fma.f32(float %x, float %y, float %z) + ret float %fma +} + +define amdgpu_ps float @v_fma_f32_vgpr_sgpr_vgpr(float %x, float inreg %y, float %z) { +; GFX6-LABEL: v_fma_f32_vgpr_sgpr_vgpr: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_fma_f32 v0, v0, s0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fma_f32_vgpr_sgpr_vgpr: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_fma_f32 v0, v0, s0, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fma_f32_vgpr_sgpr_vgpr: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_fma_f32 v0, v0, s0, v1 +; GFX9-NEXT: ; return to shader part epilog + %fma = call float @llvm.fma.f32(float %x, float %y, float %z) + ret float %fma +} + +define amdgpu_ps float @v_fma_f32_sgpr_sgpr_sgpr(float inreg %x, float inreg %y, float inreg %z) { +; GFX6-LABEL: v_fma_f32_sgpr_sgpr_sgpr: +; GFX6: ; %bb.0: +; GFX6-NEXT: v_mov_b32_e32 v0, s1 +; GFX6-NEXT: v_mov_b32_e32 v1, s2 +; GFX6-NEXT: v_fma_f32 v0, s0, v0, v1 +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: v_fma_f32_sgpr_sgpr_sgpr: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mov_b32_e32 v0, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_fma_f32 v0, s0, v0, v1 +; GFX8-NEXT: ; return to shader part epilog +; +; GFX9-LABEL: v_fma_f32_sgpr_sgpr_sgpr: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_fma_f32 v0, s0, v0, v1 +; GFX9-NEXT: ; return to shader part epilog + %fma = call float @llvm.fma.f32(float %x, float %y, float %z) + ret float %fma +} + +define float @v_fma_f32_fneg_lhs(float %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32_fneg_lhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, -v0, v1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f32_fneg_lhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, -v0, v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f32_fneg_lhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, -v0, v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %neg.x = fneg float %x + %fma = call float @llvm.fma.f32(float %neg.x, float %y, float %z) + ret float %fma +} + +define float @v_fma_f32_fneg_rhs(float %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32_fneg_rhs: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, v0, -v1, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f32_fneg_rhs: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, v0, -v1, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f32_fneg_rhs: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, v0, -v1, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %neg.y = fneg float %y + %fma = call float @llvm.fma.f32(float %x, float %neg.y, float %z) + ret float %fma +} + +define float @v_fma_f32_fneg_z(float %x, float %y, float %z) { +; GFX6-LABEL: v_fma_f32_fneg_z: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_fma_f32 v0, v0, 
v1, -v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_fma_f32_fneg_z: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_fma_f32 v0, v0, v1, -v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fma_f32_fneg_z: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_fma_f32 v0, v0, v1, -v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %neg.z = fneg float %z + %fma = call float @llvm.fma.f32(float %x, float %y, float %neg.z) + ret float %fma +} + +declare half @llvm.fma.f16(half, half, half) #0 +declare float @llvm.fma.f32(float, float, float) #0 +declare double @llvm.fma.f64(double, double, double) #0 + +declare half @llvm.fabs.f16(half) #0 +declare float @llvm.fabs.f32(float) #0 + +declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) #0 +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #0 + +declare <3 x half> @llvm.fma.v3f16(<3 x half>, <3 x half>, <3 x half>) #0 +declare <4 x half> @llvm.fma.v4f16(<4 x half>, <4 x half>, <4 x half>) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll index 7b6863fb17a5f..3f573e7f9a861 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -1684,8 +1684,8 @@ define amdgpu_ps <32 x i32> @dyn_insertelement_v32i32_s_s_s(<32 x i32> inreg %ve ; GPRIDX-NEXT: s_mov_b32 s27, s29 ; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 s29, s31 -; GPRIDX-NEXT: s_mov_b32 s30, s32 ; GPRIDX-NEXT: s_mov_b32 s31, s33 +; GPRIDX-NEXT: s_mov_b32 s30, s32 ; GPRIDX-NEXT: s_mov_b32 m0, s35 ; GPRIDX-NEXT: s_nop 0 ; GPRIDX-NEXT: s_movreld_b32 s0, s34 @@ -1724,8 +1724,8 @@ define amdgpu_ps <32 x i32> @dyn_insertelement_v32i32_s_s_s(<32 x i32> inreg %ve ; MOVREL-NEXT: s_mov_b32 s27, s29 ; MOVREL-NEXT: s_mov_b32 s28, s30 ; MOVREL-NEXT: s_mov_b32 s29, s31 -; MOVREL-NEXT: s_mov_b32 s30, s32 ; MOVREL-NEXT: s_mov_b32 s31, s33 +; MOVREL-NEXT: s_mov_b32 s30, s32 ; MOVREL-NEXT: s_movreld_b32 s0, s34 ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: ; return to shader part epilog diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.mir new file mode 100644 index 0000000000000..45a8551ee47e3 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frint.mir @@ -0,0 +1,105 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=bonaire -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s + +--- +name: frint_s32_vv +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: frint_s32_vv + ; GCN: liveins: $vgpr0 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: [[V_RNDNE_F32_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F32_e64 0, [[COPY]], 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[V_RNDNE_F32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = G_FRINT %0 + $vgpr0 = COPY %1 +... 
+ +--- +name: frint_s32_vs +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GCN-LABEL: name: frint_s32_vs + ; GCN: liveins: $sgpr0 + ; GCN: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GCN: [[V_RNDNE_F32_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F32_e64 0, [[COPY]], 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[V_RNDNE_F32_e64_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:vgpr(s32) = G_FRINT %0 + $vgpr0 = COPY %1 +... + +--- +name: frint_fneg_s32_vv +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + + ; GCN-LABEL: name: frint_fneg_s32_vv + ; GCN: liveins: $vgpr0 + ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GCN: [[V_RNDNE_F32_e64_:%[0-9]+]]:vgpr_32 = V_RNDNE_F32_e64 1, [[COPY]], 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[V_RNDNE_F32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = G_FNEG %0 + %2:vgpr(s32) = G_FRINT %1 + $vgpr0 = COPY %2 +... + +--- +name: frint_s64_vv +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GCN-LABEL: name: frint_s64_vv + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GCN: [[V_RNDNE_F64_e64_:%[0-9]+]]:vreg_64 = V_RNDNE_F64_e64 0, [[COPY]], 0, 0, implicit $exec + ; GCN: $vgpr0_vgpr1 = COPY [[V_RNDNE_F64_e64_]] + %0:vgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = G_FRINT %0 + $vgpr0_vgpr1 = COPY %1 +... + +--- +name: frint_s64_fneg_vv +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1 + + ; GCN-LABEL: name: frint_s64_fneg_vv + ; GCN: liveins: $vgpr0_vgpr1 + ; GCN: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GCN: [[V_RNDNE_F64_e64_:%[0-9]+]]:vreg_64 = V_RNDNE_F64_e64 1, [[COPY]], 0, 0, implicit $exec + ; GCN: $vgpr0_vgpr1 = COPY [[V_RNDNE_F64_e64_]] + %0:vgpr(s64) = COPY $vgpr0_vgpr1 + %1:vgpr(s64) = G_FNEG %0 + %2:vgpr(s64) = G_FRINT %1 + $vgpr0_vgpr1 = COPY %2 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir index 08fdd0f30a16f..1382434fe0a74 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-local.mir @@ -16,12 +16,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_s32_from_4 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]] ; GFX7-LABEL: name: load_local_s32_from_4 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -33,6 +27,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 4, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] + ; GFX6-LABEL: name: load_local_s32_from_4 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 3) $vgpr0 = COPY %1 @@ -50,12 +50,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_s32_from_2 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 2, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_U16_]] ; GFX7-LABEL: name: load_local_s32_from_2 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -67,6 +61,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_U16_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U16_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 2, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_U16_gfx9_]] + ; GFX6-LABEL: name: load_local_s32_from_2 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_U16_:%[0-9]+]]:vgpr_32 = DS_READ_U16 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 2, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_U16_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 3) $vgpr0 = COPY %1 @@ -81,19 +81,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_s32_from_1 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]] ; GFX7-LABEL: name: load_local_s32_from_1 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -105,6 +98,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 1, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] + ; GFX6-LABEL: name: load_local_s32_from_1 + ; GFX6: liveins: $vgpr0 + ; GFX6: 
[[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 3) $vgpr0 = COPY %1 @@ -122,12 +121,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_v2s32 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] ; GFX7-LABEL: name: load_local_v2s32 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -139,6 +132,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] + ; GFX6-LABEL: name: load_local_v2s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load 8, align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -156,12 +155,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_v2s32_align4 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) ; GFX7-LABEL: name: load_local_v2s32_align4 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -173,6 +166,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] + ; GFX6-LABEL: name: load_local_v2s32_align4 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<2 x s32>) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x s32>) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(<2 x s32>) = G_LOAD %0 :: (load 8, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -190,12 +189,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_s64 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] ; GFX7-LABEL: name: load_local_s64 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -207,6 +200,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] + ; GFX6-LABEL: name: load_local_s64 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 
[[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s64) = G_LOAD %0 :: (load 8, align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -224,12 +223,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_s64_align4 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; GFX7-LABEL: name: load_local_s64_align4 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -241,6 +234,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] + ; GFX6-LABEL: name: load_local_s64_align4 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s64) = G_LOAD %0 :: (load 8, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -258,12 +257,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_p3_from_4 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]] ; GFX7-LABEL: name: load_local_p3_from_4 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -275,6 +268,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 4, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] + ; GFX6-LABEL: name: load_local_p3_from_4 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p3) = G_LOAD %0 :: (load 4, align 4, addrspace 3) $vgpr0 = COPY %1 @@ -292,12 +291,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_p5_from_4 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]] ; GFX7-LABEL: name: load_local_p5_from_4 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -309,6 +302,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 4, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] + ; GFX6-LABEL: name: load_local_p5_from_4 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p3) = G_LOAD 
%0 :: (load 4, align 4, addrspace 3) $vgpr0 = COPY %1 @@ -326,12 +325,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_p1_align8 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] ; GFX7-LABEL: name: load_local_p1_align8 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -343,6 +336,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] + ; GFX6-LABEL: name: load_local_p1_align8 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p1) = G_LOAD %0 :: (load 8, align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -360,12 +359,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_p1_align4 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p1) ; GFX7-LABEL: name: load_local_p1_align4 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -377,6 +370,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] + ; GFX6-LABEL: name: load_local_p1_align4 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p1) = G_LOAD [[COPY]](p3) :: (load 8, align 4, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p1) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p1) = G_LOAD %0 :: (load 8, align 4, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -394,12 +393,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_p999_from_8 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p999) ; GFX7-LABEL: name: load_local_p999_from_8 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 @@ -411,6 +404,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](p999) + ; GFX6-LABEL: name: load_local_p999_from_8 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(p999) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](p999) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p999) = G_LOAD %0 :: (load 8, align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -428,12 +427,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_v2p3 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 
-1 - ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) ; GFX7-LABEL: name: load_local_v2p3 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 @@ -445,6 +438,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 ; GFX9: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) + ; GFX6-LABEL: name: load_local_v2p3 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(<2 x p3>) = G_LOAD [[COPY]](p3) :: (load 8, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](<2 x p3>) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(<2 x p3>) = G_LOAD %0 :: (load 8, align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -462,12 +461,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_v2s16 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]] ; GFX7-LABEL: name: load_local_v2s16 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -479,6 +472,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_B32_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_B32_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 4, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_B32_gfx9_]] + ; GFX6-LABEL: name: load_local_v2s16 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_B32_:%[0-9]+]]:vgpr_32 = DS_READ_B32 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 4, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_B32_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(<2 x s16>) = G_LOAD %0 :: (load 4, align 4, addrspace 3) $vgpr0 = COPY %1 @@ -496,12 +495,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_v4s16 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] ; GFX7-LABEL: name: load_local_v4s16 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -513,6 +506,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_B64_gfx9_:%[0-9]+]]:vreg_64 = DS_READ_B64_gfx9 [[COPY]], 0, 0, implicit $exec :: (load 8, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ_B64_gfx9_]] + ; GFX6-LABEL: name: load_local_v4s16 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_B64_:%[0-9]+]]:vreg_64 = DS_READ_B64 [[COPY]], 0, 0, implicit $m0, implicit $exec :: (load 8, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[DS_READ_B64_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(<4 x s16>) = G_LOAD %0 :: (load 8, align 8, addrspace 3) $vgpr0_vgpr1 = COPY %1 @@ -527,7 +526,6 @@ body: | # tracksRegLiveness: true # machineFunctionInfo: # scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 -# scratchWaveOffsetReg: $sgpr4 # stackPtrOffsetReg: $sgpr32 # body: | @@ -555,14 +553,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_s32_from_1_gep_65535 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec - ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]] ; GFX7-LABEL: name: load_local_s32_from_1_gep_65535 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -574,6 +564,14 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[COPY]], 65535, 0, implicit $exec :: (load 1, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] + ; GFX6-LABEL: name: load_local_s32_from_1_gep_65535 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65535, implicit $exec + ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 65535 %2:vgpr(p3) = G_PTR_ADD %0, %1 @@ -593,14 +591,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec - ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_AND_B32_e64_]], 65535, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]] ; GFX7-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -616,6 +606,14 @@ body: | ; GFX9: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_AND_B32_e64_]], 65535, 0, implicit $exec :: (load 1, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] + ; GFX6-LABEL: name: load_local_s32_from_1_gep_65535_known_bits_base_address + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec + ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 [[V_AND_B32_e64_]], 65535, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2147483647 %2:vgpr(s32) = G_AND %0, %1 @@ -638,14 +636,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_s32_from_1_gep_65536 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65536, implicit $exec - ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, 
addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]] ; GFX7-LABEL: name: load_local_s32_from_1_gep_65536 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -661,6 +651,14 @@ body: | ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (load 1, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] + ; GFX6-LABEL: name: load_local_s32_from_1_gep_65536 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 65536, implicit $exec + ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 65536 %2:vgpr(p3) = G_PTR_ADD %0, %1 @@ -680,14 +678,6 @@ body: | bb.0: liveins: $vgpr0 - ; GFX6-LABEL: name: load_local_s32_from_1_gep_m1 - ; GFX6: liveins: $vgpr0 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec - ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3) - ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]] ; GFX7-LABEL: name: load_local_s32_from_1_gep_m1 ; GFX7: liveins: $vgpr0 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -703,6 +693,14 @@ body: | ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9: [[DS_READ_U8_gfx9_:%[0-9]+]]:vgpr_32 = DS_READ_U8_gfx9 [[V_ADD_U32_e64_]], 0, 0, implicit $exec :: (load 1, addrspace 3) ; GFX9: $vgpr0 = COPY [[DS_READ_U8_gfx9_]] + ; GFX6-LABEL: name: load_local_s32_from_1_gep_m1 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967295, implicit $exec + ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[DS_READ_U8_:%[0-9]+]]:vgpr_32 = DS_READ_U8 %2, 0, 0, implicit $m0, implicit $exec :: (load 1, addrspace 3) + ; GFX6: $vgpr0 = COPY [[DS_READ_U8_]] %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -1 %2:vgpr(p3) = G_PTR_ADD %0, %1 @@ -722,14 +720,6 @@ body: | bb.0: liveins: $vgpr0_vgpr1 - ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1016 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1016 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -741,6 +731,14 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[COPY]], 254, 255, 0, implicit $exec :: (load 8, align 4, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY 
[[DS_READ2_B32_gfx9_]] + ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1016 + ; GFX6: liveins: $vgpr0_vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016 + ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 1016 %2:vgpr(p3) = G_PTR_ADD %0, %1 @@ -760,14 +758,6 @@ body: | bb.0: liveins: $vgpr0_vgpr1 - ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1020 - ; GFX6: liveins: $vgpr0_vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 - ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32) - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3) - ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64) ; GFX7-LABEL: name: load_local_s64_align4_from_1_gep_1020 ; GFX7: liveins: $vgpr0_vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -783,6 +773,14 @@ body: | ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec ; GFX9: [[DS_READ2_B32_gfx9_:%[0-9]+]]:vreg_64 = DS_READ2_B32_gfx9 [[V_ADD_U32_e64_]], 0, 1, 0, implicit $exec :: (load 8, align 4, addrspace 3) ; GFX9: $vgpr0_vgpr1 = COPY [[DS_READ2_B32_gfx9_]] + ; GFX6-LABEL: name: load_local_s64_align4_from_1_gep_1020 + ; GFX6: liveins: $vgpr0_vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p3) = COPY $vgpr0 + ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020 + ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY]], [[C]](s32) + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: [[LOAD:%[0-9]+]]:vreg_64(s64) = G_LOAD [[PTR_ADD]](p3) :: (load 8, align 4, addrspace 3) + ; GFX6: $vgpr0_vgpr1 = COPY [[LOAD]](s64) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 1020 %2:vgpr(p3) = G_PTR_ADD %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir index 12e6e747a0706..13e4035a48828 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir @@ -10,7 +10,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -20,12 +19,12 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_4 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_4 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 
0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -41,7 +40,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -51,12 +49,12 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_2 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) + ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_2 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) + ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 5) @@ -72,7 +70,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -82,12 +79,12 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_1 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5) @@ -130,7 +127,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -161,7 +157,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -196,7 +191,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -208,12 +202,12 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY 
$vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2047 @@ -231,7 +225,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -243,14 +236,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047_known_bits ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec ; GFX9: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2147483647 @@ -271,7 +264,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -283,12 +275,12 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; 
GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_2048 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2048 @@ -306,7 +298,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -318,14 +309,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2047 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -2047 @@ -343,7 +334,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -355,14 +345,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2048 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit 
$exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -2048 @@ -380,7 +370,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -392,12 +381,12 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_4095 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 4095 @@ -415,7 +404,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -427,14 +415,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_4096 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 
0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 4096 @@ -452,7 +440,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -464,14 +451,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4095 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -4095 @@ -489,7 +476,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -501,14 +487,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4096 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -4096 @@ -526,7 +512,6 @@ regBankSelected: true 
tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -538,14 +523,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_8191 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 8191 @@ -563,7 +548,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -575,14 +559,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_8192 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 8192 @@ -600,7 +584,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -612,14 +595,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8191 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -8191 @@ -637,7 +620,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -649,14 +631,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8192 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -8192 @@ -674,17 +656,16 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: ; GFX6-LABEL: name: load_private_s32_from_4_constant_0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX9-LABEL: name: load_private_s32_from_4_constant_0 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] %0:vgpr(p5) = G_CONSTANT i32 0 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -700,17 +681,16 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: ; GFX6-LABEL: name: load_private_s32_from_4_constant_sgpr_16 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX9-LABEL: name: load_private_s32_from_4_constant_sgpr_16 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] %0:sgpr(p5) = G_CONSTANT i32 16 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -726,17 +706,16 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: ; GFX6-LABEL: name: load_private_s32_from_1_constant_4095 - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]] ; GFX9-LABEL: name: load_private_s32_from_1_constant_4095 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]] %0:vgpr(p5) = G_CONSTANT i32 4095 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5) @@ -752,7 +731,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | @@ -760,11 +738,11 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_1_constant_4096 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = 
BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_constant_4096 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = G_CONSTANT i32 4096 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5) @@ -780,7 +758,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 stack: - { id: 0, size: 4, alignment: 4 } @@ -808,7 +785,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 stack: - { id: 0, size: 4096, alignment: 4 } @@ -820,7 +796,7 @@ body: | ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4095 ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) @@ -841,7 +817,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 stack: - { id: 0, size: 8192, alignment: 4 } @@ -853,13 +828,13 @@ body: | ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4096 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX9: 
[[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4096 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-add3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-add3.mir index dbe90d60654a7..a33e4c3b313f4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-add3.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-add3.mir @@ -130,3 +130,178 @@ body: | S_ENDPGM 0, implicit %4, implicit %3 ... +--- + +name: add_p3_vgpr_vgpr_vgpr +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: add_p3_vgpr_vgpr_vgpr + ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GFX8: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_I32_e64 %3, [[COPY2]], 0, implicit $exec + ; GFX8: S_ENDPGM 0, implicit %4 + ; GFX9-LABEL: name: add_p3_vgpr_vgpr_vgpr + ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GFX9: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[COPY2]], 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]] + ; GFX10-LABEL: name: add_p3_vgpr_vgpr_vgpr + ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec + ; GFX10: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[COPY2]], 0, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]] + %0:vgpr(p3) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(p3) = G_PTR_ADD %0, %1 + %4:vgpr(p3) = G_PTR_ADD %3, %2 + S_ENDPGM 0, implicit %4 +... 
+
+---
+
+name: add_p5_vgpr_vgpr_vgpr
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; GFX8-LABEL: name: add_p5_vgpr_vgpr_vgpr
+    ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX8: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+    ; GFX8: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_I32_e64 %3, [[COPY2]], 0, implicit $exec
+    ; GFX8: S_ENDPGM 0, implicit %4
+    ; GFX9-LABEL: name: add_p5_vgpr_vgpr_vgpr
+    ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+    ; GFX9: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[COPY2]], 0, implicit $exec
+    ; GFX9: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]]
+    ; GFX10-LABEL: name: add_p5_vgpr_vgpr_vgpr
+    ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX10: $vcc_hi = IMPLICIT_DEF
+    ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+    ; GFX10: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_ADD_U32_e64_]], [[COPY2]], 0, implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]]
+    %0:vgpr(p5) = COPY $vgpr0
+    %1:vgpr(s32) = COPY $vgpr1
+    %2:vgpr(s32) = COPY $vgpr2
+    %3:vgpr(p5) = G_PTR_ADD %0, %1
+    %4:vgpr(p5) = G_PTR_ADD %3, %2
+    S_ENDPGM 0, implicit %4
+...
+
+---
+
+name: add_p3_s32_vgpr_vgpr_vgpr
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; GFX8-LABEL: name: add_p3_s32_vgpr_vgpr_vgpr
+    ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX8: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+    ; GFX8: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_I32_e64 [[COPY2]], %3, 0, implicit $exec
+    ; GFX8: S_ENDPGM 0, implicit %4
+    ; GFX9-LABEL: name: add_p3_s32_vgpr_vgpr_vgpr
+    ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+    ; GFX9: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY2]], [[V_ADD_U32_e64_]], 0, implicit $exec
+    ; GFX9: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]]
+    ; GFX10-LABEL: name: add_p3_s32_vgpr_vgpr_vgpr
+    ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX10: $vcc_hi = IMPLICIT_DEF
+    ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+    ; GFX10: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY2]], [[V_ADD_U32_e64_]], 0, implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]]
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(s32) = COPY $vgpr1
+    %2:vgpr(p3) = COPY $vgpr2
+    %3:vgpr(s32) = G_ADD %0, %1
+    %4:vgpr(p3) = G_PTR_ADD %2, %3
+    S_ENDPGM 0, implicit %4
+...
+
+---
+
+name: add_p5_s32_vgpr_vgpr_vgpr
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    ; GFX8-LABEL: name: add_p5_s32_vgpr_vgpr_vgpr
+    ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX8: %3:vgpr_32, dead %6:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+    ; GFX8: %4:vgpr_32, dead %5:sreg_64_xexec = V_ADD_I32_e64 [[COPY2]], %3, 0, implicit $exec
+    ; GFX8: S_ENDPGM 0, implicit %4
+    ; GFX9-LABEL: name: add_p5_s32_vgpr_vgpr_vgpr
+    ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+    ; GFX9: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY2]], [[V_ADD_U32_e64_]], 0, implicit $exec
+    ; GFX9: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]]
+    ; GFX10-LABEL: name: add_p5_s32_vgpr_vgpr_vgpr
+    ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX10: $vcc_hi = IMPLICIT_DEF
+    ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[COPY1]], 0, implicit $exec
+    ; GFX10: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY2]], [[V_ADD_U32_e64_]], 0, implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U32_e64_1]]
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(s32) = COPY $vgpr1
+    %2:vgpr(p5) = COPY $vgpr2
+    %3:vgpr(s32) = G_ADD %0, %1
+    %4:vgpr(p5) = G_PTR_ADD %2, %3
+    S_ENDPGM 0, implicit %4
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-and-or.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-and-or.mir
new file mode 100644
index 0000000000000..0230dd835215f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-and-or.mir
@@ -0,0 +1,176 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
+
+---
+
+name: and_or_s32_sgpr_sgpr_sgpr
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2
+    ; GFX8-LABEL: name: and_or_s32_sgpr_sgpr_sgpr
+    ; GFX8: liveins: $sgpr0, $sgpr1, $sgpr2
+    ; GFX8: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+    ; GFX8: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+    ; GFX8: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc
+    ; GFX8: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[COPY2]], implicit-def $scc
+    ; GFX8: S_ENDPGM 0, implicit [[S_OR_B32_]]
+    ; GFX9-LABEL: name: and_or_s32_sgpr_sgpr_sgpr
+    ; GFX9: liveins: $sgpr0, $sgpr1, $sgpr2
+    ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+    ; GFX9: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc
+    ; GFX9: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[COPY2]], implicit-def $scc
+    ; GFX9: S_ENDPGM 0, implicit [[S_OR_B32_]]
+    ; GFX10-LABEL: name: and_or_s32_sgpr_sgpr_sgpr
+    ; GFX10: liveins: $sgpr0, $sgpr1, $sgpr2
+    ; GFX10: $vcc_hi = IMPLICIT_DEF
+    ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+    ; GFX10: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2
+    ; GFX10: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc
+    ; GFX10: [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_AND_B32_]], [[COPY2]], implicit-def $scc
+    ; GFX10: S_ENDPGM 0, implicit [[S_OR_B32_]]
+    %0:sgpr(s32) = COPY $sgpr0
+    %1:sgpr(s32) = COPY $sgpr1
+    %2:sgpr(s32) = COPY $sgpr2
+    %3:sgpr(s32) = G_AND %0, %1
+    %4:sgpr(s32) = G_OR %3, %2
+    S_ENDPGM 0, implicit %4
+...
+
+---
+
+name: and_or_s32_vgpr_vgpr_vgpr
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX8-LABEL: name: and_or_s32_vgpr_vgpr_vgpr
+    ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX8: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[COPY1]], implicit $exec
+    ; GFX8: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[COPY2]], implicit $exec
+    ; GFX8: S_ENDPGM 0, implicit [[V_OR_B32_e64_]]
+    ; GFX9-LABEL: name: and_or_s32_vgpr_vgpr_vgpr
+    ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9: [[V_AND_OR_B32_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
+    ; GFX9: S_ENDPGM 0, implicit [[V_AND_OR_B32_]]
+    ; GFX10-LABEL: name: and_or_s32_vgpr_vgpr_vgpr
+    ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX10: $vcc_hi = IMPLICIT_DEF
+    ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10: [[V_AND_OR_B32_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_AND_OR_B32_]]
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(s32) = COPY $vgpr1
+    %2:vgpr(s32) = COPY $vgpr2
+    %3:vgpr(s32) = G_AND %0, %1
+    %4:vgpr(s32) = G_OR %3, %2
+    S_ENDPGM 0, implicit %4
+...
+
+---
+
+name: and_or_s32_vgpr_vgpr_vgpr_commute
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX8-LABEL: name: and_or_s32_vgpr_vgpr_vgpr_commute
+    ; GFX8: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX8: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[COPY1]], implicit $exec
+    ; GFX8: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY2]], [[V_AND_B32_e64_]], implicit $exec
+    ; GFX8: S_ENDPGM 0, implicit [[V_OR_B32_e64_]]
+    ; GFX9-LABEL: name: and_or_s32_vgpr_vgpr_vgpr_commute
+    ; GFX9: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX9: [[V_AND_OR_B32_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
+    ; GFX9: S_ENDPGM 0, implicit [[V_AND_OR_B32_]]
+    ; GFX10-LABEL: name: and_or_s32_vgpr_vgpr_vgpr_commute
+    ; GFX10: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GFX10: $vcc_hi = IMPLICIT_DEF
+    ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+    ; GFX10: [[V_AND_OR_B32_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_AND_OR_B32_]]
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(s32) = COPY $vgpr1
+    %2:vgpr(s32) = COPY $vgpr2
+    %3:vgpr(s32) = G_AND %0, %1
+    %4:vgpr(s32) = G_OR %2, %3
+    S_ENDPGM 0, implicit %4
+...
+
+---
+
+name: and_or_s32_sgpr_sgpr_vgpr
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $vgpr0
+    ; GFX8-LABEL: name: and_or_s32_sgpr_sgpr_vgpr
+    ; GFX8: liveins: $sgpr0, $sgpr1, $vgpr0
+    ; GFX8: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; GFX8: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+    ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX8: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc
+    ; GFX8: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]]
+    ; GFX8: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY3]], [[COPY2]], implicit $exec
+    ; GFX8: S_ENDPGM 0, implicit [[V_OR_B32_e64_]]
+    ; GFX9-LABEL: name: and_or_s32_sgpr_sgpr_vgpr
+    ; GFX9: liveins: $sgpr0, $sgpr1, $vgpr0
+    ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+    ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc
+    ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]]
+    ; GFX9: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY3]], [[COPY2]], implicit $exec
+    ; GFX9: S_ENDPGM 0, implicit [[V_OR_B32_e64_]]
+    ; GFX10-LABEL: name: and_or_s32_sgpr_sgpr_vgpr
+    ; GFX10: liveins: $sgpr0, $sgpr1, $vgpr0
+    ; GFX10: $vcc_hi = IMPLICIT_DEF
+    ; GFX10: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0
+    ; GFX10: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1
+    ; GFX10: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX10: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], [[COPY1]], implicit-def $scc
+    ; GFX10: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[S_AND_B32_]]
+    ; GFX10: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[COPY3]], [[COPY2]], implicit $exec
+    ; GFX10: S_ENDPGM 0, implicit [[V_OR_B32_e64_]]
+    %0:sgpr(s32) = COPY $sgpr0
+    %1:sgpr(s32) = COPY $sgpr1
+    %2:vgpr(s32) = COPY $vgpr0
+    %3:sgpr(s32) = G_AND %0, %1
+    %4:vgpr(s32) = COPY %3
+    %5:vgpr(s32) = G_OR %4, %2
+    S_ENDPGM 0, implicit %5
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir index 60cc05c7da5c8..440c34f101633 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-local.mir @@ -13,19 +13,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_local_s32_to_4 - ; GFX6: liveins: $vgpr0, $vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3) ; GFX7-LABEL: name: store_local_s32_to_4 ; GFX7: liveins: $vgpr0, $vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -37,6 +30,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: DS_WRITE_B32_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 4, addrspace 3) + ; GFX6-LABEL: name: store_local_s32_to_4 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p3) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 3) @@ -51,19 +50,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_local_s32_to_2 - ; GFX6: liveins: $vgpr0, $vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B16 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 2, addrspace 3) ; GFX7-LABEL: name: store_local_s32_to_2 ; GFX7: liveins: $vgpr0, $vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -75,6 +67,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: DS_WRITE_B16_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 2, addrspace 3) + ; GFX6-LABEL: name: store_local_s32_to_2 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B16 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 2, addrspace 3) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p3) = COPY $vgpr1 G_STORE %0, %1 :: (store 2, align 2, addrspace 3) @@ -89,19 +87,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_local_s32_to_1 - ; GFX6: liveins: $vgpr0, $vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B8 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3) ; GFX7-LABEL: name: store_local_s32_to_1 ; GFX7: liveins: $vgpr0, $vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -113,6 
+104,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: DS_WRITE_B8_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 1, addrspace 3) + ; GFX6-LABEL: name: store_local_s32_to_1 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B8 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p3) = COPY $vgpr1 G_STORE %0, %1 :: (store 1, align 1, addrspace 3) @@ -127,19 +124,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_local_v2s16 - ; GFX6: liveins: $vgpr0, $vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3) ; GFX7-LABEL: name: store_local_v2s16 ; GFX7: liveins: $vgpr0, $vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -151,6 +141,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: DS_WRITE_B32_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 4, addrspace 3) + ; GFX6-LABEL: name: store_local_v2s16 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3) %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(p3) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 3) @@ -165,19 +161,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_local_p3 - ; GFX6: liveins: $vgpr0, $vgpr1 - ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3) ; GFX7-LABEL: name: store_local_p3 ; GFX7: liveins: $vgpr0, $vgpr1 ; GFX7: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 @@ -189,6 +178,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: DS_WRITE_B32_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 4, addrspace 3) + ; GFX6-LABEL: name: store_local_p3 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B32 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 4, addrspace 3) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p3) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 3) @@ -205,11 +200,6 @@ tracksRegLiveness: true body: | bb.0: - ; GFX6-LABEL: name: store_local_s32_to_1_constant_4095 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec - ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, 
implicit $m0, implicit $exec :: (store 1, addrspace 3) ; GFX7-LABEL: name: store_local_s32_to_1_constant_4095 ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -219,6 +209,11 @@ body: | ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9: DS_WRITE_B8_gfx9 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (store 1, addrspace 3) + ; GFX6-LABEL: name: store_local_s32_to_1_constant_4095 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec + ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3) %0:vgpr(p3) = G_CONSTANT i32 4095 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store 1, align 1, addrspace 3) @@ -233,7 +228,6 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 stack: - { id: 0, size: 4096, alignment: 4 } @@ -241,11 +235,6 @@ stack: body: | bb.0: - ; GFX6-LABEL: name: store_local_s32_to_1_constant_4096 - ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3) ; GFX7-LABEL: name: store_local_s32_to_1_constant_4096 ; GFX7: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX7: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -255,6 +244,11 @@ body: | ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9: DS_WRITE_B8_gfx9 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $exec :: (store 1, addrspace 3) + ; GFX6-LABEL: name: store_local_s32_to_1_constant_4096 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B8 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, 0, implicit $m0, implicit $exec :: (store 1, addrspace 3) %0:vgpr(p3) = G_CONSTANT i32 4096 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store 1, align 1, addrspace 3) @@ -269,19 +263,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_s64_align4 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](s64), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) ; GFX7-LABEL: name: store_local_s64_align4 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -297,6 +284,12 @@ body: | ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], 
[[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX6-LABEL: name: store_local_s64_align4 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: G_STORE [[COPY]](s64), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) %0:vgpr(s64) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 4, addrspace 3) @@ -311,19 +304,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_p1_align4 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](p1), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) ; GFX7-LABEL: name: store_local_p1_align4 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -339,6 +325,12 @@ body: | ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX6-LABEL: name: store_local_p1_align4 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(p1) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: G_STORE [[COPY]](p1), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 4, addrspace 3) @@ -353,19 +345,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_v2s32_align4 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) ; GFX7-LABEL: name: store_local_v2s32_align4 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -381,6 +366,12 @@ body: | ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX6-LABEL: name: store_local_v2s32_align4 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: G_STORE [[COPY]](<2 x s32>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 4, addrspace 3) @@ -395,19 +386,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_v4s16_align4 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: 
[[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) ; GFX7-LABEL: name: store_local_v4s16_align4 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -423,6 +407,12 @@ body: | ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX6-LABEL: name: store_local_v4s16_align4 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: G_STORE [[COPY]](<4 x s16>), [[COPY1]](p3) :: (store 8, align 4, addrspace 3) %0:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 4, addrspace 3) @@ -437,19 +427,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_s64_align8 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) ; GFX7-LABEL: name: store_local_s64_align8 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -461,6 +444,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, addrspace 3) + ; GFX6-LABEL: name: store_local_s64_align8 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) %0:vgpr(s64) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 8, addrspace 3) @@ -475,19 +464,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_p1_align8 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) ; GFX7-LABEL: name: store_local_p1_align8 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -499,6 +481,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, addrspace 3) + ; GFX6-LABEL: name: store_local_p1_align8 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B64 
[[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) %0:vgpr(p1) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 8, addrspace 3) @@ -513,19 +501,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_v2s32_align8 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) ; GFX7-LABEL: name: store_local_v2s32_align8 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -537,6 +518,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, addrspace 3) + ; GFX6-LABEL: name: store_local_v2s32_align8 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) %0:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 8, addrspace 3) @@ -551,19 +538,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_v4s16_align8 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) ; GFX7-LABEL: name: store_local_v4s16_align8 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -575,6 +555,12 @@ body: | ; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; GFX9: DS_WRITE_B64_gfx9 [[COPY1]], [[COPY]], 0, 0, implicit $exec :: (store 8, addrspace 3) + ; GFX6-LABEL: name: store_local_v4s16_align8 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: DS_WRITE_B64 [[COPY1]], [[COPY]], 0, 0, implicit $m0, implicit $exec :: (store 8, addrspace 3) %0:vgpr(<4 x s16>) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 G_STORE %0, %1 :: (store 8, align 8, addrspace 3) @@ -589,21 +575,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1016 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32) - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](s64), 
[[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3) ; GFX7-LABEL: name: store_local_s64_align4_from_1_gep_1016 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -619,6 +596,14 @@ body: | ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; GFX9: DS_WRITE2_B32_gfx9 [[COPY1]], [[COPY3]], [[COPY2]], 254, 255, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1016 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1016 + ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32) + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3) %0:vgpr(s64) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 %2:vgpr(s32) = G_CONSTANT i32 1016 @@ -635,21 +620,12 @@ regBankSelected: true tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1020 - ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 - ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 - ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 - ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020 - ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32) - ; GFX6: $m0 = S_MOV_B32 -1 - ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3) ; GFX7-LABEL: name: store_local_s64_align4_from_1_gep_1020 ; GFX7: liveins: $vgpr0_vgpr1, $vgpr2 ; GFX7: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 @@ -669,6 +645,14 @@ body: | ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX9: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub0 ; GFX9: DS_WRITE2_B32_gfx9 [[V_ADD_U32_e64_]], [[COPY3]], [[COPY2]], 0, 1, 0, implicit $exec :: (store 8, align 4, addrspace 3) + ; GFX6-LABEL: name: store_local_s64_align4_from_1_gep_1020 + ; GFX6: liveins: $vgpr0_vgpr1, $vgpr2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY $vgpr2 + ; GFX6: [[C:%[0-9]+]]:vgpr(s32) = G_CONSTANT i32 1020 + ; GFX6: [[PTR_ADD:%[0-9]+]]:vgpr(p3) = G_PTR_ADD [[COPY1]], [[C]](s32) + ; GFX6: $m0 = S_MOV_B32 -1 + ; GFX6: G_STORE [[COPY]](s64), [[PTR_ADD]](p3) :: (store 8, align 4, addrspace 3) %0:vgpr(s64) = COPY $vgpr0_vgpr1 %1:vgpr(p3) = COPY $vgpr2 %2:vgpr(s32) = G_CONSTANT i32 1020 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir index e5bc410511199..9ac43878862c0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir @@ -4,29 +4,29 @@ --- -name: store_private_s32_to_4 +name: function_store_private_s32_to_4 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_private_s32_to_4 + ; GFX6-LABEL: name: function_store_private_s32_to_4 ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; 
GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) - ; GFX9-LABEL: name: store_private_s32_to_4 + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9-LABEL: name: function_store_private_s32_to_4 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -35,29 +35,29 @@ body: | --- -name: store_private_s32_to_2 +name: function_store_private_s32_to_2 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_private_s32_to_2 + ; GFX6-LABEL: name: function_store_private_s32_to_2 ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) - ; GFX9-LABEL: name: store_private_s32_to_2 + ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) + ; GFX9-LABEL: name: function_store_private_s32_to_2 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) + ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 2, align 2, addrspace 5) @@ -66,29 +66,29 @@ body: | --- -name: store_private_s32_to_1 +name: function_store_private_s32_to_1 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_private_s32_to_1 + ; GFX6-LABEL: name: function_store_private_s32_to_1 ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) - ; GFX9-LABEL: name: store_private_s32_to_1 + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9-LABEL: name: function_store_private_s32_to_1 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 1, align 1, addrspace 5) @@ -97,29 +97,29 @@ body: | --- -name: store_private_v2s16 +name: function_store_private_v2s16 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_private_v2s16 + ; GFX6-LABEL: name: function_store_private_v2s16 ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) - ; GFX9-LABEL: name: store_private_v2s16 + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9-LABEL: name: function_store_private_v2s16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -128,29 +128,29 @@ body: | --- -name: store_private_p3 +name: function_store_private_p3 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_private_p3 + ; GFX6-LABEL: name: function_store_private_p3 ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) - ; GFX9-LABEL: name: store_private_p3 + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9-LABEL: name: function_store_private_p3 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -159,29 +159,29 @@ body: | --- -name: store_private_p5 +name: function_store_private_p5 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 
stackPtrOffsetReg: $sgpr32 body: | bb.0: liveins: $vgpr0, $vgpr1 - ; GFX6-LABEL: name: store_private_p5 + ; GFX6-LABEL: name: function_store_private_p5 ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) - ; GFX9-LABEL: name: store_private_p5 + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9-LABEL: name: function_store_private_p5 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -190,13 +190,13 @@ body: | --- -name: store_private_s32_to_1_fi_offset_4095 +name: function_store_private_s32_to_1_fi_offset_4095 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 stack: - { id: 0, size: 4096, alignment: 4 } @@ -204,13 +204,13 @@ stack: body: | bb.0: - ; GFX6-LABEL: name: store_private_s32_to_1_fi_offset_4095 + ; GFX6-LABEL: name: function_store_private_s32_to_1_fi_offset_4095 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec ; GFX6: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) - ; GFX9-LABEL: name: store_private_s32_to_1_fi_offset_4095 + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9-LABEL: name: function_store_private_s32_to_1_fi_offset_4095 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_FRAME_INDEX %stack.0 @@ -223,13 +223,13 @@ body: | --- -name: store_private_s32_to_1_constant_4095 +name: function_store_private_s32_to_1_constant_4095 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 stack: - { id: 0, size: 4096, alignment: 4 } @@ -237,12 +237,12 @@ stack: body: | bb.0: - ; GFX6-LABEL: name: store_private_s32_to_1_constant_4095 + ; GFX6-LABEL: name: function_store_private_s32_to_1_constant_4095 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec 
:: (store 1, addrspace 5) - ; GFX9-LABEL: name: store_private_s32_to_1_constant_4095 + ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9-LABEL: name: function_store_private_s32_to_1_constant_4095 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4095 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store 1, align 1, addrspace 5) @@ -251,13 +251,13 @@ body: | --- -name: store_private_s32_to_1_constant_4096 +name: function_store_private_s32_to_1_constant_4096 legalized: true regBankSelected: true tracksRegLiveness: true machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 stack: - { id: 0, size: 4096, alignment: 4 } @@ -265,14 +265,291 @@ stack: body: | bb.0: - ; GFX6-LABEL: name: store_private_s32_to_1_constant_4096 + ; GFX6-LABEL: name: function_store_private_s32_to_1_constant_4096 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) - ; GFX9-LABEL: name: store_private_s32_to_1_constant_4096 + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9-LABEL: name: function_store_private_s32_to_1_constant_4096 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + %0:vgpr(p5) = G_CONSTANT i32 4096 + %1:vgpr(s32) = G_CONSTANT i32 0 + G_STORE %1, %0 :: (store 1, align 1, addrspace 5) + +... 
+
+---
+
+name: kernel_store_private_s32_to_4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_s32_to_4
+    ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_s32_to_4
+    ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(p5) = COPY $vgpr1
+    G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_s32_to_2
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_s32_to_2
+    ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_s32_to_2
+    ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(p5) = COPY $vgpr1
+    G_STORE %0, %1 :: (store 2, align 2, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_s32_to_1
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_s32_to_1
+    ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_s32_to_1
+    ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    %0:vgpr(s32) = COPY $vgpr0
+    %1:vgpr(p5) = COPY $vgpr1
+    G_STORE %0, %1 :: (store 1, align 1, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_v2s16
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_v2s16
+    ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_v2s16
+    ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    %0:vgpr(<2 x s16>) = COPY $vgpr0
+    %1:vgpr(p5) = COPY $vgpr1
+    G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_p3
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_p3
+    ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_p3
+    ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    %0:vgpr(p3) = COPY $vgpr0
+    %1:vgpr(p5) = COPY $vgpr1
+    G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_p5
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_p5
+    ; GFX6: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_p5
+    ; GFX9: liveins: $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    %0:vgpr(p5) = COPY $vgpr0
+    %1:vgpr(p5) = COPY $vgpr1
+    G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_s32_to_1_fi_offset_4095
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+stack:
+  - { id: 0, size: 4096, alignment: 4 }
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095
+    ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
+    ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
+    ; GFX6: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_s32_to_1_fi_offset_4095
+    ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    %0:vgpr(p5) = G_FRAME_INDEX %stack.0
+    %1:vgpr(s32) = G_CONSTANT i32 4095
+    %2:vgpr(p5) = G_PTR_ADD %0, %1
+    %3:vgpr(s32) = G_CONSTANT i32 0
+    G_STORE %3, %2 :: (store 1, align 1, addrspace 5)
+
+...
+
+---
+
+name: kernel_store_private_s32_to_1_constant_4095
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+machineFunctionInfo:
+  isEntryFunction: true
+  scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
+stack:
+  - { id: 0, size: 4096, alignment: 4 }
+
+body: |
+  bb.0:
+    liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+
+    ; GFX6-LABEL: name: kernel_store_private_s32_to_1_constant_4095
+    ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9-LABEL: name: kernel_store_private_s32_to_1_constant_4095
+    ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
+    ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    %0:vgpr(p5) = G_CONSTANT i32 4095
+    %1:vgpr(s32) = G_CONSTANT i32 0
+    G_STORE %1, %0 :: (store 1, align 1, addrspace 5)
+
+...
+ +--- + +name: kernel_store_private_s32_to_1_constant_4096 +legalized: true +regBankSelected: true +tracksRegLiveness: true +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 +stack: + - { id: 0, size: 4096, alignment: 4 } + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + + ; GFX6-LABEL: name: kernel_store_private_s32_to_1_constant_4096 + ; GFX6: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9-LABEL: name: kernel_store_private_s32_to_1_constant_4096 + ; GFX9: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4096 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store 1, align 1, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll new file mode 100644 index 0000000000000..efdecf7fb49fe --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll @@ -0,0 +1,318 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +; RUN: llc -march=amdgcn -global-isel -stop-after=irtranslator %s -o - | FileCheck %s + +define i16 @uaddsat_i16(i16 %lhs, i16 %rhs) { + ; CHECK-LABEL: name: uaddsat_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[UADDSAT:%[0-9]+]]:_(s16) = G_UADDSAT [[TRUNC]], [[TRUNC1]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UADDSAT]](s16) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 + %res = call i16 @llvm.uadd.sat.i16(i16 %lhs, i16 %rhs) + ret i16 %res +} +declare i16 @llvm.uadd.sat.i16(i16, i16) + +define i32 @uaddsat_i32(i32 %lhs, i32 %rhs) { + ; CHECK-LABEL: name: uaddsat_i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[UADDSAT:%[0-9]+]]:_(s32) = G_UADDSAT [[COPY]], [[COPY1]] + ; CHECK: $vgpr0 = COPY [[UADDSAT]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 + %res = call i32 @llvm.uadd.sat.i32(i32 %lhs, i32 %rhs) + ret i32 %res +} +declare i32 @llvm.uadd.sat.i32(i32, i32) + +define i64 @uaddsat_i64(i64 %lhs, i64 %rhs) { + ; CHECK-LABEL: name: uaddsat_i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + 
; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[UADDSAT:%[0-9]+]]:_(s64) = G_UADDSAT [[MV]], [[MV1]] + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UADDSAT]](s64) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]] + ; CHECK: S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1 + %res = call i64 @llvm.uadd.sat.i64(i64 %lhs, i64 %rhs) + ret i64 %res +} +declare i64 @llvm.uadd.sat.i64(i64, i64) + +define <2 x i32> @uaddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { + ; CHECK-LABEL: name: uaddsat_v2i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[UADDSAT:%[0-9]+]]:_(<2 x s32>) = G_UADDSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UADDSAT]](<2 x s32>) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]] + ; CHECK: S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1 + %res = call <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + ret <2 x i32> %res +} +declare <2 x i32> @llvm.uadd.sat.v2i32(<2 x i32>, <2 x i32>) + +define i16 @saddsat_i16(i16 %lhs, i16 %rhs) { + ; CHECK-LABEL: name: saddsat_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[SADDSAT:%[0-9]+]]:_(s16) = G_SADDSAT [[TRUNC]], [[TRUNC1]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SADDSAT]](s16) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 + %res = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs) + ret i16 %res +} +declare i16 @llvm.sadd.sat.i16(i16, i16) + +define i32 @saddsat_i32(i32 %lhs, i32 %rhs) { + ; CHECK-LABEL: name: saddsat_i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[SADDSAT:%[0-9]+]]:_(s32) = G_SADDSAT [[COPY]], [[COPY1]] + ; CHECK: $vgpr0 = COPY [[SADDSAT]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 + %res = call i32 @llvm.sadd.sat.i32(i32 %lhs, i32 %rhs) + ret i32 %res +} +declare i32 @llvm.sadd.sat.i32(i32, i32) + +define 
i64 @saddsat_i64(i64 %lhs, i64 %rhs) { + ; CHECK-LABEL: name: saddsat_i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[SADDSAT:%[0-9]+]]:_(s64) = G_SADDSAT [[MV]], [[MV1]] + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SADDSAT]](s64) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]] + ; CHECK: S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1 + %res = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) + ret i64 %res +} +declare i64 @llvm.sadd.sat.i64(i64, i64) + +define <2 x i32> @saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { + ; CHECK-LABEL: name: saddsat_v2i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[SADDSAT:%[0-9]+]]:_(<2 x s32>) = G_SADDSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SADDSAT]](<2 x s32>) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]] + ; CHECK: S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1 + %res = call <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + ret <2 x i32> %res +} +declare <2 x i32> @llvm.sadd.sat.v2i32(<2 x i32>, <2 x i32>) + +define i16 @usubsat_i16(i16 %lhs, i16 %rhs) { + ; CHECK-LABEL: name: usubsat_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[USUBSAT:%[0-9]+]]:_(s16) = G_USUBSAT [[TRUNC]], [[TRUNC1]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[USUBSAT]](s16) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 + %res = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) + ret i16 %res +} +declare i16 @llvm.usub.sat.i16(i16, i16) + +define i32 @usubsat_i32(i32 %lhs, i32 %rhs) { + ; CHECK-LABEL: name: usubsat_i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[USUBSAT:%[0-9]+]]:_(s32) = G_USUBSAT [[COPY]], [[COPY1]] + ; CHECK: 
$vgpr0 = COPY [[USUBSAT]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 + %res = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) + ret i32 %res +} +declare i32 @llvm.usub.sat.i32(i32, i32) + +define i64 @usubsat_i64(i64 %lhs, i64 %rhs) { + ; CHECK-LABEL: name: usubsat_i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[USUBSAT:%[0-9]+]]:_(s64) = G_USUBSAT [[MV]], [[MV1]] + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[USUBSAT]](s64) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]] + ; CHECK: S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1 + %res = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs) + ret i64 %res +} +declare i64 @llvm.usub.sat.i64(i64, i64) + +define <2 x i32> @usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { + ; CHECK-LABEL: name: usubsat_v2i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[USUBSAT:%[0-9]+]]:_(<2 x s32>) = G_USUBSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[USUBSAT]](<2 x s32>) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]] + ; CHECK: S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1 + %res = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + ret <2 x i32> %res +} +declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) + +define i16 @ssubsat_i16(i16 %lhs, i16 %rhs) { + ; CHECK-LABEL: name: ssubsat_i16 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[SSUBSAT:%[0-9]+]]:_(s16) = G_SSUBSAT [[TRUNC]], [[TRUNC1]] + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[SSUBSAT]](s16) + ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 + %res = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs) + ret i16 %res +} +declare i16 @llvm.ssub.sat.i16(i16, i16) + +define i32 @ssubsat_i32(i32 %lhs, i32 %rhs) { + ; CHECK-LABEL: name: ssubsat_i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: 
liveins: $vgpr0, $vgpr1, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[SSUBSAT:%[0-9]+]]:_(s32) = G_SSUBSAT [[COPY]], [[COPY1]] + ; CHECK: $vgpr0 = COPY [[SSUBSAT]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY2]] + ; CHECK: S_SETPC_B64_return [[COPY3]], implicit $vgpr0 + %res = call i32 @llvm.ssub.sat.i32(i32 %lhs, i32 %rhs) + ret i32 %res +} +declare i32 @llvm.ssub.sat.i32(i32, i32) + +define i64 @ssubsat_i64(i64 %lhs, i64 %rhs) { + ; CHECK-LABEL: name: ssubsat_i64 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[SSUBSAT:%[0-9]+]]:_(s64) = G_SSUBSAT [[MV]], [[MV1]] + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SSUBSAT]](s64) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]] + ; CHECK: S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1 + %res = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) + ret i64 %res +} +declare i64 @llvm.ssub.sat.i64(i64, i64) + +define <2 x i32> @ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { + ; CHECK-LABEL: name: ssubsat_v2i32 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr30_sgpr31 + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3 + ; CHECK: [[COPY4:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32) + ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY3]](s32) + ; CHECK: [[SSUBSAT:%[0-9]+]]:_(<2 x s32>) = G_SSUBSAT [[BUILD_VECTOR]], [[BUILD_VECTOR1]] + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[SSUBSAT]](<2 x s32>) + ; CHECK: $vgpr0 = COPY [[UV]](s32) + ; CHECK: $vgpr1 = COPY [[UV1]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY4]] + ; CHECK: S_SETPC_B64_return [[COPY5]], implicit $vgpr0, implicit $vgpr1 + %res = call <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + ret <2 x i32> %res +} +declare <2 x i32> @llvm.ssub.sat.v2i32(<2 x i32>, <2 x i32>) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-implicit-def.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-implicit-def.mir index bea92bcd210f9..0ff5bbcb5a6d0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-implicit-def.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-implicit-def.mir @@ -174,6 +174,48 @@ body: | $vgpr0 = COPY %1 ... +--- +name: test_implicit_def_s1025 +body: | + bb.0: + + ; CHECK-LABEL: name: test_implicit_def_s1025 + ; CHECK: [[DEF:%[0-9]+]]:_(s1025) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[DEF]](s1025) + ; CHECK: $vgpr0 = COPY [[TRUNC]](s32) + %0:_(s1025) = G_IMPLICIT_DEF + %1:_(s32) = G_TRUNC %0 + $vgpr0 = COPY %1 +... 
+ +--- +name: test_implicit_def_s1056 +body: | + bb.0: + + ; CHECK-LABEL: name: test_implicit_def_s1056 + ; CHECK: [[DEF:%[0-9]+]]:_(s1056) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[DEF]](s1056) + ; CHECK: $vgpr0 = COPY [[TRUNC]](s32) + %0:_(s1056) = G_IMPLICIT_DEF + %1:_(s32) = G_TRUNC %0 + $vgpr0 = COPY %1 +... + +--- +name: test_implicit_def_s2048 +body: | + bb.0: + + ; CHECK-LABEL: name: test_implicit_def_s2048 + ; CHECK: [[DEF:%[0-9]+]]:_(s1024) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[DEF]](s1024) + ; CHECK: $vgpr0 = COPY [[TRUNC]](s32) + %0:_(s2048) = G_IMPLICIT_DEF + %1:_(s32) = G_TRUNC %0 + $vgpr0 = COPY %1 +... + --- name: test_implicit_def_v2s32 body: | @@ -294,6 +336,38 @@ body: | S_NOP 0, implicit %0 ... +--- +name: test_implicit_def_v33s32 +body: | + bb.0: + + ; CHECK-LABEL: name: test_implicit_def_v33s32 + ; CHECK: [[DEF:%[0-9]+]]:_(<33 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32), [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32), [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32), [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32), [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32), [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32), [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32), [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32), [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32), [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32), [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32), [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32), [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32), [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32), [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32), [[UV32:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[DEF]](<33 x s32>) + ; CHECK: S_NOP 0, implicit [[UV]](s32), implicit [[UV32]](s32) + %0:_(<33 x s32>) = G_IMPLICIT_DEF + %1:_(s32), %2:_(s32), %3:_(s32), %4:_(s32), %5:_(s32), %6:_(s32), %7:_(s32), %8:_(s32), %9:_(s32), %10:_(s32), %11:_(s32), %12:_(s32), %13:_(s32), %14:_(s32), %15:_(s32), %16:_(s32), %17:_(s32), %18:_(s32), %19:_(s32), %20:_(s32), %21:_(s32), %22:_(s32), %23:_(s32), %24:_(s32), %25:_(s32), %26:_(s32), %27:_(s32), %28:_(s32), %29:_(s32), %30:_(s32), %31:_(s32), %32:_(s32), %33:_(s32) = G_UNMERGE_VALUES %0 + S_NOP 0, implicit %1, implicit %33 +... + +--- +name: test_implicit_def_v64s32 +body: | + bb.0: + + ; CHECK-LABEL: name: test_implicit_def_v64s32 + ; CHECK: [[DEF:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>) + ; CHECK: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>) + ; CHECK: [[COPY2:%[0-9]+]]:_(<16 x s32>) = COPY [[DEF]](<16 x s32>) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:_(<64 x s32>) = G_CONCAT_VECTORS [[DEF]](<16 x s32>), [[COPY]](<16 x s32>), [[COPY1]](<16 x s32>), [[COPY2]](<16 x s32>) + ; CHECK: [[CONCAT_VECTORS1:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[DEF]](<16 x s32>), [[COPY]](<16 x s32>) + ; CHECK: S_NOP 0, implicit [[CONCAT_VECTORS]](<64 x s32>), implicit [[CONCAT_VECTORS1]](<32 x s32>) + %0:_(<64 x s32>) = G_IMPLICIT_DEF + %1:_(<32 x s32>), %2:_(<32 x s32>) = G_UNMERGE_VALUES %0 + S_NOP 0, implicit %0, implicit %1 +... + --- name: test_implicit_def_v2s1 body: | @@ -583,4 +657,34 @@ body: | ; CHECK: $vgpr0_vgpr1 = COPY [[DEF]](p999) %0:_(p999) = G_IMPLICIT_DEF $vgpr0_vgpr1 = COPY %0 + +... 
+ +--- +name: test_implicit_def_v2s1024 +body: | + bb.0: + + ; CHECK-LABEL: name: test_implicit_def_v2s1024 + ; CHECK: [[DEF:%[0-9]+]]:_(<2 x s1024>) = G_IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:_(s1024), [[UV1:%[0-9]+]]:_(s1024) = G_UNMERGE_VALUES [[DEF]](<2 x s1024>) + ; CHECK: S_ENDPGM 0, implicit [[UV]](s1024), implicit [[UV1]](s1024) + %0:_(<2 x s1024>) = G_IMPLICIT_DEF + %1:_(s1024), %2:_(s1024) = G_UNMERGE_VALUES %0 + S_ENDPGM 0, implicit %1, implicit %2 +... + +--- + +name: test_implicit_def_v3s1024 +body: | + bb.0: + + ; CHECK-LABEL: name: test_implicit_def_v3s1024 + ; CHECK: [[DEF:%[0-9]+]]:_(<3 x s1024>) = G_IMPLICIT_DEF + ; CHECK: [[UV:%[0-9]+]]:_(s1024), [[UV1:%[0-9]+]]:_(s1024), [[UV2:%[0-9]+]]:_(s1024) = G_UNMERGE_VALUES [[DEF]](<3 x s1024>) + ; CHECK: S_ENDPGM 0, implicit [[UV]](s1024), implicit [[UV1]](s1024), implicit [[UV2]](s1024) + %0:_(<3 x s1024>) = G_IMPLICIT_DEF + %1:_(s1024), %2:_(s1024), %3:_(s1024) = G_UNMERGE_VALUES %0 + S_ENDPGM 0, implicit %1, implicit %2, implicit %3 ... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-constant-32bit.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-constant-32bit.mir new file mode 100644 index 0000000000000..b7896fc8184d6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-constant-32bit.mir @@ -0,0 +1,168 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=CI %s + +--- +name: test_sextload_constant32bit_s64_s32_align4 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_sextload_constant32bit_s64_s32_align4 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 4, addrspace 6) + ; CI: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[SEXT]](s64) + %0:_(p6) = COPY $sgpr0 + %1:_(s64) = G_SEXTLOAD %0 :: (load 4, align 4, addrspace 6) + $vgpr0_vgpr1 = COPY %1 +... + +--- +name: test_sextload_constant32bit_s64_s32_align2 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_sextload_constant32bit_s64_s32_align2 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 2, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C1]](s64) + ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 2, addrspace 6) + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[OR]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[SEXT]](s64) + %0:_(p6) = COPY $sgpr0 + %1:_(s64) = G_SEXTLOAD %0 :: (load 4, align 2, addrspace 6) + $vgpr0_vgpr1 = COPY %1 +... 
+ +--- +name: test_sextload_constant32bit_s64_s32_align1 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_sextload_constant32bit_s64_s32_align1 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 1, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C1]](s64) + ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 1, addrspace 6) + ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C2]](s64) + ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 1, addrspace 6) + ; CI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C3]](s64) + ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load 1, addrspace 6) + ; CI: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C6]] + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY1]](s32) + ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; CI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] + ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C6]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s32) + ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C7]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[OR2]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[SEXT]](s64) + %0:_(p6) = COPY $sgpr0 + %1:_(s64) = G_SEXTLOAD %0 :: (load 4, align 1, addrspace 6) + $vgpr0_vgpr1 = COPY %1 +... + +--- +name: test_sextload_constant32bit_s32_s8_align1 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_sextload_constant32bit_s32_s8_align1 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 1, addrspace 6) + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 8 + ; CI: $vgpr0 = COPY [[SEXT_INREG]](s32) + %0:_(p6) = COPY $sgpr0 + %1:_(s32) = G_SEXTLOAD %0 :: (load 1, align 1, addrspace 6) + $vgpr0 = COPY %1 +... 
+ +--- +name: test_sextload_constant32bit_s32_s16_align2 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_sextload_constant32bit_s32_s16_align2 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 2, addrspace 6) + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[SEXT_INREG:%[0-9]+]]:_(s32) = G_SEXT_INREG [[COPY1]], 16 + ; CI: $vgpr0 = COPY [[SEXT_INREG]](s32) + %0:_(p6) = COPY $sgpr0 + %1:_(s32) = G_SEXTLOAD %0 :: (load 2, align 2, addrspace 6) + $vgpr0 = COPY %1 +... + +--- +name: test_sextload_constant32bit_s32_s16_align1 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_sextload_constant32bit_s32_s16_align1 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 1, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C1]](s64) + ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 1, addrspace 6) + ; CI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C2]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C4]] + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; CI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] + ; CI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[OR]](s16) + ; CI: $vgpr0 = COPY [[SEXT]](s32) + %0:_(p6) = COPY $sgpr0 + %1:_(s32) = G_SEXTLOAD %0 :: (load 2, align 1, addrspace 6) + $vgpr0 = COPY %1 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir index 561fcdf504e9d..aa52a681c1568 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-- -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-- -O0 -run-pass=legalizer -o - %s | FileCheck %s --- name: test_unmerge_s32_s64 @@ -656,6 +656,46 @@ body: | $vgpr1 = COPY %4 ... 
+--- +name: test_unmerge_s8_p3 +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: test_unmerge_s8_p3 + ; CHECK: [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0 + ; CHECK: [[PTRTOINT:%[0-9]+]]:_(s32) = G_PTRTOINT [[COPY]](p3) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[PTRTOINT]], [[C]](s32) + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C1]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[COPY1]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C2]] + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C1]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[PTRTOINT]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: $vgpr0 = COPY [[COPY4]](s32) + ; CHECK: $vgpr1 = COPY [[COPY5]](s32) + ; CHECK: $vgpr2 = COPY [[COPY6]](s32) + ; CHECK: $vgpr3 = COPY [[COPY7]](s32) + %0:_(p3) = COPY $vgpr0 + %1:_(s8), %2:_(s8), %3:_(s8), %4:_(s8) = G_UNMERGE_VALUES %0 + %5:_(s32) = G_ANYEXT %1 + %6:_(s32) = G_ANYEXT %2 + %7:_(s32) = G_ANYEXT %3 + %8:_(s32) = G_ANYEXT %4 + $vgpr0 = COPY %5 + $vgpr1 = COPY %6 + $vgpr2 = COPY %7 + $vgpr3 = COPY %8 + +... + --- name: test_unmerge_s16_s64 body: | @@ -694,14 +734,21 @@ body: | liveins: $vgpr0 ; CHECK-LABEL: name: test_unmerge_s1_s3 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[TRUNC:%[0-9]+]]:_(s3) = G_TRUNC [[COPY]](s32) - ; CHECK: [[UV:%[0-9]+]]:_(s1), [[UV1:%[0-9]+]]:_(s1), [[UV2:%[0-9]+]]:_(s1) = G_UNMERGE_VALUES [[TRUNC]](s3) - ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s1) - ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s1) - ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s1) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) - ; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32) - ; CHECK: $vgpr2 = COPY [[ANYEXT2]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C2]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: $vgpr0 = COPY [[COPY3]](s32) + ; CHECK: $vgpr1 = COPY [[COPY4]](s32) + ; CHECK: $vgpr2 = COPY [[COPY5]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s3) = G_TRUNC %0 %2:_(s1), %3:_(s1), %4:_(s1) = G_UNMERGE_VALUES %1 @@ -720,24 +767,51 @@ body: | liveins: $vgpr0 ; CHECK-LABEL: name: test_unmerge_s1_s8 ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0 - ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY]](s32) - ; CHECK: [[UV:%[0-9]+]]:_(s1), [[UV1:%[0-9]+]]:_(s1), [[UV2:%[0-9]+]]:_(s1), [[UV3:%[0-9]+]]:_(s1), [[UV4:%[0-9]+]]:_(s1), [[UV5:%[0-9]+]]:_(s1), [[UV6:%[0-9]+]]:_(s1), [[UV7:%[0-9]+]]:_(s1) = G_UNMERGE_VALUES [[TRUNC]](s8) - ; CHECK: 
[[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s1) - ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s1) - ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s1) - ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s1) - ; CHECK: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s1) - ; CHECK: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s1) - ; CHECK: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s1) - ; CHECK: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s1) - ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32) - ; CHECK: $vgpr1 = COPY [[ANYEXT1]](s32) - ; CHECK: $vgpr2 = COPY [[ANYEXT2]](s32) - ; CHECK: $vgpr3 = COPY [[ANYEXT3]](s32) - ; CHECK: $vgpr4 = COPY [[ANYEXT4]](s32) - ; CHECK: $vgpr5 = COPY [[ANYEXT5]](s32) - ; CHECK: $vgpr6 = COPY [[ANYEXT6]](s32) - ; CHECK: $vgpr7 = COPY [[ANYEXT7]](s32) + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 + ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CHECK: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32) + ; CHECK: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]] + ; CHECK: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C2]](s32) + ; CHECK: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 3 + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C1]] + ; CHECK: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND2]], [[C3]](s32) + ; CHECK: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 4 + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY4]], [[C1]] + ; CHECK: [[LSHR3:%[0-9]+]]:_(s32) = G_LSHR [[AND3]], [[C4]](s32) + ; CHECK: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 5 + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[COPY5]], [[C1]] + ; CHECK: [[LSHR4:%[0-9]+]]:_(s32) = G_LSHR [[AND4]], [[C5]](s32) + ; CHECK: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[COPY6]], [[C1]] + ; CHECK: [[LSHR5:%[0-9]+]]:_(s32) = G_LSHR [[AND5]], [[C6]](s32) + ; CHECK: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 7 + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[COPY7]], [[C1]] + ; CHECK: [[LSHR6:%[0-9]+]]:_(s32) = G_LSHR [[AND6]], [[C7]](s32) + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[LSHR]](s32) + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[LSHR1]](s32) + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[LSHR2]](s32) + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[LSHR3]](s32) + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[LSHR4]](s32) + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[LSHR5]](s32) + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[LSHR6]](s32) + ; CHECK: $vgpr0 = COPY [[COPY8]](s32) + ; CHECK: $vgpr1 = COPY [[COPY9]](s32) + ; CHECK: $vgpr2 = COPY [[COPY10]](s32) + ; CHECK: $vgpr3 = COPY [[COPY11]](s32) + ; CHECK: $vgpr4 = COPY [[COPY12]](s32) + ; CHECK: $vgpr5 = COPY [[COPY13]](s32) + ; CHECK: $vgpr6 = COPY [[COPY14]](s32) + ; CHECK: $vgpr7 = COPY [[COPY15]](s32) %0:_(s32) = COPY $vgpr0 %1:_(s8) = G_TRUNC %0 %2:_(s1), %3:_(s1), %4:_(s1), %5:_(s1), %6:_(s1), %7:_(s1), %8:_(s1), %9:_(s1) = G_UNMERGE_VALUES %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-constant-32bit.mir 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-constant-32bit.mir new file mode 100644 index 0000000000000..9d342e8d6f5e7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-constant-32bit.mir @@ -0,0 +1,170 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire -O0 -run-pass=legalizer %s -o - | FileCheck -check-prefix=CI %s + +--- +name: test_zextload_constant32bit_s64_s32_align4 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_zextload_constant32bit_s64_s32_align4 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 4, addrspace 6) + ; CI: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) + %0:_(p6) = COPY $sgpr0 + %1:_(s64) = G_ZEXTLOAD %0 :: (load 4, align 4, addrspace 6) + $vgpr0_vgpr1 = COPY %1 +... + +--- +name: test_zextload_constant32bit_s64_s32_align2 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_zextload_constant32bit_s64_s32_align2 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 2, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C1]](s64) + ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 2, addrspace 6) + ; CI: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C2]] + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C2]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CI: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]] + ; CI: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[OR]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[ZEXT]](s64) + %0:_(p6) = COPY $sgpr0 + %1:_(s64) = G_ZEXTLOAD %0 :: (load 4, align 2, addrspace 6) + $vgpr0_vgpr1 = COPY %1 +... 
+ +--- +name: test_zextload_constant32bit_s64_s32_align1 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_zextload_constant32bit_s64_s32_align1 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 1, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C1]](s64) + ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 1, addrspace 6) + ; CI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CI: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C2]](s64) + ; CI: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p4) :: (load 1, addrspace 6) + ; CI: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 3 + ; CI: [[PTR_ADD2:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C3]](s64) + ; CI: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p4) :: (load 1, addrspace 6) + ; CI: [[C4:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C4]] + ; CI: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[C5]](s32) + ; CI: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI: [[COPY2:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C6]] + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[COPY1]](s32) + ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; CI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] + ; CI: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD2]](s32) + ; CI: [[AND2:%[0-9]+]]:_(s16) = G_AND [[TRUNC2]], [[C4]] + ; CI: [[COPY3:%[0-9]+]]:_(s32) = COPY [[LOAD3]](s32) + ; CI: [[AND3:%[0-9]+]]:_(s32) = G_AND [[COPY3]], [[C6]] + ; CI: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[AND3]], [[C5]](s32) + ; CI: [[TRUNC3:%[0-9]+]]:_(s16) = G_TRUNC [[SHL1]](s32) + ; CI: [[OR1:%[0-9]+]]:_(s16) = G_OR [[AND2]], [[TRUNC3]] + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s16) + ; CI: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CI: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[ZEXT1]], [[C7]](s32) + ; CI: [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT]], [[SHL2]] + ; CI: [[ZEXT2:%[0-9]+]]:_(s64) = G_ZEXT [[OR2]](s32) + ; CI: $vgpr0_vgpr1 = COPY [[ZEXT2]](s64) + %0:_(p6) = COPY $sgpr0 + %1:_(s64) = G_ZEXTLOAD %0 :: (load 4, align 1, addrspace 6) + $vgpr0_vgpr1 = COPY %1 +... + +--- +name: test_zextload_constant32bit_s32_s8_align1 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_zextload_constant32bit_s32_s8_align1 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 1, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: $vgpr0 = COPY [[AND]](s32) + %0:_(p6) = COPY $sgpr0 + %1:_(s32) = G_ZEXTLOAD %0 :: (load 1, align 1, addrspace 6) + $vgpr0 = COPY %1 +... 
+ +--- +name: test_zextload_constant32bit_s32_s16_align2 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_zextload_constant32bit_s32_s16_align2 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 2, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]] + ; CI: $vgpr0 = COPY [[AND]](s32) + %0:_(p6) = COPY $sgpr0 + %1:_(s32) = G_ZEXTLOAD %0 :: (load 2, align 2, addrspace 6) + $vgpr0 = COPY %1 +... + +--- +name: test_zextload_constant32bit_s32_s16_align1 +body: | + bb.0: + liveins: $sgpr0 + + ; CI-LABEL: name: test_zextload_constant32bit_s32_s16_align1 + ; CI: [[COPY:%[0-9]+]]:_(p6) = COPY $sgpr0 + ; CI: [[C:%[0-9]+]]:_(p6) = G_CONSTANT i32 0 + ; CI: [[MV:%[0-9]+]]:_(p4) = G_MERGE_VALUES [[COPY]](p6), [[C]](p6) + ; CI: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[MV]](p4) :: (load 1, addrspace 6) + ; CI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 + ; CI: [[PTR_ADD:%[0-9]+]]:_(p4) = G_PTR_ADD [[MV]], [[C1]](s64) + ; CI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p4) :: (load 1, addrspace 6) + ; CI: [[C2:%[0-9]+]]:_(s16) = G_CONSTANT i16 255 + ; CI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[LOAD]](s32) + ; CI: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C2]] + ; CI: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 + ; CI: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CI: [[COPY1:%[0-9]+]]:_(s32) = COPY [[LOAD1]](s32) + ; CI: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C4]] + ; CI: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C3]](s32) + ; CI: [[TRUNC1:%[0-9]+]]:_(s16) = G_TRUNC [[SHL]](s32) + ; CI: [[OR:%[0-9]+]]:_(s16) = G_OR [[AND]], [[TRUNC1]] + ; CI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16) + ; CI: $vgpr0 = COPY [[ZEXT]](s32) + %0:_(p6) = COPY $sgpr0 + %1:_(s32) = G_ZEXTLOAD %0 :: (load 2, align 1, addrspace 6) + $vgpr0 = COPY %1 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll index f34c481824afd..ed240f2240bb3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll @@ -1459,9 +1459,9 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_add_u32 s21, s21, s31 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_and_b32 s24, s24, 1 -; GFX9-NEXT: s_mul_hi_u32 s32, s0, s12 +; GFX9-NEXT: s_mul_hi_u32 s33, s0, s12 ; GFX9-NEXT: s_add_i32 s23, s23, s24 -; GFX9-NEXT: s_add_u32 s21, s21, s32 +; GFX9-NEXT: s_add_u32 s21, s21, s33 ; GFX9-NEXT: s_cselect_b32 s24, 1, 0 ; GFX9-NEXT: s_and_b32 s24, s24, 1 ; GFX9-NEXT: s_add_i32 s23, s23, s24 @@ -1508,26 +1508,26 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) { ; GFX9-NEXT: s_add_u32 s22, s22, s31 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 -; GFX9-NEXT: s_mul_hi_u32 s32, s3, s10 -; GFX9-NEXT: s_add_i32 s24, s24, s25 -; GFX9-NEXT: s_add_u32 s22, s22, s32 -; GFX9-NEXT: s_cselect_b32 s25, 1, 0 -; GFX9-NEXT: s_and_b32 s25, s25, 1 -; GFX9-NEXT: s_mul_hi_u32 s33, s2, s11 +; GFX9-NEXT: s_mul_hi_u32 s33, s3, s10 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s33 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 -; GFX9-NEXT: s_mul_hi_u32 s34, s1, s12 +; GFX9-NEXT: s_mul_hi_u32 s34, s2, s11 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s34 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 -; GFX9-NEXT: s_mul_hi_u32 s35, s0, s13 +; GFX9-NEXT: s_mul_hi_u32 s35, s1, s12 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s35 ; GFX9-NEXT: s_cselect_b32 s25, 1, 0 ; GFX9-NEXT: s_and_b32 s25, s25, 1 +; GFX9-NEXT: s_mul_hi_u32 s36, s0, s13 +; GFX9-NEXT: s_add_i32 s24, s24, s25 +; GFX9-NEXT: s_add_u32 s22, s22, s36 +; GFX9-NEXT: s_cselect_b32 s25, 1, 0 +; GFX9-NEXT: s_and_b32 s25, s25, 1 ; GFX9-NEXT: s_add_i32 s24, s24, s25 ; GFX9-NEXT: s_add_u32 s22, s22, s23 ; GFX9-NEXT: s_cselect_b32 s23, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll index ea40cda4fa6be..5e39e6700401d 100644 --- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll +++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll @@ -133,7 +133,7 @@ define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32* %ptr) #0 { ; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]] ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]] ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}} -; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} +; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32* %ptr) #0 { %ftos = addrspacecast i32* %ptr to i32 addrspace(5)* store volatile i32 0, i32 addrspace(5)* %ftos @@ -231,7 +231,7 @@ define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 { ; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast: ; HSA: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}} -; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}} +; HSA: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 { %cast = addrspacecast i32* null to i32 addrspace(5)* store volatile i32 7, i32 addrspace(5)* %cast diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll 
b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index d26f51302a9ca..072a76780447e 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -45,8 +45,8 @@ ; HSA-ALLOCA: s_add_u32 s6, s6, s9 ; HSA-ALLOCA: s_lshr_b32 flat_scratch_hi, s6, 8 -; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 -; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; encoding: [0x00,0x10,0x70,0xe0 +; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; encoding: [0x00,0x10,0x70,0xe0 +; SI-ALLOCA: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; encoding: [0x00,0x10,0x70,0xe0 ; HSAOPT: [[DISPATCH_PTR:%[0-9]+]] = call noalias nonnull dereferenceable(64) i8 addrspace(4)* @llvm.amdgcn.dispatch.ptr() @@ -226,10 +226,10 @@ for.end: ; R600-VECT: MOVA_INT -; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:6 ; encoding: [0x06,0x00,0x68,0xe0 -; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x68,0xe0 +; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:6 ; encoding: [0x06,0x00,0x68,0xe0 +; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x68,0xe0 ; Loaded value is 0 or 1, so sext will become zext, so we get buffer_load_ushort instead of buffer_load_sshort. -; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} +; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 ; SI-PROMOTE-VECT: s_load_dword [[IDX:s[0-9]+]] ; SI-PROMOTE-VECT: s_mov_b32 [[SREG:s[0-9]+]], 0x10000 @@ -257,8 +257,8 @@ entry: ; SI-PROMOTE-VECT-DAG: s_lshl_b32 ; SI-PROMOTE-VECT-DAG: v_lshrrev -; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0 -; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0 +; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x60,0xe0 +; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:5 ; encoding: [0x05,0x00,0x60,0xe0 define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 { entry: %0 = alloca [2 x i8], addrspace(5) @@ -281,7 +281,7 @@ entry: ; R600-NOT: [[CHAN]]+ ; ; A total of 5 bytes should be allocated and used. 
-; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; +; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 { entry: %0 = alloca [3 x i8], align 1, addrspace(5) @@ -393,9 +393,9 @@ entry: ; FUNC-LABEL: ptrtoint: ; SI-NOT: ds_write -; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen +; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 offen ; SI: v_add_{{[iu]}}32_e32 [[ADD_OFFSET:v[0-9]+]], vcc, 5, -; SI: buffer_load_dword v{{[0-9]+}}, [[ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ; +; SI: buffer_load_dword v{{[0-9]+}}, [[ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0 offen ; define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 { %alloca = alloca [16 x i32], addrspace(5) %tmp0 = getelementptr [16 x i32], [16 x i32] addrspace(5)* %alloca, i32 0, i32 %a diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll index 78d99d881cb1b..3d75eca93cb48 100644 --- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll @@ -2,8 +2,8 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs -; TRAP-HANDLER-ENABLE: NumSgprs: 60 -; TRAP-HANDLER-DISABLE: NumSgprs: 78 +; TRAP-HANDLER-ENABLE: NumSgprs: 61 +; TRAP-HANDLER-DISABLE: NumSgprs: 79 define amdgpu_kernel void @amdhsa_trap_num_sgprs( i32 addrspace(1)* %out0, i32 %in0, i32 addrspace(1)* %out1, i32 %in1, diff --git a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll index b301384b71594..f5505c97ebd54 100644 --- a/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll +++ b/llvm/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -13,9 +13,9 @@ declare void @llvm.amdgcn.s.barrier() #2 ; SI-LABEL: {{^}}test_private_array_ptr_calc: ; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16, v{{[0-9]+}} -; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64 +; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64 ; SI-ALLOCA: s_barrier -; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64 +; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], 0 offen offset:64 ; ; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this ; alloca to a vector. It currently fails because it does not know how diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll index 9dc74cb836259..5ee5626cdedff 100644 --- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll @@ -2,11 +2,11 @@ ; FIXME: Vectorization can increase required SGPR count beyond limit. 
-; ALL-LABEL: {{^}}max_9_sgprs: +; ALL-LABEL: {{^}}max_10_sgprs: ; ALL: SGPRBlocks: 1 -; ALL: NumSGPRsForWavesPerEU: 9 -define amdgpu_kernel void @max_9_sgprs() #0 { +; ALL: NumSGPRsForWavesPerEU: 10 +define amdgpu_kernel void @max_10_sgprs() #0 { %one = load volatile i32, i32 addrspace(4)* undef %two = load volatile i32, i32 addrspace(4)* undef %three = load volatile i32, i32 addrspace(4)* undef @@ -17,7 +17,8 @@ define amdgpu_kernel void @max_9_sgprs() #0 { %eight = load volatile i32, i32 addrspace(4)* undef %nine = load volatile i32, i32 addrspace(4)* undef %ten = load volatile i32, i32 addrspace(4)* undef - call void asm sideeffect "", "s,s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, i32 %nine) + %eleven = load volatile i32, i32 addrspace(4)* undef + call void asm sideeffect "", "s,s,s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, i32 %nine, i32 %ten) store volatile i32 %one, i32 addrspace(1)* undef store volatile i32 %two, i32 addrspace(1)* undef store volatile i32 %three, i32 addrspace(1)* undef @@ -28,6 +29,7 @@ define amdgpu_kernel void @max_9_sgprs() #0 { store volatile i32 %eight, i32 addrspace(1)* undef store volatile i32 %nine, i32 addrspace(1)* undef store volatile i32 %ten, i32 addrspace(1)* undef + store volatile i32 %eleven, i32 addrspace(1)* undef ret void } diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index 299ae90837032..5cc320a3658b4 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -48,7 +48,7 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mul_lo_u32 v14, v7, v13 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v11, vcc ; GFX9-NEXT: v_mul_hi_u32 v13, v7, v13 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v14, v12 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, v14 ; GFX9-NEXT: v_mul_hi_u32 v12, v7, v10 ; GFX9-NEXT: v_mul_lo_u32 v10, v7, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v13, vcc @@ -70,7 +70,7 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mul_hi_u32 v13, v10, v8 ; GFX9-NEXT: v_mul_lo_u32 v8, v10, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v16, v14, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v12, v8 ; GFX9-NEXT: v_mul_hi_u32 v8, v10, v9 ; GFX9-NEXT: v_mul_lo_u32 v9, v10, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v14, v13, vcc @@ -94,7 +94,7 @@ define i64 @sdiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mul_hi_u32 v6, v1, v6 ; GFX9-NEXT: v_mul_hi_u32 v13, v1, v7 ; GFX9-NEXT: v_mul_lo_u32 v7, v1, v7 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v12, v10 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v11, v6, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v15, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 @@ -215,7 +215,7 @@ define i64 @udiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v14, vcc ; GFX9-NEXT: v_mul_lo_u32 v14, v5, v11 ; GFX9-NEXT: v_mul_hi_u32 v11, v5, v11 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v14, v9 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v10, v11, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 @@ -237,7 +237,7 @@ define i64 @udiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mul_hi_u32 v11, v8, v6 ; GFX9-NEXT: v_mul_lo_u32 v6, v8, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v13, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 
v6, vcc, v6, v10 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v11, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v14, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 @@ -254,7 +254,7 @@ define i64 @udiv64(i64 %a, i64 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v8, vcc ; GFX9-NEXT: v_mul_lo_u32 v8, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v9, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 @@ -376,7 +376,7 @@ define i64 @srem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mul_lo_u32 v13, v6, v12 ; GFX9-NEXT: v_mul_hi_u32 v12, v6, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v10, vcc -; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v13, v11 +; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, v11, v13 ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v10, v12, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v14, vcc ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v10, v9 @@ -398,7 +398,7 @@ define i64 @srem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mul_hi_u32 v12, v9, v7 ; GFX9-NEXT: v_mul_lo_u32 v7, v9, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v16, vcc, v15, v16, vcc -; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v11 +; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v11, v7 ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v16, v12, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v13, v14, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v7, v8 @@ -420,7 +420,7 @@ define i64 @srem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v12, v1, v6 ; GFX9-NEXT: v_mul_lo_u32 v6, v1, v6 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v11, v9 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v11 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v10, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v12, v14, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v6 @@ -539,7 +539,7 @@ define i64 @urem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v14, vcc ; GFX9-NEXT: v_mul_lo_u32 v14, v5, v11 ; GFX9-NEXT: v_mul_hi_u32 v11, v5, v11 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v14, v9 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v10, v11, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 @@ -561,7 +561,7 @@ define i64 @urem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mul_hi_u32 v11, v8, v6 ; GFX9-NEXT: v_mul_lo_u32 v6, v8, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v13, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v11, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v14, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 @@ -578,7 +578,7 @@ define i64 @urem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v8, vcc ; GFX9-NEXT: v_mul_lo_u32 v8, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v9, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 @@ -843,7 +843,7 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mul_lo_u32 v14, v7, v13 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v16, v11, vcc ; GFX9-NEXT: v_mul_hi_u32 v13, v7, v13 -; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v14, v12 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, v12, 
v14 ; GFX9-NEXT: v_mul_hi_u32 v12, v7, v10 ; GFX9-NEXT: v_mul_lo_u32 v10, v7, v10 ; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v13, vcc @@ -865,7 +865,7 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mul_hi_u32 v13, v10, v8 ; GFX9-NEXT: v_mul_lo_u32 v8, v10, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v14, vcc, v16, v14, vcc -; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12 +; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v12, v8 ; GFX9-NEXT: v_mul_hi_u32 v8, v10, v9 ; GFX9-NEXT: v_mul_lo_u32 v9, v10, v9 ; GFX9-NEXT: v_addc_co_u32_e32 v12, vcc, v14, v13, vcc @@ -889,7 +889,7 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v13, v1, v7 ; GFX9-NEXT: v_mul_lo_u32 v7, v1, v7 -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v12, v10 +; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v11, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v15, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v7 @@ -1032,7 +1032,7 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v13, v14, vcc ; GFX9-NEXT: v_mul_lo_u32 v14, v5, v11 ; GFX9-NEXT: v_mul_hi_u32 v11, v5, v11 -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v14, v9 +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, v9, v14 ; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v10, v11, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v10, vcc, v15, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v9, v8 @@ -1054,7 +1054,7 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_mul_hi_u32 v11, v8, v6 ; GFX9-NEXT: v_mul_lo_u32 v6, v8, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, v13, v15, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v10 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v15, v11, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v14, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v7 @@ -1071,7 +1071,7 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) { ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v13, v8, vcc ; GFX9-NEXT: v_mul_lo_u32 v8, v1, v4 ; GFX9-NEXT: v_mul_hi_u32 v4, v1, v4 -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v8, v6 +; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v6, v8 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, v7, v4, vcc ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v9, v12, vcc ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v4, v5 diff --git a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll index e123d80fb9569..91eb7d0add483 100644 --- a/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/byval-frame-setup.ll @@ -2,31 +2,6 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s %struct.ByValStruct = type { [4 x i32] } - -; GCN-LABEL: {{^}}void_func_byval_struct: -; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}} -; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} -; GCN-NOT: s32 - -; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}} -; GCN-NOT: s32 -define hidden void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { -entry: - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - %tmp = load volatile i32, i32 addrspace(5)* 
%arrayidx, align 4 - %add = add nsw i32 %tmp, 1 - store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4 - %add3 = add nsw i32 %tmp1, 2 - store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4 - store volatile i32 9, i32 addrspace(1)* null, align 4 - ret void -} - ; Make sure the offset is folded and function's frame register is used ; rather than the global scratch wave offset. ; GCN-LABEL: {{^}}void_func_byval_struct_use_outside_entry_block: @@ -67,331 +42,6 @@ bb0: bb1: ret void } - -; GCN-LABEL: {{^}}void_func_byval_struct_non_leaf: -; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:36 -; GCN-DAG: v_writelane_b32 v33, s34, -; GCN: s_mov_b32 s34, s32 -; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}} -; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}} -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:32 -; GCN-NOT: v_writelane_b32 v{{[0-9]+}}, s32 - -; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD0:v[0-9]+]], vcc, 1, [[LOAD0]] -; GCN: buffer_store_dword [[ADD0]], off, s[0:3], s34{{$}} - -; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:16{{$}} -; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD1:v[0-9]+]], vcc, 2, [[LOAD1]] - -; GCN: s_swappc_b64 - -; GCN: buffer_store_dword [[ADD1]], off, s[0:3], s34 offset:16{{$}} - -; GCN: v_readlane_b32 -; GCN-NOT: v_readlane_b32 s32 -; GCN-DAG: buffer_load_dword v32, off, s[0:3], s34 offset:32 -; GCN: s_sub_u32 s32, s32, 0xc00{{$}} -; GCN: v_readlane_b32 s34, v33, -; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN: s_setpc_b64 -define void @void_func_byval_struct_non_leaf(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 4 %arg1) #1 { -entry: - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 4 - %add = add nsw i32 %tmp, 1 - store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 4 - %add3 = add nsw i32 %tmp1, 2 - call void @external_void_func_void() - store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 4 - store volatile i32 9, i32 addrspace(1)* null, align 4 - ret void -} - -; GCN-LABEL: {{^}}call_void_func_byval_struct_func: -; GCN: s_mov_b32 s34, s32 -; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}} -; GCN-DAG: v_writelane_b32 - -; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 - -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s34{{$}} -; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s34 offset:16 - -; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}} -; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:4 -; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s34 offset:8 -; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s34 offset:12 - -; GCN-NOT: s_add_u32 s32, s32, 0x800 - - -; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_store_dword [[LOAD2]], off, 
s[0:3], s32 offset:8 -; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 - -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20 -; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24 -; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28 - -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 -; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28 - -; GCN: s_swappc_b64 -; GCN-NOT: v_readlane_b32 s32 -; GCN: v_readlane_b32 -; GCN-NOT: v_readlane_b32 s32 - -; GCN-NOT: s_sub_u32 s32, s32, 0x800 - -; GCN: s_sub_u32 s32, s32, 0xc00{{$}} -; GCN: v_readlane_b32 s34, v -; GCN: s_waitcnt -; GCN: s_setpc_b64 -define void @call_void_func_byval_struct_func() #1 { -entry: - %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5) - %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5) - %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp) - %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4 - call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) - ret void -} - -; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel: -; GCN: s_mov_b32 s33, s7 -; GCN-NOT: s_add_u32 s32, s32, 0x800 - -; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8 -; GCN: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 -; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24 - -; GCN-NOT: s_add_u32 s32, s32, 0x800 -; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8 -; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12 -; GCN-DAG: s_add_u32 s32, s33, 0xc00{{$}} -; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16 -; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20 - -; GCN: s_getpc_b64 - -; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 - -; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24 -; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28 -; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32 -; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36 - -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 -; GCN-DAG: 
buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28 - - -; GCN: s_swappc_b64 -; GCN-NOT: s_sub_u32 s32 -; GCN: s_endpgm -define amdgpu_kernel void @call_void_func_byval_struct_kernel() #1 { -entry: - %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5) - %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5) - %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp) - %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4 - call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) - ret void -} - -; GCN-LABEL: {{^}}void_func_byval_struct_align8: -; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s32{{$}} -; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} -; GCN-NOT: s32 - -; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s32 offset:16{{$}} -; GCN-NOT: s32 -; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:16{{$}} -; GCN-NOT: s32 -define hidden void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg0, %struct.ByValStruct addrspace(5)* byval noalias nocapture align 8 %arg1) #1 { -entry: - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - %tmp = load volatile i32, i32 addrspace(5)* %arrayidx, align 8 - %add = add nsw i32 %tmp, 1 - store volatile i32 %add, i32 addrspace(5)* %arrayidx, align 8 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - %tmp1 = load volatile i32, i32 addrspace(5)* %arrayidx2, align 8 - %add3 = add nsw i32 %tmp1, 2 - store volatile i32 %add3, i32 addrspace(5)* %arrayidx2, align 8 - store volatile i32 9, i32 addrspace(1)* null, align 4 - ret void -} - -; Make sure the byval alignment is respected in the call frame setup -; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_kernel: -; GCN: s_mov_b32 s33, s7 -; GCN-NOT: s_add_u32 s32, s32, 0x800 - -; GCN: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN: buffer_store_dword [[NINE]], off, s[0:3], s33 offset:8 -; GCN: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 -; GCN: buffer_store_dword [[THIRTEEN]], off, s[0:3], s33 offset:24 - - -; GCN-NOT: s_add_u32 s32, s32, 0x800 - -; GCN: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s33 offset:8 -; GCN: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s33 offset:12 -; GCN: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s33 offset:16 -; GCN: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s33 offset:20 - -; GCN-NOT: s_add_u32 s32, s32, 0x800 -; GCN-DAG: s_add_u32 s32, s33, 0xc00{{$}} - -; GCN: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 -; GCN: buffer_store_dword [[LOAD2]], off, s[0:3], s32 
offset:8 -; GCN: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4 -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} - - -; GCN-DAG: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s33 offset:24 -; GCN-DAG: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s33 offset:28 -; GCN-DAG: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s33 offset:32 -; GCN-DAG: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s33 offset:36 - -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 -; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], s32 offset:28 - - -; GCN: s_swappc_b64 -; GCN-NOT: s_sub_u32 s32 -; GCN: s_endpgm -define amdgpu_kernel void @call_void_func_byval_struct_align8_kernel() #1 { -entry: - %arg0 = alloca %struct.ByValStruct, align 8, addrspace(5) - %arg1 = alloca %struct.ByValStruct, align 8, addrspace(5) - %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp) - %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8 - call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) - ret void -} - -; GCN-LABEL: {{^}}call_void_func_byval_struct_align8_func: -; GCN: s_mov_b32 s34, s32 -; GCN-DAG: s_add_u32 s32, s32, 0xc00{{$}} -; GCN-DAG: v_writelane_b32 - -; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9 -; GCN-DAG: v_mov_b32_e32 [[THIRTEEN:v[0-9]+]], 13 - -; GCN-DAG: buffer_store_dword [[NINE]], off, s[0:3], s34{{$}} -; GCN-DAG: buffer_store_dword [[THIRTEEN]], off, s[0:3], s34 offset:16 - -; GCN-DAG: buffer_load_dword [[LOAD0:v[0-9]+]], off, s[0:3], s34{{$}} -; GCN-DAG: buffer_load_dword [[LOAD1:v[0-9]+]], off, s[0:3], s34 offset:4 -; GCN-DAG: buffer_load_dword [[LOAD2:v[0-9]+]], off, s[0:3], s34 offset:8 -; GCN-DAG: buffer_load_dword [[LOAD3:v[0-9]+]], off, s[0:3], s34 offset:12 - -; GCN-NOT: s_add_u32 s32, s32, 0x800 - -; GCN-DAG: buffer_store_dword [[LOAD0]], off, s[0:3], s32{{$}} -; GCN-DAG: buffer_store_dword [[LOAD1]], off, s[0:3], s32 offset:4 -; GCN-DAG: buffer_store_dword [[LOAD2]], off, s[0:3], s32 offset:8 -; GCN-DAG: buffer_store_dword [[LOAD3]], off, s[0:3], s32 offset:12 - -; GCN: buffer_load_dword [[LOAD4:v[0-9]+]], off, s[0:3], s34 offset:16 -; GCN: buffer_load_dword [[LOAD5:v[0-9]+]], off, s[0:3], s34 offset:20 -; GCN: buffer_load_dword [[LOAD6:v[0-9]+]], off, s[0:3], s34 offset:24 -; GCN: buffer_load_dword [[LOAD7:v[0-9]+]], off, s[0:3], s34 offset:28 - -; GCN: s_waitcnt vmcnt(0) -; GCN-DAG: buffer_store_dword [[LOAD4]], off, s[0:3], s32 offset:16 -; GCN-DAG: buffer_store_dword [[LOAD5]], off, s[0:3], s32 offset:20 -; GCN-DAG: buffer_store_dword [[LOAD6]], off, s[0:3], s32 offset:24 -; GCN-DAG: buffer_store_dword [[LOAD7]], off, s[0:3], 
s32 offset:28 - -; GCN: s_swappc_b64 -; GCN-NOT: v_readlane_b32 s32 -; GCN: v_readlane_b32 -; GCN-NOT: v_readlane_b32 s32 - -; GCN-NOT: s_sub_u32 s32, s32, 0x800 - -; GCN: s_sub_u32 s32, s32, 0xc00{{$}} -; GCN: v_readlane_b32 s34, v -; GCN: s_waitcnt -; GCN-NEXT: s_setpc_b64 -define void @call_void_func_byval_struct_align8_func() #0 { -entry: - %arg0 = alloca %struct.ByValStruct, align 8, addrspace(5) - %arg1 = alloca %struct.ByValStruct, align 8, addrspace(5) - %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp) - %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - store volatile i32 9, i32 addrspace(5)* %arrayidx, align 8 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 8 - call void @void_func_byval_struct_align8(%struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 8 %arg1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) - ret void -} - -; GCN-LABEL: {{^}}call_void_func_byval_struct_kernel_no_frame_pointer_elim: -define amdgpu_kernel void @call_void_func_byval_struct_kernel_no_frame_pointer_elim() #2 { -entry: - %arg0 = alloca %struct.ByValStruct, align 4, addrspace(5) - %arg1 = alloca %struct.ByValStruct, align 4, addrspace(5) - %tmp = bitcast %struct.ByValStruct addrspace(5)* %arg0 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp) - %tmp1 = bitcast %struct.ByValStruct addrspace(5)* %arg1 to i8 addrspace(5)* - call void @llvm.lifetime.start.p5i8(i64 32, i8 addrspace(5)* %tmp1) - %arrayidx = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg0, i32 0, i32 0, i32 0 - store volatile i32 9, i32 addrspace(5)* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds %struct.ByValStruct, %struct.ByValStruct addrspace(5)* %arg1, i32 0, i32 0, i32 0 - store volatile i32 13, i32 addrspace(5)* %arrayidx2, align 4 - call void @void_func_byval_struct(%struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg0, %struct.ByValStruct addrspace(5)* byval nonnull align 4 %arg1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp1) - call void @llvm.lifetime.end.p5i8(i64 32, i8 addrspace(5)* %tmp) - ret void -} - declare hidden void @external_void_func_void() #0 declare void @llvm.lifetime.start.p5i8(i64, i8 addrspace(5)* nocapture) #3 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll index faf15d0981c78..30461a55b1365 100644 --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -78,14 +78,11 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 { } ; GCN-LABEL: {{^}}test_call_external_void_func_i1_signext: -; MESA: s_mov_b32 s33, s3{{$}} -; HSA: s_mov_b32 s33, s9{{$}} ; HSA: buffer_load_ubyte [[VAR:v[0-9]+]] -; HSA: s_mov_b32 s32, s33 +; HSA: s_mov_b32 s32, 0 ; MESA-DAG: buffer_load_ubyte [[VAR:v[0-9]+]] -; MESA-DAG: s_mov_b32 s32, s33{{$}} - +; MESA-DAG: s_mov_b32 s32, 0{{$}} ; GCN: s_getpc_b64 
s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_signext@rel32@lo+4 @@ -103,13 +100,12 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 { ; FIXME: load should be scheduled before getpc ; GCN-LABEL: {{^}}test_call_external_void_func_i1_zeroext: -; MESA: s_mov_b32 s33, s3{{$}} ; HSA: buffer_load_ubyte v0 -; HSA-DAG: s_mov_b32 s32, s33{{$}} +; HSA-DAG: s_mov_b32 s32, 0{{$}} ; MESA: buffer_load_ubyte v0 -; MESA-DAG: s_mov_b32 s32, s33{{$}} +; MESA-DAG: s_mov_b32 s32, 0{{$}} ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-NEXT: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i1_zeroext@rel32@lo+4 @@ -127,14 +123,13 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 { } ; GCN-LABEL: {{^}}test_call_external_void_func_i8_imm: -; MESA-DAG: s_mov_b32 s33, s3{{$}} ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8@rel32@hi+4 ; GCN-DAG: v_mov_b32_e32 v0, 0x7b -; GCN-DAG: s_mov_b32 s32, s33{{$}} +; GCN-DAG: s_mov_b32 s32, 0{{$}} ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm @@ -145,15 +140,13 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 { ; FIXME: don't wait before call ; GCN-LABEL: {{^}}test_call_external_void_func_i8_signext: -; HSA-DAG: s_mov_b32 s33, s9{{$}} -; MESA-DAG: s_mov_b32 s33, s3{{$}} ; GCN-DAG: buffer_load_sbyte v0 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_signext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_signext@rel32@hi+4 -; GCN-DAG: s_mov_b32 s32, s3 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -165,15 +158,13 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 { } ; GCN-LABEL: {{^}}test_call_external_void_func_i8_zeroext: -; MESA-DAG: s_mov_b32 s33, s3{{$}} -; HSA-DAG: s_mov_b32 s33, s9{{$}} ; GCN-DAG: buffer_load_ubyte v0 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i8_zeroext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i8_zeroext@rel32@hi+4 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -187,7 +178,7 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_i16_imm: ; GCN-DAG: v_mov_b32_e32 v0, 0x7b{{$}} -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { @@ -196,14 +187,13 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 { } ; GCN-LABEL: {{^}}test_call_external_void_func_i16_signext: -; MESA-DAG: s_mov_b32 s33, s3{{$}} ; GCN-DAG: buffer_load_sshort v0 ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_signext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_signext@rel32@hi+4 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: 
s_waitcnt ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -215,14 +205,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 { } ; GCN-LABEL: {{^}}test_call_external_void_func_i16_zeroext: -; MESA-DAG: s_mov_b32 s33, s3{{$}} - ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i16_zeroext@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i16_zeroext@rel32@hi+4 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s_waitcnt ; GCN-NEXT: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} @@ -234,13 +222,12 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 { } ; GCN-LABEL: {{^}}test_call_external_void_func_i32_imm: -; MESA-DAG: s_mov_b32 s33, s3{{$}} ; GCN-DAG: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; GCN-DAG: s_add_u32 s[[PC_LO]], s[[PC_LO]], external_void_func_i32@rel32@lo+4 ; GCN-DAG: s_addc_u32 s[[PC_HI]], s[[PC_HI]], external_void_func_i32@rel32@hi+4 ; GCN-DAG: v_mov_b32_e32 v0, 42 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 s[30:31], s{{\[}}[[PC_LO]]:[[PC_HI]]{{\]}} ; GCN-NEXT: s_endpgm @@ -497,9 +484,7 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 { ret void } -; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: -; HSA-DAG: s_mov_b32 s33, s9 -; MESA-DAG: s_mov_b32 s33, s3{{$}} +; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm: {{.*}} ; GCN-NOT: v3 ; GCN-DAG: v_mov_b32_e32 v0, 3 @@ -616,10 +601,8 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 { } ; GCN-LABEL: {{^}}test_call_external_void_func_v32i32_i32: -; HSA-DAG: s_mov_b32 s33, s9 ; HSA-NOT: s_add_u32 s32 -; MESA-DAG: s_mov_b32 s33, s3{{$}} ; MESA-NOT: s_add_u32 s32 ; GCN-DAG: buffer_load_dword [[VAL1:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}} @@ -670,19 +653,19 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 { ; GCN-LABEL: {{^}}test_call_external_void_func_byval_struct_i8_i32: ; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3 ; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8 -; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], s33 offset:8 -; MESA-DAG: buffer_store_dword [[VAL1]], off, s[36:39], s33 offset:12 +; MESA-DAG: buffer_store_byte [[VAL0]], off, s[36:39], 0 offset:8 +; MESA-DAG: buffer_store_dword [[VAL1]], off, s[36:39], 0 offset:12 -; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], s33 offset:8 -; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], s33 offset:12 +; HSA-DAG: buffer_store_byte [[VAL0]], off, s[0:3], 0 offset:8 +; HSA-DAG: buffer_store_dword [[VAL1]], off, s[0:3], 0 offset:12 -; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], s33 offset:8 -; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], s33 offset:12 +; HSA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[0:3], 0 offset:8 +; HSA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[0:3], 0 offset:12 -; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], s33 offset:8 -; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], s33 offset:12 +; MESA: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s[36:39], 0 offset:8 +; MESA: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s[36:39], 0 offset:12 -; GCN-DAG: s_add_u32 [[SP:s[0-9]+]], s33, 0x400{{$}} +; GCN-DAG: s_movk_i32 [[SP:s[0-9]+]], 0x400{{$}} ; HSA-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s[0:3], 
[[SP]]{{$}} ; HSA-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s[0:3], [[SP]] offset:4 @@ -703,23 +686,22 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0 } ; GCN-LABEL: {{^}}test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32: -; MESA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x800{{$}} -; HSA-DAG: s_add_u32 [[SP:s[0-9]+]], [[FP_REG:s[0-9]+]], 0x800{{$}} +; GCN-DAG: s_movk_i32 [[SP:s[0-9]+]], 0x800{{$}} ; GCN-DAG: v_mov_b32_e32 [[VAL0:v[0-9]+]], 3 ; GCN-DAG: v_mov_b32_e32 [[VAL1:v[0-9]+]], 8 -; GCN-DAG: buffer_store_byte [[VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8 -; GCN-DAG: buffer_store_dword [[VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12 +; GCN-DAG: buffer_store_byte [[VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-DAG: buffer_store_dword [[VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 -; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:8 -; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:12 +; GCN-DAG: buffer_load_dword [[RELOAD_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8 +; GCN-DAG: buffer_load_dword [[RELOAD_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; GCN-NOT: s_add_u32 [[SP]] ; GCN-DAG: buffer_store_dword [[RELOAD_VAL0]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]]{{$}} ; GCN-DAG: buffer_store_dword [[RELOAD_VAL1]], off, s{{\[[0-9]+:[0-9]+\]}}, [[SP]] offset:4 ; GCN: s_swappc_b64 -; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:16 -; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, [[FP_REG]] offset:20 +; GCN-DAG: buffer_load_ubyte [[LOAD_OUT_VAL0:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 +; GCN-DAG: buffer_load_dword [[LOAD_OUT_VAL1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:20 ; GCN-NOT: s_sub_u32 [[SP]] ; GCN: buffer_store_byte [[LOAD_OUT_VAL0]], off diff --git a/llvm/test/CodeGen/AMDGPU/call-constant.ll b/llvm/test/CodeGen/AMDGPU/call-constant.ll index 19aadfc96ad13..11f4b3c0b913a 100644 --- a/llvm/test/CodeGen/AMDGPU/call-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/call-constant.ll @@ -3,9 +3,8 @@ ; FIXME: Emitting unnecessary flat_scratch setup ; GCN-LABEL: {{^}}test_call_undef: -; GCN: s_mov_b32 s8, s7 ; GCN: s_mov_b32 flat_scratch_lo, s5 -; GCN: s_add_u32 s4, s4, s8 +; GCN: s_add_u32 s4, s4, s7 ; GCN: s_lshr_b32 ; GCN: s_endpgm define amdgpu_kernel void @test_call_undef() #0 { @@ -24,9 +23,8 @@ define i32 @test_tail_call_undef() #0 { } ; GCN-LABEL: {{^}}test_call_null: -; GCN: s_mov_b32 s8, s7 ; GCN: s_mov_b32 flat_scratch_lo, s5 -; GCN: s_add_u32 s4, s4, s8 +; GCN: s_add_u32 s4, s4, s7 ; GCN: s_lshr_b32 ; GCN: s_endpgm define amdgpu_kernel void @test_call_null() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll index de0086495870c..c184ce778fa85 100644 --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -13,14 +13,14 @@ define void @use_vcc() #1 { } ; GCN-LABEL: {{^}}indirect_use_vcc: -; GCN: v_writelane_b32 v32, s34, 2 +; GCN: v_writelane_b32 v32, s33, 2 ; GCN: v_writelane_b32 v32, s30, 0 ; GCN: v_writelane_b32 v32, s31, 1 ; GCN: s_swappc_b64 ; GCN: v_readlane_b32 s4, v32, 0 ; GCN: v_readlane_b32 s5, v32, 1 -; GCN: v_readlane_b32 s34, v32, 2 -; GCN: ; NumSgprs: 37 +; GCN: v_readlane_b32 s33, v32, 2 +; GCN: ; 
NumSgprs: 36 ; GCN: ; NumVgprs: 33 define void @indirect_use_vcc() #1 { call void @use_vcc() @@ -29,8 +29,8 @@ define void @indirect_use_vcc() #1 { ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel: ; GCN: is_dynamic_callstack = 0 -; CI: ; NumSgprs: 39 -; VI-NOBUG: ; NumSgprs: 41 +; CI: ; NumSgprs: 38 +; VI-NOBUG: ; NumSgprs: 40 ; VI-BUG: ; NumSgprs: 96 ; GCN: ; NumVgprs: 33 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 { @@ -48,8 +48,8 @@ define void @use_flat_scratch() #1 { } ; GCN-LABEL: {{^}}indirect_use_flat_scratch: -; CI: ; NumSgprs: 39 -; VI: ; NumSgprs: 41 +; CI: ; NumSgprs: 38 +; VI: ; NumSgprs: 40 ; GCN: ; NumVgprs: 33 define void @indirect_use_flat_scratch() #1 { call void @use_flat_scratch() @@ -58,8 +58,8 @@ define void @indirect_use_flat_scratch() #1 { ; GCN-LABEL: {{^}}indirect_2level_use_flat_scratch_kernel: ; GCN: is_dynamic_callstack = 0 -; CI: ; NumSgprs: 39 -; VI-NOBUG: ; NumSgprs: 41 +; CI: ; NumSgprs: 38 +; VI-NOBUG: ; NumSgprs: 40 ; VI-BUG: ; NumSgprs: 96 ; GCN: ; NumVgprs: 33 define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll index ee77007ef59ed..ebfc93e238502 100644 --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -5,11 +5,10 @@ declare hidden void @external_void_func_void() #0 ; GCN-LABEL: {{^}}test_kernel_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: -; GCN: s_mov_b32 s33, s7 ; GCN: s_getpc_b64 s[34:35] ; GCN-NEXT: s_add_u32 s34, s34, ; GCN-NEXT: s_addc_u32 s35, s35, -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 s[30:31], s[34:35] ; GCN-NEXT: #ASMSTART @@ -24,9 +23,9 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_ ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; GCN: buffer_store_dword -; GCN: v_writelane_b32 v32, s34, 4 -; GCN: v_writelane_b32 v32, s36, 0 -; GCN: v_writelane_b32 v32, s37, 1 +; GCN: v_writelane_b32 v32, s33, 4 +; GCN: v_writelane_b32 v32, s34, 0 +; GCN: v_writelane_b32 v32, s35, 1 ; GCN: v_writelane_b32 v32, s30, 2 ; GCN: v_writelane_b32 v32, s31, 3 @@ -36,10 +35,10 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_ ; GCN-NEXT: s_swappc_b64 ; GCN-DAG: v_readlane_b32 s4, v32, 2 ; GCN-DAG: v_readlane_b32 s5, v32, 3 -; GCN: v_readlane_b32 s37, v32, 1 -; GCN: v_readlane_b32 s36, v32, 0 +; GCN: v_readlane_b32 s35, v32, 1 +; GCN: v_readlane_b32 s34, v32, 0 -; GCN: v_readlane_b32 s34, v32, 4 +; GCN: v_readlane_b32 s33, v32, 4 ; GCN: buffer_load_dword ; GCN: s_setpc_b64 define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 { @@ -51,14 +50,14 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa ; GCN-LABEL: {{^}}test_func_call_external_void_funcx2: ; GCN: buffer_store_dword v32 -; GCN: v_writelane_b32 v32, s34, 4 +; GCN: v_writelane_b32 v32, s33, 4 -; GCN: s_mov_b32 s34, s32 +; GCN: s_mov_b32 s33, s32 ; GCN: s_add_u32 s32, s32, 0x400 ; GCN: s_swappc_b64 ; GCN-NEXT: s_swappc_b64 -; GCN: v_readlane_b32 s34, v32, 4 +; GCN: v_readlane_b32 s33, v32, 4 ; GCN: buffer_load_dword v32, define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() @@ -105,9 +104,9 @@ define amdgpu_kernel void 
@test_call_void_func_void_clobber_vcc(i32 addrspace(1) } ; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_s31: -; GCN: s_mov_b32 s34, s31 +; GCN: s_mov_b32 s33, s31 ; GCN-NEXT: s_swappc_b64 -; GCN-NEXT: s_mov_b32 s31, s34 +; GCN-NEXT: s_mov_b32 s31, s33 define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace(1)* %out) #0 { %s31 = call i32 asm sideeffect "; def $0", "={s31}"() call void @external_void_func_void() @@ -129,11 +128,10 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace ; FIXME: What is the expected behavior for reserved registers here? ; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s33: -; GCN: s_mov_b32 s33, s9 -; GCN: s_mov_b32 s32, s33 ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 +; GCN: s_mov_b32 s32, 0 ; GCN: #ASMSTART ; GCN-NEXT: ; def s33 ; GCN-NEXT: #ASMEND @@ -150,14 +148,13 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s33(i32 addrspace( ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: -; GCN: s_mov_b32 s33, s9 -; GCN-NOT: s34 +; GCN-LABEL: {{^}}test_call_void_func_void_preserves_s34: {{.*}} ; GCN-NOT: s34 ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 +; GCN: s_mov_b32 s32, 0 ; GCN-NOT: s34 ; GCN: ;;#ASMSTART @@ -180,15 +177,14 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace( ret void } -; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32: -; GCN: s_mov_b32 s33, s9 +; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32: {{.*}} ; GCN-NOT: v32 ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4 +; GCN: s_mov_b32 s32, 0 ; GCN-NOT: v32 -; GCN-DAG: s_mov_b32 s32, s33 ; GCN: ;;#ASMSTART ; GCN-NEXT: ; def v32 @@ -234,12 +230,10 @@ define hidden void @void_func_void_clobber_s34() #2 { } ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s33: -; GCN: s_mov_b32 s33, s7 - ; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { @@ -248,11 +242,10 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 { } ; GCN-LABEL: {{^}}test_call_void_func_void_clobber_s34: -; GCN: s_mov_b32 s33, s7 ; GCN: s_getpc_b64 ; GCN-NEXT: s_add_u32 ; GCN-NEXT: s_addc_u32 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll index 72423ec4189e5..6c9a9af159bdd 100644 --- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll +++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll @@ -6,16 +6,17 @@ define amdgpu_kernel void @call_memory_arg_load(i32 addrspace(3)* %ptr, i32) #0 ; GCN-LABEL: call_memory_arg_load: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_mov_b32 s33, s9 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: 
s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: ds_read_b32 v0, v0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm %vgpr = load volatile i32, i32 addrspace(3)* %ptr @@ -28,19 +29,20 @@ define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_memory_no_dep: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GCN-NEXT: s_mov_b32 s33, s9 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 -; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: global_store_dword v[0:1], v2, off ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, func@rel32@hi+4 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: s_endpgm store i32 0, i32 addrspace(1)* %ptr @@ -52,15 +54,16 @@ define amdgpu_kernel void @call_memory_no_dep(i32 addrspace(1)* %ptr, i32) #0 { define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GCN-NEXT: s_mov_b32 s33, s9 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: v_mov_b32_e32 v32, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, s34 @@ -75,15 +78,16 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32) define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_no_wait_after_call_return_val: ; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 -; GCN-NEXT: s_mov_b32 s33, s9 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func.return@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, func.return@rel32@hi+4 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, s34 ; GCN-NEXT: v_mov_b32_e32 v2, s35 @@ -98,15 +102,16 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(i32 addrspace(1)* define amdgpu_kernel void @call_got_load(i32 addrspace(1)* %ptr, i32) #0 { ; GCN-LABEL: call_got_load: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s33, s9 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] 
; GCN-NEXT: s_add_u32 s4, s4, got.func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, got.func@gotpcrel32@hi+4 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index c42cadbc80c57..e989ea07926dc 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -12,9 +12,9 @@ define void @callee_no_stack() #0 { ; GCN-LABEL: {{^}}callee_no_stack_no_fp_elim_all: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_mov_b32 s4, s34 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: s_mov_b32 s34, s4 +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_setpc_b64 define void @callee_no_stack_no_fp_elim_all() #1 { ret void @@ -46,13 +46,13 @@ define void @callee_with_stack() #0 { ; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_all: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt -; GCN-NEXT: s_mov_b32 s4, s34 -; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x200 ; GCN-NEXT: v_mov_b32_e32 v0, 0{{$}} -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s34 offset:4{{$}} +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4{{$}} ; GCN-NEXT: s_sub_u32 s32, s32, 0x200 -; GCN-NEXT: s_mov_b32 s34, s4 +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_all() #1 { @@ -80,14 +80,14 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 { ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN: v_writelane_b32 [[CSR_VGPR]], s34, 2 -; GCN-DAG: s_mov_b32 s34, s32 +; GCN: v_writelane_b32 [[CSR_VGPR]], s33, 2 +; GCN-DAG: s_mov_b32 s33, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s34{{$}} +; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} ; GCN: s_swappc_b64 @@ -95,7 +95,7 @@ define void @callee_with_stack_no_fp_elim_non_leaf() #2 { ; GCN-DAG: v_readlane_b32 s4, [[CSR_VGPR]] ; GCN: s_sub_u32 s32, s32, 0x400{{$}} -; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], 2 +; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] @@ -121,7 +121,7 @@ define void @callee_with_stack_and_call() #0 { ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GCN-DAG: s_add_u32 s32, s32, 0x400 -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s34, [[FP_SPILL_LANE:[0-9]+]] +; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s33, [[FP_SPILL_LANE:[0-9]+]] ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 @@ -131,7 +131,7 @@ define void @callee_with_stack_and_call() #0 { ; GCN-DAG: v_readlane_b32 s5, v32, 1 ; GCN: s_sub_u32 s32, 
s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, [[CSR_VGPR]], [[FP_SPILL_LANE]] +; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], [[FP_SPILL_LANE]] ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] @@ -204,20 +204,20 @@ define void @spill_only_csr_sgpr() { ; TODO: Can the SP inc/deec be remvoed? ; GCN-LABEL: {{^}}callee_with_stack_no_fp_elim_csr_vgpr: ; GCN: s_waitcnt -; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s34 -; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s33 +; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN-DAG: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s34 offset:8 +; GCN-DAG: buffer_store_dword v33, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8 ; GCN: ;;#ASMSTART ; GCN-NEXT: ; clobber v33 ; GCN-NEXT: ;;#ASMEND -; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload +; GCN: buffer_load_dword v33, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN: s_add_u32 s32, s32, 0x300 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300 -; GCN-NEXT: s_mov_b32 s34, s4 +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { @@ -230,17 +230,17 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { ; Use a copy to a free SGPR instead of introducing a second CSR VGPR. ; GCN-LABEL: {{^}}last_lane_vgpr_for_fp_csr: ; GCN: s_waitcnt -; GCN-NEXT: v_writelane_b32 v1, s34, 63 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v1, s33, 63 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN: buffer_store_dword v33, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-COUNT-63: v_writelane_b32 v1 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset:8 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8 ; GCN: ;;#ASMSTART ; GCN-COUNT-63: v_readlane_b32 s{{[0-9]+}}, v1 ; GCN: s_add_u32 s32, s32, 0x300 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300 -; GCN-NEXT: v_readlane_b32 s34, v1, 63 +; GCN-NEXT: v_readlane_b32 s33, v1, 63 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @last_lane_vgpr_for_fp_csr() #1 { @@ -262,19 +262,19 @@ define void @last_lane_vgpr_for_fp_csr() #1 { ; Use a copy to a free SGPR instead of introducing a second CSR VGPR. 
; GCN-LABEL: {{^}}no_new_vgpr_for_fp_csr: ; GCN: s_waitcnt -; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s34 -; GCN-NEXT: s_mov_b32 s34, s32 -; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill +; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 +; GCN-NEXT: s_mov_b32 s33, s32 +; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-COUNT-64: v_writelane_b32 v1, ; GCN: buffer_store_dword ; GCN: ;;#ASMSTART ; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 -; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload +; GCN: buffer_load_dword v33, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN: s_add_u32 s32, s32, 0x300 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300 -; GCN-NEXT: s_mov_b32 s34, [[FP_COPY]] +; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @no_new_vgpr_for_fp_csr() #1 { @@ -296,13 +296,13 @@ define void @no_new_vgpr_for_fp_csr() #1 { ; GCN-LABEL: {{^}}realign_stack_no_fp_elim: ; GCN: s_waitcnt ; GCN-NEXT: s_add_u32 [[SCRATCH:s[0-9]+]], s32, 0x7ffc0 -; GCN-NEXT: s_mov_b32 s4, s34 -; GCN-NEXT: s_and_b32 s34, [[SCRATCH]], 0xfff80000 +; GCN-NEXT: s_mov_b32 s4, s33 +; GCN-NEXT: s_and_b32 s33, [[SCRATCH]], 0xfff80000 ; GCN-NEXT: s_add_u32 s32, s32, 0x100000 ; GCN-NEXT: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 -; GCN-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s34 +; GCN-NEXT: buffer_store_dword [[ZERO]], off, s[0:3], s33 ; GCN-NEXT: s_sub_u32 s32, s32, 0x100000 -; GCN-NEXT: s_mov_b32 s34, s4 +; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 define void @realign_stack_no_fp_elim() #1 { @@ -313,18 +313,18 @@ define void @realign_stack_no_fp_elim() #1 { ; GCN-LABEL: {{^}}no_unused_non_csr_sgpr_for_fp: ; GCN: s_waitcnt -; GCN-NEXT: v_writelane_b32 v1, s34, 2 +; GCN-NEXT: v_writelane_b32 v1, s33, 2 ; GCN-NEXT: v_writelane_b32 v1, s30, 0 -; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_mov_b32 s33, s32 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0 ; GCN: v_writelane_b32 v1, s31, 1 -; GCN: buffer_store_dword [[ZERO]], off, s[0:3], s34 offset:4 +; GCN: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:4 ; GCN: ;;#ASMSTART ; GCN: v_readlane_b32 s4, v1, 0 ; GCN-NEXT: s_add_u32 s32, s32, 0x200 ; GCN-NEXT: v_readlane_b32 s5, v1, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x200 -; GCN-NEXT: v_readlane_b32 s34, v1, 2 +; GCN-NEXT: v_readlane_b32 s33, v1, 2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[4:5] define void @no_unused_non_csr_sgpr_for_fp() #1 { @@ -347,9 +347,9 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 +; GCN-NEXT: v_writelane_b32 v32, s33, 2 ; GCN-NEXT: v_writelane_b32 v32, s30, 0 -; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-DAG: v_writelane_b32 v32, s31, 1 ; GCN-DAG: buffer_store_dword @@ -360,7 +360,7 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 { ; GCN: v_readlane_b32 s4, v32, 0 ; GCN-NEXT: v_readlane_b32 s5, v32, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x300{{$}} -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s33, v32, 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] @@ -394,9 +394,9 @@ 
define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 ; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 +; GCN-NEXT: v_writelane_b32 v32, s33, 2 ; GCN-NEXT: v_writelane_b32 v32, s30, 0 -; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-DAG: v_writelane_b32 v32, s31, 1 ; GCN-DAG: s_add_u32 s32, s32, 0x40300{{$}} ; GCN-DAG: buffer_store_dword @@ -406,7 +406,7 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 { ; GCN: v_readlane_b32 s4, v32, 0 ; GCN-NEXT: v_readlane_b32 s5, v32, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x40300{{$}} -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s33, v32, 2 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008 ; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload @@ -444,13 +444,13 @@ define internal void @local_empty_func() #0 { ; An FP is needed, despite not needing any spills ; TODO: Ccould see callee does not use stack and omit FP. ; GCN-LABEL: {{^}}ipra_call_with_stack: -; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s34 -; GCN: s_mov_b32 s34, s32 +; GCN: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 +; GCN: s_mov_b32 s33, s32 ; GCN: s_add_u32 s32, s32, 0x400 -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}} ; GCN: s_swappc_b64 ; GCN: s_sub_u32 s32, s32, 0x400 -; GCN: s_mov_b32 s34, [[FP_COPY:s[0-9]+]] +; GCN: s_mov_b32 s33, [[FP_COPY:s[0-9]+]] define void @ipra_call_with_stack() #0 { %alloca = alloca i32, addrspace(5) store volatile i32 0, i32 addrspace(5)* %alloca diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll index 497ea354fc098..023bc6e21276e 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll @@ -243,11 +243,10 @@ define hidden void @use_every_sgpr_input() #1 { } ; GCN-LABEL: {{^}}kern_indirect_use_every_sgpr_input: -; GCN: s_mov_b32 s33, s17 ; GCN: s_mov_b32 s12, s14 ; GCN: s_mov_b32 s13, s15 ; GCN: s_mov_b32 s14, s16 -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN: .amdhsa_user_sgpr_private_segment_buffer 1 diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll index 601ed9698c618..d69f70ffad57c 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs.ll @@ -203,7 +203,7 @@ define hidden void @use_workgroup_id_yz() #1 { ; GCN-NEXT: s_getpc_b64 s[6:7] ; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x@rel32@hi+4 -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_endpgm define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 { @@ -216,9 +216,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 { ; GCN: enable_sgpr_workgroup_id_y = 1 ; GCN: enable_sgpr_workgroup_id_z = 0 -; GCN: s_mov_b32 s33, s8 -; GCN-DAG: s_mov_b32 s4, s7 -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s4, s7 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define 
amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 { call void @use_workgroup_id_y() @@ -229,9 +228,10 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 { ; GCN: enable_sgpr_workgroup_id_x = 1 ; GCN: enable_sgpr_workgroup_id_y = 0 ; GCN: enable_sgpr_workgroup_id_z = 1 -; GCN: s_mov_b32 s33, s8 + ; GCN: s_mov_b32 s4, s7 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 { call void @use_workgroup_id_z() @@ -243,11 +243,10 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 { ; GCN: enable_sgpr_workgroup_id_y = 1 ; GCN: enable_sgpr_workgroup_id_z = 0 -; GCN: s_mov_b32 s33, s8 - ; GCN: s_mov_b32 s5, s7 ; GCN: s_mov_b32 s4, s6 -; GCN: s_mov_b32 s32, s33 + +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 { call void @use_workgroup_id_xy() @@ -259,13 +258,11 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 { ; GCN: enable_sgpr_workgroup_id_y = 1 ; GCN: enable_sgpr_workgroup_id_z = 1 -; GCN: s_mov_b32 s33, s9 - ; GCN: s_mov_b32 s4, s6 ; GCN: s_mov_b32 s5, s7 ; GCN: s_mov_b32 s6, s8 -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 { call void @use_workgroup_id_xyz() @@ -277,12 +274,10 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 { ; GCN: enable_sgpr_workgroup_id_y = 0 ; GCN: enable_sgpr_workgroup_id_z = 1 -; GCN: s_mov_b32 s33, s8 ; GCN: s_mov_b32 s5, s7 ; GCN: s_mov_b32 s4, s6 -; GCN: s_mov_b32 s32, s33 - +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 { call void @use_workgroup_id_xz() @@ -294,10 +289,10 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 { ; GCN: enable_sgpr_workgroup_id_y = 1 ; GCN: enable_sgpr_workgroup_id_z = 1 -; GCN: s_mov_b32 s33, s9 ; GCN: s_mov_b32 s4, s7 ; GCN: s_mov_b32 s5, s8 -; GCN: s_mov_b32 s32, s33 + +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 { call void @use_workgroup_id_yz() @@ -364,10 +359,10 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 { ; GCN: enable_sgpr_workgroup_id_y = 0 ; GCN: enable_sgpr_workgroup_id_z = 0 -; GCN-DAG: s_mov_b32 s33, s7 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b ; GCN-DAG: s_mov_b32 s4, s6 -; GCN-DAG: s_mov_b32 s32, s33 + +; GCN-DAG: s_mov_b32 s32, 0 ; GCN-NOT: s4 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 { @@ -380,11 +375,10 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 { ; GCN: enable_sgpr_workgroup_id_y = 1 ; GCN: enable_sgpr_workgroup_id_z = 0 -; GCN-DAG: s_mov_b32 s33, s8 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b ; GCN-DAG: s_mov_b32 s4, s7 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 { call void @other_arg_use_workgroup_id_y(i32 555) @@ -396,10 +390,9 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 { ; GCN: enable_sgpr_workgroup_id_y = 0 ; GCN: enable_sgpr_workgroup_id_z = 1 -; GCN-DAG: s_mov_b32 s33, s8 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_z() #1 { call void @other_arg_use_workgroup_id_z(i32 555) @@ -465,11 +458,10 @@ define hidden 
void @use_every_sgpr_input() #1 { ; GCN: enable_sgpr_dispatch_id = 1 ; GCN: enable_sgpr_flat_scratch_init = 1 -; GCN: s_mov_b32 s33, s17 ; GCN: s_mov_b32 s12, s14 ; GCN: s_mov_b32 s13, s15 ; GCN: s_mov_b32 s14, s16 -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_indirect_use_every_sgpr_input() #1 { call void @use_every_sgpr_input() @@ -538,7 +530,7 @@ define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 { } ; GCN-LABEL: {{^}}func_use_every_sgpr_input_call_use_workgroup_id_xyz_spill: -; GCN-DAG: s_mov_b32 s34, s32 +; GCN-DAG: s_mov_b32 s33, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400 ; GCN-DAG: s_mov_b64 s{{\[}}[[LO_X:[0-9]+]]{{\:}}[[HI_X:[0-9]+]]{{\]}}, s[4:5] ; GCN-DAG: s_mov_b64 s{{\[}}[[LO_Y:[0-9]+]]{{\:}}[[HI_Y:[0-9]+]]{{\]}}, s[6:7] @@ -558,7 +550,7 @@ define hidden void @func_use_every_sgpr_input_call_use_workgroup_id_xyz() #1 { ; GCN: s_swappc_b64 -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33{{$}} ; GCN-DAG: v_mov_b32_e32 v[[LO1:[0-9]+]], s[[LO_X]] ; GCN-DAG: v_mov_b32_e32 v[[HI1:[0-9]+]], s[[HI_X]] ; GCN-DAG: {{flat|global}}_load_dword v{{[0-9]+}}, v{{\[}}[[LO1]]:[[HI1]]{{\]}} diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll index 421d41294a28d..265024e6bb8ff 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-vgprs.ll @@ -457,15 +457,13 @@ define void @too_many_args_use_workitem_id_x( ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x: ; VARABI: enable_vgpr_workitem_id = 0 -; VARABI: s_mov_b32 s33, s7 -; VARABI: s_mov_b32 s32, s33 +; VARABI: s_mov_b32 s32, 0 ; VARABI: buffer_store_dword v0, off, s[0:3], s32{{$}} ; VARABI: s_swappc_b64 ; FIXEDABI: enable_vgpr_workitem_id = 2 -; FIXEDABI: s_mov_b32 s33, s17 -; FIXEDABI-DAG: s_mov_b32 s32, s33 +; FIXEDABI-DAG: s_mov_b32 s32, 0 ; FIXEDABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x140{{$}} ; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 ; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 @@ -488,7 +486,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x() #1 { } ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x: -; VARABI: s_mov_b32 s34, s32 +; VARABI: s_mov_b32 s33, s32 ; VARABI: buffer_store_dword v1, off, s[0:3], s32{{$}} ; Touching the workitem id register is not necessary. @@ -516,14 +514,14 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 { ; Requires loading and storing to stack slot. 
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x: ; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}} -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_load_dword v32, off, s[0:3], s34{{$}} +; GCN-DAG: buffer_store_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_load_dword v32, off, s[0:3], s33{{$}} ; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}} ; GCN: s_swappc_b64 -; GCN: buffer_load_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_sub_u32 s32, s32, 0x400{{$}} ; GCN: s_setpc_b64 define void @too_many_args_call_too_many_args_use_workitem_id_x( @@ -616,11 +614,10 @@ define void @too_many_args_use_workitem_id_x_byval( ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_byval: ; VARABI: enable_vgpr_workitem_id = 0 -; VARABI-DAG: s_mov_b32 s33, s7 -; VARABI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; VARABI: buffer_store_dword [[K]], off, s[0:3], s33 offset:4 -; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33 offset:4 -; VARABI: s_add_u32 s32, s33, 0x400{{$}} +; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} +; VARABI: buffer_store_dword [[K]], off, s[0:3], 0 offset:4 +; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], 0 offset:4 +; VARABI: s_movk_i32 s32, 0x400{{$}} ; VARABI-NOT: s32 ; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4 @@ -630,16 +627,16 @@ define void @too_many_args_use_workitem_id_x_byval( ; VARABI: s_swappc_b64 -; FIXEDABI: s_mov_b32 s33, s17 -; FIXEDABI-DAG: s_add_u32 s32, s33, 0x400 -; FIXEDABI-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 -; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33 offset:4{{$}} +; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7 +; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], 0 offset:4{{$}} + +; FIXEDABI: s_movk_i32 s32, 0x400{{$}} ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140 ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} ; FIXME: Why this reload? 
-; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], s33 offset:4{{$}} +; FIXEDABI: buffer_load_dword [[RELOAD:v[0-9]+]], off, s[0:3], 0 offset:4{{$}} ; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 ; FIXEDABI-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 @@ -667,8 +664,8 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 ; GCN-LABEL: {{^}}func_call_too_many_args_use_workitem_id_x_byval: ; VARABI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}} -; VARABI: buffer_store_dword [[K]], off, s[0:3], s34{{$}} -; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}} +; VARABI: buffer_store_dword [[K]], off, s[0:3], s33{{$}} +; VARABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} ; VARABI: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; VARABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32{{$}} ; VARABI: v_mov_b32_e32 [[RELOAD_BYVAL]], @@ -677,11 +674,11 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_byval() #1 ; FIXED-ABI-NOT: v31 ; FIXEDABI: v_mov_b32_e32 [[K0:v[0-9]+]], 0x3e7{{$}} -; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s34{{$}} +; FIXEDABI: buffer_store_dword [[K0]], off, s[0:3], s33{{$}} ; FIXEDABI: v_mov_b32_e32 [[K1:v[0-9]+]], 0x140{{$}} ; FIXEDABI: buffer_store_dword [[K1]], off, s[0:3], s32{{$}} -; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s34{{$}} +; FIXEDABI: buffer_load_dword [[RELOAD_BYVAL:v[0-9]+]], off, s[0:3], s33{{$}} ; FIXED-ABI-NOT: v31 ; FIXEDABI: buffer_store_dword [[RELOAD_BYVAL]], off, s[0:3], s32 offset:4{{$}} @@ -789,9 +786,7 @@ define void @too_many_args_use_workitem_id_xyz( ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_xyz: ; GCN: enable_vgpr_workitem_id = 2 -; VARABI-DAG: s_mov_b32 s33, s7 -; FIXEDABI-DAG: s_mov_b32 s33, s17 -; GCN-DAG: s_mov_b32 s32, s33 +; GCN-DAG: s_mov_b32 s32, 0 ; GCN-DAG: v_lshlrev_b32_e32 [[TMP1:v[0-9]+]], 10, v1 ; GCN-DAG: v_lshlrev_b32_e32 [[TMP0:v[0-9]+]], 20, v2 @@ -885,16 +880,13 @@ define void @too_many_args_use_workitem_id_x_stack_yz( ; GCN-LABEL: {{^}}kern_call_too_many_args_use_workitem_id_x_stack_yz: ; GCN: enable_vgpr_workitem_id = 2 -; VARABI: s_mov_b32 s33, s7 -; FIXEDABI: s_mov_b32 s33, s17 - ; GCN-NOT: v0 ; GCN-DAG: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-DAG: v_or_b32_e32 v0, v0, v1 ; GCN-DAG: v_lshlrev_b32_e32 v2, 20, v2 ; GCN-DAG: v_or_b32_e32 v31, v0, v2 -; GCN: s_mov_b32 s32, s33 +; GCN: s_mov_b32 s32, 0 ; GCN: s_swappc_b64 define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_x_stack_yz() #1 { call void @too_many_args_use_workitem_id_x_stack_yz( diff --git a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll index a091811fc7c1c..0878205a53fde 100644 --- a/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll +++ b/llvm/test/CodeGen/AMDGPU/captured-frame-index.ll @@ -28,8 +28,8 @@ define amdgpu_kernel void @stored_fi_to_lds(float addrspace(5)* addrspace(3)* %p ; Offset is applied ; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects: ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} ; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]] @@ -51,9 +51,9 
@@ define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float addrspace(5)* ; Same frame index is used multiple times in the store ; GCN-LABEL: {{^}}stored_fi_to_self: ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}} -; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}} -; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} define amdgpu_kernel void @stored_fi_to_self() #0 { %tmp = alloca i32 addrspace(5)*, addrspace(5) @@ -66,13 +66,13 @@ define amdgpu_kernel void @stored_fi_to_self() #0 { ; GCN-LABEL: {{^}}stored_fi_to_self_offset: ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}} -; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}} -; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}} +; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}} ; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}} -; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}} +; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2052{{$}} define amdgpu_kernel void @stored_fi_to_self_offset() #0 { %tmp0 = alloca [512 x i32], addrspace(5) %tmp1 = alloca i32 addrspace(5)*, addrspace(5) @@ -89,15 +89,15 @@ define amdgpu_kernel void @stored_fi_to_self_offset() #0 { } ; GCN-LABEL: {{^}}stored_fi_to_fi: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}} ; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} -; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}} +; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}} ; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}} -; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} +; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} define amdgpu_kernel void @stored_fi_to_fi() #0 { %tmp0 = alloca i32 addrspace(5)*, addrspace(5) %tmp1 = alloca i32 addrspace(5)*, addrspace(5) @@ -115,7 +115,7 @@ define amdgpu_kernel void @stored_fi_to_fi() #0 { } ; GCN-LABEL: {{^}}stored_fi_to_global: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}} ; GCN: buffer_store_dword [[FI]] define amdgpu_kernel void @stored_fi_to_global(float addrspace(5)* addrspace(1)* %ptr) #0 { @@ -127,9 +127,9 @@ define amdgpu_kernel void @stored_fi_to_global(float addrspace(5)* addrspace(1)* ; Offset is applied ; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects: -; GCN: 
buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12{{$}} ; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}} ; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} @@ -150,7 +150,7 @@ define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float addrspace(5 ; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset: ; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}} -; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}} +; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; FIXME: Re-initialize ; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 4{{$}} @@ -160,7 +160,7 @@ define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float addrspace(5 ; GCN: v_add_i32_e32 [[BASE_1_OFF_2:v[0-9]+]], vcc, 56, [[BASE_0_1]] -; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} +; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} ; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32 addrspace(5)* addrspace(1)* %ptr) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll new file mode 100644 index 0000000000000..edd65cbf79c98 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -0,0 +1,422 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck --check-prefix=GFX803 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX900 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefix=GFX1010 %s + +define amdgpu_kernel void @test_kern_empty() local_unnamed_addr #0 { +; GFX803-LABEL: test_kern_empty: +; GFX803: ; %bb.0: ; %entry +; GFX803-NEXT: s_endpgm +; +; GFX900-LABEL: test_kern_empty: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_endpgm +; +; GFX1010-LABEL: test_kern_empty: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_endpgm +entry: + ret void +} + +define amdgpu_kernel void @test_kern_stack() local_unnamed_addr #0 { +; GFX803-LABEL: test_kern_stack: +; GFX803: ; %bb.0: ; %entry +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: v_mov_b32_e32 v0, 0 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX803-NEXT: s_endpgm +; +; GFX900-LABEL: test_kern_stack: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX900-NEXT: s_endpgm +; +; GFX1010-LABEL: test_kern_stack: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; 
GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: ; implicit-def: $vcc_hi +; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX1010-NEXT: s_endpgm +entry: + %x = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %x, align 4 + ret void +} + +define amdgpu_kernel void @test_kern_call() local_unnamed_addr #0 { +; GFX803-LABEL: test_kern_call: +; GFX803: ; %bb.0: ; %entry +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: s_getpc_b64 s[4:5] +; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX803-NEXT: s_mov_b32 s32, 0 +; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX803-NEXT: s_endpgm +; +; GFX900-LABEL: test_kern_call: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_getpc_b64 s[4:5] +; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX900-NEXT: s_mov_b32 s32, 0 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX900-NEXT: s_endpgm +; +; GFX1010-LABEL: test_kern_call: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_mov_b32 s32, 0 +; GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_getpc_b64 s[4:5] +; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX1010-NEXT: ; implicit-def: $vcc_hi +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1010-NEXT: s_endpgm +entry: + tail call void @ex() #0 + ret void +} + +define amdgpu_kernel void @test_kern_stack_and_call() local_unnamed_addr #0 { +; GFX803-LABEL: test_kern_stack_and_call: +; GFX803: ; %bb.0: ; %entry +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: v_mov_b32_e32 v0, 0 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: s_getpc_b64 s[4:5] +; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX803-NEXT: s_movk_i32 s32, 0x400 +; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX803-NEXT: s_endpgm +; +; GFX900-LABEL: test_kern_stack_and_call: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_getpc_b64 s[4:5] +; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX900-NEXT: s_movk_i32 s32, 0x400 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX900-NEXT: s_endpgm +; +; GFX1010-LABEL: 
test_kern_stack_and_call: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_movk_i32 s32, 0x200 +; GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: s_getpc_b64 s[4:5] +; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX1010-NEXT: ; implicit-def: $vcc_hi +; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1010-NEXT: s_endpgm +entry: + %x = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %x, align 4 + tail call void @ex() #0 + ret void +} + +define amdgpu_kernel void @test_force_fp_kern_empty() local_unnamed_addr #2 { +; GFX803-LABEL: test_force_fp_kern_empty: +; GFX803: ; %bb.0: ; %entry +; GFX803-NEXT: s_mov_b32 s33, 0 +; GFX803-NEXT: s_endpgm +; +; GFX900-LABEL: test_force_fp_kern_empty: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_mov_b32 s33, 0 +; GFX900-NEXT: s_endpgm +; +; GFX1010-LABEL: test_force_fp_kern_empty: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_mov_b32 s33, 0 +; GFX1010-NEXT: s_endpgm +entry: + ret void +} + +define amdgpu_kernel void @test_force_fp_kern_stack() local_unnamed_addr #2 { +; GFX803-LABEL: test_force_fp_kern_stack: +; GFX803: ; %bb.0: ; %entry +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_mov_b32 s33, 0 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: v_mov_b32_e32 v0, 0 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX803-NEXT: s_endpgm +; +; GFX900-LABEL: test_force_fp_kern_stack: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_mov_b32 s33, 0 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX900-NEXT: s_endpgm +; +; GFX1010-LABEL: test_force_fp_kern_stack: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_mov_b32 s33, 0 +; GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: ; implicit-def: $vcc_hi +; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX1010-NEXT: s_endpgm +entry: + %x = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %x, align 4 + ret void +} + +define amdgpu_kernel void @test_force_fp_kern_call() local_unnamed_addr #2 { +; GFX803-LABEL: test_force_fp_kern_call: +; GFX803: ; %bb.0: ; %entry +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: s_getpc_b64 s[4:5] +; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX803-NEXT: s_mov_b32 s32, 0 +; GFX803-NEXT: s_mov_b32 s33, 0 +; GFX803-NEXT: s_swappc_b64 s[30:31], 
s[4:5] +; GFX803-NEXT: s_endpgm +; +; GFX900-LABEL: test_force_fp_kern_call: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_getpc_b64 s[4:5] +; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX900-NEXT: s_mov_b32 s32, 0 +; GFX900-NEXT: s_mov_b32 s33, 0 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX900-NEXT: s_endpgm +; +; GFX1010-LABEL: test_force_fp_kern_call: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_mov_b32 s32, 0 +; GFX1010-NEXT: s_mov_b32 s33, 0 +; GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_getpc_b64 s[4:5] +; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX1010-NEXT: ; implicit-def: $vcc_hi +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1010-NEXT: s_endpgm +entry: + tail call void @ex() #2 + ret void +} + +define amdgpu_kernel void @test_force_fp_kern_stack_and_call() local_unnamed_addr #2 { +; GFX803-LABEL: test_force_fp_kern_stack_and_call: +; GFX803: ; %bb.0: ; %entry +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_mov_b32 s33, 0 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: v_mov_b32_e32 v0, 0 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: s_getpc_b64 s[4:5] +; GFX803-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX803-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX803-NEXT: s_movk_i32 s32, 0x400 +; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX803-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX803-NEXT: s_endpgm +; +; GFX900-LABEL: test_force_fp_kern_stack_and_call: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: s_mov_b32 s33, 0 +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: s_getpc_b64 s[4:5] +; GFX900-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX900-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX900-NEXT: s_movk_i32 s32, 0x400 +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX900-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX900-NEXT: s_endpgm +; +; GFX1010-LABEL: test_force_fp_kern_stack_and_call: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_movk_i32 s32, 0x200 +; GFX1010-NEXT: s_mov_b32 s33, 0 +; GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: v_mov_b32_e32 v0, 0 +; GFX1010-NEXT: s_getpc_b64 s[4:5] +; GFX1010-NEXT: s_add_u32 s4, s4, ex@rel32@lo+4 +; GFX1010-NEXT: s_addc_u32 s5, s5, ex@rel32@hi+4 +; GFX1010-NEXT: ; implicit-def: $vcc_hi +; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 +; GFX1010-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX1010-NEXT: s_endpgm +entry: + %x = alloca i32, align 4, addrspace(5) + store volatile i32 0, i32 addrspace(5)* %x, align 4 + tail call void 
@ex() #2 + ret void +} + +define amdgpu_kernel void @test_sgpr_offset_kernel() #1 { +; GFX803-LABEL: test_sgpr_offset_kernel: +; GFX803: ; %bb.0: ; %entry +; GFX803-NEXT: s_add_u32 s4, s4, s7 +; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; GFX803-NEXT: s_add_u32 s0, s0, s7 +; GFX803-NEXT: s_addc_u32 s1, s1, 0 +; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; GFX803-NEXT: s_mov_b32 s4, 0x40000 +; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s5 +; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], s4 ; 4-byte Folded Spill +; GFX803-NEXT: ;;#ASMSTART +; GFX803-NEXT: ;;#ASMEND +; GFX803-NEXT: s_mov_b32 s4, 0x40000 +; GFX803-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload +; GFX803-NEXT: s_waitcnt vmcnt(0) +; GFX803-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; GFX803-NEXT: s_endpgm +; +; GFX900-LABEL: test_sgpr_offset_kernel: +; GFX900: ; %bb.0: ; %entry +; GFX900-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX900-NEXT: s_add_u32 s0, s0, s7 +; GFX900-NEXT: s_addc_u32 s1, s1, 0 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; GFX900-NEXT: s_mov_b32 s6, 0x40000 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill +; GFX900-NEXT: ;;#ASMSTART +; GFX900-NEXT: ;;#ASMEND +; GFX900-NEXT: s_mov_b32 s6, 0x40000 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; GFX900-NEXT: s_endpgm +; +; GFX1010-LABEL: test_sgpr_offset_kernel: +; GFX1010: ; %bb.0: ; %entry +; GFX1010-NEXT: s_add_u32 s4, s4, s7 +; GFX1010-NEXT: s_addc_u32 s5, s5, 0 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4 +; GFX1010-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5 +; GFX1010-NEXT: s_add_u32 s0, s0, s7 +; GFX1010-NEXT: s_addc_u32 s1, s1, 0 +; GFX1010-NEXT: s_mov_b32 s6, 0x20000 +; GFX1010-NEXT: ; implicit-def: $vcc_hi +; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 +; GFX1010-NEXT: s_waitcnt vmcnt(0) +; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill +; GFX1010-NEXT: v_nop +; GFX1010-NEXT: s_mov_b32 s6, 0x20000 +; GFX1010-NEXT: ;;#ASMSTART +; GFX1010-NEXT: ;;#ASMEND +; GFX1010-NEXT: buffer_load_dword v0, off, s[0:3], s6 ; 4-byte Folded Reload +; GFX1010-NEXT: s_waitcnt vmcnt(0) +; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; GFX1010-NEXT: s_endpgm +entry: + ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not + ; fit in the instruction, and has to live in the SGPR offset. 
+ %alloca = alloca i8, i32 4092, align 4, addrspace(5) + %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* + + %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 + ; 0x40000 / 64 = 4096 (for wave64) + ; CHECK: s_add_u32 s6, s7, 0x40000 + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill + %a = load volatile i32, i32 addrspace(5)* %aptr + + ; Force %a to spill + call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}" () + + %outptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 + store volatile i32 %a, i32 addrspace(5)* %outptr + + ret void +} + +declare hidden void @ex() local_unnamed_addr #0 + +attributes #0 = { nounwind } +attributes #1 = { nounwind "amdgpu-num-vgpr"="8" } +attributes #2 = { nounwind "frame-pointer"="all" } diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll index 1a3cc72fe5b2c..a2fb893af5404 100644 --- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll +++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes.ll @@ -135,8 +135,8 @@ done: ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32: ; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}} -; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}} +; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4092{{$}} ; GCN: {{^}}BB4_2: define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: @@ -174,9 +174,9 @@ done: ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved: ; GCN: s_and_saveexec_b64 ; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4 -; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}} ; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4 -; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen offset:4092{{$}} ; GCN: {{^BB[0-9]+}}_2: define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { @@ -213,8 +213,8 @@ done: ; GCN-LABEL: {{^}}test_no_sink_scratch_large_offset_i32: ; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}} +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} ; GCN: {{^BB[0-9]+}}_2: define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll index 0df32537808ac..a85cdcc01922d 100644 --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -5,9 +5,9 @@ define <2 x half> @chain_hi_to_lo_private() { ; GCN-LABEL: chain_hi_to_lo_private: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:2 +; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s33 +; GCN-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: @@ -26,9 +26,9 @@ define <2 x half> @chain_hi_to_lo_private_different_bases(half addrspace(5)* %ba ; GCN-LABEL: chain_hi_to_lo_private_different_bases: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen +; GCN-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], s33 offen +; GCN-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: @@ -46,7 +46,7 @@ define <2 x half> @chain_hi_to_lo_arithmatic(half addrspace(5)* %base, half %in) ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s33 offen +; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -196,6 +196,8 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly % ; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v2, s4 ; GCN-NEXT: v_mov_b32_e32 v3, s5 @@ -203,20 +205,20 @@ define amdgpu_kernel void @vload2_private(i16 addrspace(1)* nocapture readonly % ; GCN-NEXT: v_mov_b32_e32 v0, s6 ; GCN-NEXT: v_mov_b32_e32 v1, s7 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:4 +; GCN-NEXT: buffer_store_short v4, off, s[0:3], 0 offset:4 ; GCN-NEXT: global_load_ushort v4, v[2:3], off offset:2 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, off, s[0:3], s9 offset:6 +; GCN-NEXT: buffer_store_short v4, off, s[0:3], 0 offset:6 ; GCN-NEXT: global_load_ushort v2, v[2:3], off offset:4 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, off, s[0:3], s9 offset:8 -; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], s9 offset:4 -; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], s9 offset:6 +; GCN-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:8 +; GCN-NEXT: buffer_load_ushort v2, off, s[0:3], 0 offset:4 +; GCN-NEXT: buffer_load_ushort v4, off, s[0:3], 0 offset:6 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v3, v4 -; GCN-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], s9 offset:8 +; GCN-NEXT: buffer_load_short_d16_hi v3, off, s[0:3], 0 offset:8 ; GCN-NEXT: v_lshl_or_b32 v2, v4, 16, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_store_dwordx2 v[0:1], v[2:3], off @@ -298,10 +300,10 @@ define <2 x i16> @chain_hi_to_lo_private_other_dep(i16 addrspace(5)* %ptr) { ; GCN-LABEL: chain_hi_to_lo_private_other_dep: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s33 offen +; GCN-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_pk_sub_u16 v1, v1, -12 
op_sel_hi:[1,0] -; GCN-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s33 offen offset:2 +; GCN-NEXT: buffer_load_short_d16 v1, v0, s[0:3], 0 offen offset:2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll index 1af2ca55308b6..b86db5a7ac689 100644 --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -225,7 +225,7 @@ bb.end: ; preds = %bb.then, %bb ; GCN: s_andn2_b64 exec, exec, ; GCN-NEXT: s_cbranch_execnz [[BB1_LOOP]] -; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen +; GCN: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0 offen ; GCN: s_and_saveexec_b64 [[SAVEEXEC_OUTER]], {{vcc|s\[[0-9:]+\]}} ; GCN-NEXT: s_cbranch_execz [[BB1_OUTER_LOOP]] diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll index f144ed263ff39..2c1074ae62fcd 100644 --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -22,16 +22,16 @@ ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, [[CMP0]] ; Spill load -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:20 ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:24 ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} @@ -40,13 +40,13 @@ ; GCN: ; %bb.{{[0-9]+}}: ; %if ; GCN: s_mov_b32 m0, -1 ; GCN: ds_read_b32 [[LOAD1:v[0-9]+]] -; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload +; GCN: buffer_load_dword [[RELOAD_LOAD0:v[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; GCN: s_waitcnt vmcnt(0) lgkmcnt(0) ; Spill val register ; GCN: v_add_i32_e32 [[VAL:v[0-9]+]], vcc, [[LOAD1]], [[RELOAD_LOAD0]] -; GCN: buffer_store_dword [[VAL]], off, s[0:3], s7 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[VAL]], off, s[0:3], 0 offset:[[VAL_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; VMEM: [[ENDIF]]: @@ -56,18 +56,18 @@ -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:20 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload +; VMEM: buffer_load_dword 
v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:24 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}} ; Restore val -; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload +; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]] define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 { @@ -102,7 +102,7 @@ endif: ; GCN: s_and_b64 s{{\[}}[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]{{\]}}, s{{\[}}[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]{{\]}}, [[CMP0]] ; Spill load -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] @@ -110,9 +110,9 @@ endif: ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:24 ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:28 ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:28 ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}} @@ -120,10 +120,10 @@ endif: ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: -; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]] ; GCN: s_cmp_lg_u32 -; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[VAL_LOOP]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN-NEXT: s_cbranch_scc1 [[LOOP]] @@ -131,16 +131,16 @@ endif: ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:24 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:28 ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:28 ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}} -; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload +; GCN: buffer_load_dword 
v[[VAL_END:[0-9]+]], off, s[0:3], 0 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]] define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 { @@ -179,16 +179,16 @@ end: ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}} ; Spill load -; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], 0 offset:[[SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_mov_b64 exec, [[CMP0]] @@ -201,18 +201,18 @@ end: ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_LO_OFFSET]] +; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_LO_OFFSET]] ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[FLOW_V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:[[SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[FLOW_V_RELOAD_SAVEEXEC_HI]] ; GCN: s_or_saveexec_b64 s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}}, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}} ; Regular spill value restored after exec modification -; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload +; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; Spill saved exec @@ -221,26 +221,26 @@ end: ; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_LO:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_LO]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_LO_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; VMEM: v_mov_b32_e32 v[[FLOW_V_SAVEEXEC_HI:[0-9]+]], s[[FLOW_S_RELOAD_SAVEEXEC_HI]] -; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; VMEM: buffer_store_dword v[[FLOW_V_SAVEEXEC_HI]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET:[0-9]+]] ; 4-byte Folded 
Spill -; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], s7 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[FLOW_VAL]], off, s[0:3], 0 offset:[[RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_xor_b64 exec, exec, s{{\[}}[[FLOW_S_RELOAD_SAVEEXEC_LO]]:[[FLOW_S_RELOAD_SAVEEXEC_HI]]{{\]}} ; GCN-NEXT: s_cbranch_execz [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN: ; %bb.{{[0-9]+}}: ; %if ; GCN: ds_read_b32 -; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]] -; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill ; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]] ; GCN: [[ELSE]]: ; %else -; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], 0 offset:[[LOAD0_OFFSET]] ; 4-byte Folded Reload ; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]] -; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill +; GCN: buffer_store_dword [[ADD]], off, s[0:3], 0 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN-NEXT: s_branch [[FLOW]] ; GCN: [[ENDIF]]: @@ -248,17 +248,17 @@ end: ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[FLOW_SAVEEXEC_HI_LANE]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_LO_OFFSET]] ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]] -; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload +; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], 0 offset:[[FLOW_SAVEEXEC_HI_OFFSET]] ; 4-byte Folded Reload ; VMEM: s_waitcnt vmcnt(0) ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]] ; GCN: s_or_b64 exec, exec, s{{\[}}[[S_RELOAD_SAVEEXEC_LO]]:[[S_RELOAD_SAVEEXEC_HI]]{{\]}} -; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload +; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], 0 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]] define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll index 019beea0ab175..007ca13d53c9d 100644 --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -30,9 +30,9 @@ define float @call_split_type_used_outside_block_v2f32() #0 { ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 +; GCN-NEXT: v_writelane_b32 v32, s33, 2 ; 
GCN-NEXT: v_writelane_b32 v32, s30, 0 -; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4 @@ -42,7 +42,7 @@ define float @call_split_type_used_outside_block_v2f32() #0 { ; GCN-NEXT: v_readlane_b32 s4, v32, 0 ; GCN-NEXT: v_readlane_b32 s5, v32, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s33, v32, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] @@ -64,9 +64,9 @@ define float @call_split_type_used_outside_block_v3f32() #0 { ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 +; GCN-NEXT: v_writelane_b32 v32, s33, 2 ; GCN-NEXT: v_writelane_b32 v32, s30, 0 -; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4 @@ -76,7 +76,7 @@ define float @call_split_type_used_outside_block_v3f32() #0 { ; GCN-NEXT: v_readlane_b32 s4, v32, 0 ; GCN-NEXT: v_readlane_b32 s5, v32, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s33, v32, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] @@ -98,9 +98,9 @@ define half @call_split_type_used_outside_block_v4f16() #0 { ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 +; GCN-NEXT: v_writelane_b32 v32, s33, 2 ; GCN-NEXT: v_writelane_b32 v32, s30, 0 -; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4 @@ -110,7 +110,7 @@ define half @call_split_type_used_outside_block_v4f16() #0 { ; GCN-NEXT: v_readlane_b32 s4, v32, 0 ; GCN-NEXT: v_readlane_b32 s5, v32, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s33, v32, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] @@ -132,9 +132,9 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 { ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 +; GCN-NEXT: v_writelane_b32 v32, s33, 2 ; GCN-NEXT: v_writelane_b32 v32, s30, 0 -; GCN-NEXT: s_mov_b32 s34, s32 +; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_u32 s32, s32, 0x400 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4 @@ -145,7 +145,7 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 { ; GCN-NEXT: v_readlane_b32 s5, v32, 1 ; GCN-NEXT: v_mov_b32_e32 v1, v4 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s33, v32, 2 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] 
@@ -169,14 +169,15 @@ define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 { ; GCN-LABEL: v3i16_registers: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_mov_b32 s33, s9 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s4, 1, s4 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1 ; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_cbranch_vccz BB4_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_mov_b32 s4, 0 @@ -213,14 +214,15 @@ define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 { ; GCN-LABEL: v3f16_registers: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_load_dword s4, s[4:5], 0x0 -; GCN-NEXT: s_mov_b32 s33, s9 -; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s33 +; GCN-NEXT: s_add_u32 flat_scratch_lo, s6, s9 ; GCN-NEXT: s_addc_u32 flat_scratch_hi, s7, 0 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_add_u32 s0, s0, s9 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_and_b32 s4, 1, s4 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1 ; GCN-NEXT: s_and_b64 vcc, exec, s[4:5] +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_cbranch_vccz BB5_2 ; GCN-NEXT: ; %bb.1: ; GCN-NEXT: s_mov_b32 s4, 0 diff --git a/llvm/test/CodeGen/AMDGPU/extload-private.ll b/llvm/test/CodeGen/AMDGPU/extload-private.ll index f119af24038da..be8c3b5a44b58 100644 --- a/llvm/test/CodeGen/AMDGPU/extload-private.ll +++ b/llvm/test/CodeGen/AMDGPU/extload-private.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}load_i8_sext_private: -; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} define amdgpu_kernel void @load_i8_sext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i8, addrspace(5) @@ -13,7 +13,7 @@ entry: } ; FUNC-LABEL: {{^}}load_i8_zext_private: -; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} define amdgpu_kernel void @load_i8_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i8, addrspace(5) @@ -24,7 +24,7 @@ entry: } ; FUNC-LABEL: {{^}}load_i16_sext_private: -; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} define amdgpu_kernel void @load_i16_sext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16, addrspace(5) @@ -35,7 +35,7 @@ entry: } ; FUNC-LABEL: {{^}}load_i16_zext_private: -; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4{{$}} define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) { entry: %tmp0 = alloca i16, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll index 6cdd03b754911..85f9ea173eb5e 100644 --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.private.ll @@ -9,8 +9,8 
@@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 { ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0 -; GFX7-ALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen +; GFX7-ALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 @@ -20,8 +20,8 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 { ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0 -; GFX7-UNALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen -; GFX7-UNALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], s33 offen +; GFX7-UNALIGNED-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen +; GFX7-UNALIGNED-NEXT: buffer_load_ushort v1, v1, s[0:3], 0 offen ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-UNALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 @@ -30,8 +30,8 @@ define i32 @private_load_2xi16_align2(i16 addrspace(5)* %p) #0 { ; GFX9-LABEL: private_load_2xi16_align2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], s33 offen -; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], s33 offen offset:2 +; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen +; GFX9-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -53,8 +53,8 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)* ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 2 ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1 -; GFX7-ALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], s33 offen +; GFX7-ALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -64,8 +64,8 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)* ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v3, 1 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 2 ; GFX7-UNALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1 -; GFX7-UNALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], s33 offen -; GFX7-UNALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], s33 offen +; GFX7-UNALIGNED-NEXT: buffer_store_short v3, v1, s[0:3], 0 offen +; GFX7-UNALIGNED-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -73,9 +73,9 @@ define void @private_store_2xi16_align2(i16 addrspace(5)* %p, i16 addrspace(5)* ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 1 -; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], s33 offen -; GFX9-NEXT: v_mov_b32_e32 v0, 2 -; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], s33 offen offset:2 +; GFX9-NEXT: v_mov_b32_e32 v2, 2 +; GFX9-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen +; GFX9-NEXT: buffer_store_short v2, v1, s[0:3], 0 offen offset:2 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] 
%gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 @@ -89,36 +89,35 @@ define i32 @private_load_2xi16_align1(i16 addrspace(5)* %p) #0 { ; GFX7-ALIGNED-LABEL: private_load_2xi16_align1: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0 -; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], s33 offen ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 2, v0 -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 3, v0 -; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v3, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(3) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 1, v0 +; GFX7-ALIGNED-NEXT: buffer_load_ubyte v3, v0, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v0, vcc, 3, v0 +; GFX7-ALIGNED-NEXT: buffer_load_ubyte v0, v0, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_load_ubyte v2, v2, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_load_ubyte v1, v1, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX7-ALIGNED-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0 -; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX7-ALIGNED-NEXT: v_or_b32_e32 v2, v2, v3 +; GFX7-ALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-ALIGNED-NEXT: v_or_b32_e32 v0, v2, v0 ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: private_load_2xi16_align1: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen +; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: private_load_2xi16_align1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen +; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -141,15 +140,15 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)* ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 1 -; GFX7-ALIGNED-NEXT: buffer_store_byte v3, v1, s[0:3], s33 offen ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v2, vcc, 2, v1 -; GFX7-ALIGNED-NEXT: v_add_i32_e32 v3, vcc, 1, v1 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v4, 0 +; GFX7-ALIGNED-NEXT: v_add_i32_e32 v4, vcc, 1, v1 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v5, 0 +; GFX7-ALIGNED-NEXT: buffer_store_byte v3, v1, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_store_byte v5, v4, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: v_add_i32_e32 v1, vcc, 3, v1 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, 2 -; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v3, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: buffer_store_byte v4, v1, s[0:3], s33 offen -; GFX7-ALIGNED-NEXT: buffer_store_byte v0, v2, s[0:3], s33 offen +; GFX7-ALIGNED-NEXT: buffer_store_byte v5, v1, s[0:3], 0 offen +; GFX7-ALIGNED-NEXT: buffer_store_byte v0, v2, s[0:3], 0 offen 
; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -157,7 +156,7 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)* ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x20001 -; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; GFX7-UNALIGNED-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; @@ -165,7 +164,7 @@ define void @private_store_2xi16_align1(i16 addrspace(5)* %p, i16 addrspace(5)* ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 -; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; GFX9-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 @@ -186,21 +185,21 @@ define i32 @private_load_2xi16_align4(i16 addrspace(5)* %p) #0 { ; GFX7-ALIGNED-LABEL: private_load_2xi16_align4: ; GFX7-ALIGNED: ; %bb.0: ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-ALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen +; GFX7-ALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX7-ALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-ALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: private_load_2xi16_align4: ; GFX7-UNALIGNED: ; %bb.0: ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen +; GFX7-UNALIGNED-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0) ; GFX7-UNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: private_load_2xi16_align4: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen +; GFX9-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff ; GFX9-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -233,7 +232,7 @@ define void @private_store_2xi16_align4(i16 addrspace(5)* %p, i16 addrspace(5)* ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, 0x20001 -; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen +; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %gep.r = getelementptr i16, i16 addrspace(5)* %r, i64 1 diff --git a/llvm/test/CodeGen/AMDGPU/fma-combine.ll b/llvm/test/CodeGen/AMDGPU/fma-combine.ll index a962a3b4ed06c..5d78ddada14d6 100644 --- a/llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -2,6 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs + ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be ; beneficial even without fp32 denormals, but they do 
require no-infs-fp-math ; for correctness. @@ -376,9 +378,10 @@ define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace %u = load volatile double, double addrspace(1)* %gep.3 %v = load volatile double, double addrspace(1)* %gep.4 - %tmp0 = fmul double %u, %v - %tmp1 = call double @llvm.fma.f64(double %y, double %z, double %tmp0) #0 - %tmp2 = fsub double %x, %tmp1 + ; nsz flag is needed since this combine may change sign of zero + %tmp0 = fmul nsz double %u, %v + %tmp1 = call nsz double @llvm.fma.f64(double %y, double %z, double %tmp0) #0 + %tmp2 = fsub nsz double %x, %tmp1 store double %tmp2, double addrspace(1)* %gep.out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir b/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir index f80176508befe..cf4fd49c3d60a 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-fi-mubuf.mir @@ -1,8 +1,9 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination %s -o - | FileCheck -check-prefix=GCN %s +# Kernels can have no FP --- -name: no_fold_fi_non_stack_rsrc_soffset +name: kernel_no_fold_fi_non_stack_rsrc_and_soffset tracksRegLiveness: true frameInfo: maxAlignment: 4 @@ -12,14 +13,12 @@ stack: machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' - scratchWaveOffsetReg: '$sgpr6' - frameOffsetReg: '$sgpr6' - stackPtrOffsetReg: '$sgpr6' + stackPtrOffsetReg: '$sgpr32' body: | bb.0: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc_soffset + ; GCN-LABEL: name: kernel_no_fold_fi_non_stack_rsrc_and_soffset ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -36,7 +35,7 @@ body: | ... --- -name: no_fold_fi_non_stack_rsrc +name: kernel_no_fold_fi_non_stack_rsrc tracksRegLiveness: true frameInfo: maxAlignment: 4 @@ -46,14 +45,12 @@ stack: machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' - scratchWaveOffsetReg: '$sgpr6' - frameOffsetReg: '$sgpr6' stackPtrOffsetReg: '$sgpr32' body: | bb.0: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 - ; GCN-LABEL: name: no_fold_fi_non_stack_rsrc + ; GCN-LABEL: name: kernel_no_fold_fi_non_stack_rsrc ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec @@ -68,9 +65,8 @@ body: | ... -# Offset is from global scratch wave offset. 
--- -name: fold_fi_mubuf_scratch_scratch_wave_offset +name: kernel_no_fold_fi_non_stack_soffset tracksRegLiveness: true frameInfo: maxAlignment: 4 @@ -80,12 +76,44 @@ stack: machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - scratchWaveOffsetReg: '$sgpr33' stackPtrOffsetReg: '$sgpr32' body: | bb.0: - ; GCN-LABEL: name: fold_fi_mubuf_scratch_scratch_wave_offset + ; GCN-LABEL: name: kernel_no_fold_fi_non_stack_soffset + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GCN: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + %2:sreg_32_xm0 = S_MOV_B32 0 + + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, %2, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = COPY %3 + S_ENDPGM 0, implicit $vgpr0 + +... + +--- +name: kernel_fold_fi_mubuf +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 4 +stack: + - { id: 0, size: 4, alignment: 4, local-offset: 0 } +machineFunctionInfo: + isEntryFunction: true + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + + ; GCN-LABEL: name: kernel_fold_fi_mubuf ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec @@ -94,15 +122,17 @@ body: | %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, 0, implicit $exec - %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %2 S_ENDPGM 0, implicit $vgpr0 ... 
+ +# Functions have an unswizzled SP/FP relative to the wave offset --- -name: no_fold_fi_mubuf_scratch_sp_offset +name: function_no_fold_fi_non_stack_rsrc_and_soffset tracksRegLiveness: true frameInfo: maxAlignment: 4 @@ -110,14 +140,143 @@ frameInfo: stack: - { id: 0, size: 4, alignment: 4, local-offset: 0 } machineFunctionInfo: - isEntryFunction: true + isEntryFunction: false + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + frameOffsetReg: '$sgpr32' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + + ; GCN-LABEL: name: function_no_fold_fi_non_stack_rsrc_and_soffset + ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] + ; GCN: SI_RETURN_TO_EPILOG $vgpr0 + %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 + %1:sreg_32_xm0 = S_MOV_B32 0 + %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = COPY %3 + SI_RETURN_TO_EPILOG $vgpr0 + +... + +--- +name: function_no_fold_fi_non_stack_rsrc +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 4 +stack: + - { id: 0, size: 4, alignment: 4, local-offset: 0 } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + frameOffsetReg: '$sgpr32' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + + ; GCN-LABEL: name: function_no_fold_fi_non_stack_rsrc + ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] + ; GCN: SI_RETURN_TO_EPILOG $vgpr0 + %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 + %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = COPY %3 + SI_RETURN_TO_EPILOG $vgpr0 + +... 
+ +--- +name: function_no_fold_fi_non_stack_soffset +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 4 +stack: + - { id: 0, size: 4, alignment: 4, local-offset: 0 } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr32' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + + ; GCN-LABEL: name: function_no_fold_fi_non_stack_soffset + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GCN: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = COPY %2 + S_ENDPGM 0, implicit $vgpr0 + +... + +--- +name: function_fold_fi_mubuf_wave_relative +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 4 +stack: + - { id: 0, size: 4, alignment: 4, local-offset: 0 } +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr32' + stackPtrOffsetReg: '$sgpr32' +body: | + bb.0: + + ; GCN-LABEL: name: function_fold_fi_mubuf_wave_relative + ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] + ; GCN: S_ENDPGM 0, implicit $vgpr0 + %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec + %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec + + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = COPY %2 + S_ENDPGM 0, implicit $vgpr0 + +... 
+ +--- +name: function_fold_fi_mubuf_stack_relative +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 + localFrameSize: 4 +stack: + - { id: 0, size: 4, alignment: 4, local-offset: 0 } +machineFunctionInfo: + isEntryFunction: false scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - scratchWaveOffsetReg: '$sgpr33' + frameOffsetReg: '$sgpr32' stackPtrOffsetReg: '$sgpr32' body: | bb.0: - ; GCN-LABEL: name: no_fold_fi_mubuf_scratch_sp_offset + ; GCN-LABEL: name: function_fold_fi_mubuf_stack_relative ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll index a928384457a4a..be8ef40a48761 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-index-elimination.ll @@ -7,10 +7,9 @@ ; Materialize into a mov. Make sure there isn't an unnecessary copy. ; GCN-LABEL: {{^}}func_mov_fi_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6 -; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]] +; CI-NEXT: v_lshr_b32_e64 v0, s32, 6 +; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s32 ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 @@ -24,20 +23,16 @@ define void @func_mov_fi_i32() #0 { ; GCN-LABEL: {{^}}func_mov_fi_i32_offset: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI: s_sub_u32 [[SUB0:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-NEXT: s_sub_u32 [[SUB1:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-DAG: v_lshr_b32_e64 v0, [[SUB0]], 6 -; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB1]], 6 +; CI-DAG: v_lshr_b32_e64 v0, s32, 6 ; CI-NOT: v_mov ; CI: ds_write_b32 v0, v0 -; CI-NEXT: v_add_i32_e64 v0, s{{\[[0-9]+:[0-9]+\]}}, 4, [[SCALED]] +; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 +; CI-NEXT: v_add_i32_e{{32|64}} v0, {{s\[[0-9]+:[0-9]+\]|vcc}}, 4, [[SCALED]] ; CI-NEXT: ds_write_b32 v0, v0 -; GFX9: s_sub_u32 [[SUB0:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; GFX9-NEXT: s_sub_u32 [[SUB1:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB0]] -; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB1]] -; GFX9-DAG: ds_write_b32 v0, v0 +; GFX9: v_lshrrev_b32_e64 v0, 6, s32 +; GFX9-NEXT: ds_write_b32 v0, v0 +; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] ; GFX9-NEXT: ds_write_b32 v0, v0 define void @func_mov_fi_i32_offset() #0 { @@ -53,15 +48,13 @@ define void @func_mov_fi_i32_offset() #0 { ; GCN-LABEL: {{^}}func_add_constant_to_fi_i32: ; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-NEXT: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6 +; CI: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI-NEXT: v_add_i32_e32 v0, vcc, 4, [[SCALED]] -; GFX9-NEXT: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]] +; GFX9: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 ; GFX9-NEXT: v_add_u32_e32 v0, 4, [[SCALED]] - ; GCN-NOT: v_mov ; GCN: ds_write_b32 v0, v0 define void @func_add_constant_to_fi_i32() #0 { @@ -75,11 +68,10 @@ define void @func_add_constant_to_fi_i32() #0 { ; into. 
; GCN-LABEL: {{^}}func_other_fi_user_i32: -; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-NEXT: v_lshr_b32_e64 v0, [[SUB]], 6 +; CI: v_lshr_b32_e64 v0, s32, 6 -; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, [[SUB]] +; GFX9: v_lshrrev_b32_e64 v0, 6, s32 ; GCN-NEXT: v_mul_u32_u24_e32 v0, 9, v0 ; GCN-NOT: v_mov @@ -94,7 +86,7 @@ define void @func_other_fi_user_i32() #0 { ; GCN-LABEL: {{^}}func_store_private_arg_i32_ptr: ; GCN: v_mov_b32_e32 v1, 15{{$}} -; GCN: buffer_store_dword v1, v0, s[0:3], s33 offen{{$}} +; GCN: buffer_store_dword v1, v0, s[0:3], 0 offen{{$}} define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { store volatile i32 15, i32 addrspace(5)* %ptr ret void @@ -102,7 +94,7 @@ define void @func_store_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { ; GCN-LABEL: {{^}}func_load_private_arg_i32_ptr: ; GCN: s_waitcnt -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen{{$}} +; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen{{$}} define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { %val = load volatile i32, i32 addrspace(5)* %ptr ret void @@ -110,12 +102,11 @@ define void @func_load_private_arg_i32_ptr(i32 addrspace(5)* %ptr) #0 { ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr: ; GCN: s_waitcnt -; GCN-NEXT: s_sub_u32 [[SUB_OFFSET:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-NEXT: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 +; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 ; CI-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] -; GFX9-NEXT: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]] +; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 ; GFX9-NEXT: v_or_b32_e32 v0, 4, [[SHIFT]] ; GCN-NOT: v_mov @@ -143,11 +134,10 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 } addrspace(5)* b } ; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block: -; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s32, s33 -; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 +; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 -; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]] +; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 ; GCN: s_and_saveexec_b64 @@ -175,13 +165,12 @@ ret: ; Added offset can't be used with VOP3 add ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32: -; GCN: s_sub_u32 [[SUB:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200 -; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[SUB]], 6 +; CI-DAG: s_movk_i32 [[K:s[0-9]+|vcc_lo|vcc_hi]], 0x200 +; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI: v_add_i32_e32 [[VZ:v[0-9]+]], vcc, [[K]], [[SCALED]] -; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[SUB]] +; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 ; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] ; GCN: v_mul_u32_u24_e32 [[VZ]], 9, [[VZ]] @@ -199,13 +188,12 @@ define void @func_other_fi_user_non_inline_imm_offset_i32() #0 { } ; GCN-LABEL: {{^}}func_other_fi_user_non_inline_imm_offset_i32_vcc_live: -; GCN: s_sub_u32 [[DIFF:s[0-9]+]], s32, s33 -; CI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200 -; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], [[DIFF]], 6 +; CI-DAG: s_movk_i32 [[OFFSET:s[0-9]+]], 0x200 +; CI-DAG: v_lshr_b32_e64 [[SCALED:v[0-9]+]], s32, 6 ; CI: v_add_i32_e64 [[VZ:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, [[OFFSET]], [[SCALED]] -; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, [[DIFF]] +; GFX9-DAG: v_lshrrev_b32_e64 [[SCALED:v[0-9]+]], 6, s32 ; GFX9: v_add_u32_e32 [[VZ:v[0-9]+]], 0x200, [[SCALED]] ; GCN: v_mul_u32_u24_e32 
[[VZ]], 9, [[VZ]] @@ -231,10 +219,10 @@ declare void @func(<4 x float> addrspace(5)* nocapture) #0 ; GCN-LABEL: {{^}}undefined_stack_store_reg: ; GCN: s_and_saveexec_b64 -; GCN: buffer_store_dword v0, off, s[0:3], s34 offset: -; GCN: buffer_store_dword v0, off, s[0:3], s34 offset: -; GCN: buffer_store_dword v0, off, s[0:3], s34 offset: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset: +; GCN: buffer_store_dword v0, off, s[0:3], s33 offset: +; GCN: buffer_store_dword v0, off, s[0:3], s33 offset: +; GCN: buffer_store_dword v0, off, s[0:3], s33 offset: +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: define void @undefined_stack_store_reg(float %arg, i32 %arg1) #0 { bb: %tmp = alloca <4 x float>, align 16, addrspace(5) @@ -256,12 +244,11 @@ bb5: ; GCN-LABEL: {{^}}alloca_ptr_nonentry_block: ; GCN: s_and_saveexec_b64 ; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:4 -; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+|vcc_lo|vcc_hi]], s32, s33 -; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6 +; CI: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], s32, 6 ; CI-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] -; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, [[SUB_OFFSET]] +; GFX9: v_lshrrev_b32_e64 [[SHIFT:v[0-9]+]], 6, s32 ; GFX9-NEXT: v_or_b32_e32 [[PTR:v[0-9]+]], 4, [[SHIFT]] ; GCN: ds_write_b32 v{{[0-9]+}}, [[PTR]] diff --git a/llvm/test/CodeGen/AMDGPU/frame-lowering-entry-all-sgpr-used.mir b/llvm/test/CodeGen/AMDGPU/frame-lowering-entry-all-sgpr-used.mir index 686a617a064ff..d8542bd075679 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-lowering-entry-all-sgpr-used.mir +++ b/llvm/test/CodeGen/AMDGPU/frame-lowering-entry-all-sgpr-used.mir @@ -26,7 +26,6 @@ machineFunctionInfo: isEntryFunction: true waveLimiter: true scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' - scratchWaveOffsetReg: '$sgpr101' frameOffsetReg: '$sgpr101' stackPtrOffsetReg: '$sgpr32' argumentInfo: diff --git a/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir b/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir index 42484398f9d05..12fa2d4d68722 100644 --- a/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir +++ b/llvm/test/CodeGen/AMDGPU/frame-lowering-fp-adjusted.mir @@ -29,9 +29,8 @@ machineFunctionInfo: isEntryFunction: true waveLimiter: true scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' - scratchWaveOffsetReg: '$sgpr101' - frameOffsetReg: '$sgpr101' stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' argumentInfo: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } dispatchPtr: { reg: '$sgpr4_sgpr5' } diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index f5d7671b9cf31..b840810890c89 100644 --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -4,9 +4,18 @@ ; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX89,GFX9 ; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,R600 -declare i32 @llvm.fshr.i32(i32, i32, i32) nounwind readnone -declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone -declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone +declare i32 @llvm.fshr.i32(i32, i32, i32) +declare <2 x i32> @llvm.fshr.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) +declare <3 x i32> @llvm.fshr.v3i32(<3 x i32>, <3 x i32>, <3 x i32>) +declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) +declare i16 @llvm.fshr.i16(i16, i16, i16) +declare <2 x i16> 
@llvm.fshr.v2i16(<2 x i16>, <2 x i16>, <2 x i16>) +declare <3 x i16> @llvm.fshr.v3i16(<3 x i16>, <3 x i16>, <3 x i16>) +declare <4 x i16> @llvm.fshr.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) +declare i64 @llvm.fshr.i64(i64, i64, i64) +declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) +declare i24 @llvm.fshr.i24(i24, i24, i24) +declare <2 x i24> @llvm.fshr.v2i24(<2 x i24>, <2 x i24>, <2 x i24>) define amdgpu_kernel void @fshr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y, i32 %z) { ; SI-LABEL: fshr_i32: @@ -502,3 +511,858 @@ entry: store <4 x i32> %0, <4 x i32> addrspace(1)* %in ret void } + +define i32 @v_fshr_i32(i32 %src0, i32 %src1, i32 %src2) { +; GFX89-LABEL: v_fshr_i32: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_alignbit_b32 v0, v0, v1, v2 +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_fshr_i32: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %ret = call i32 @llvm.fshr.i32(i32 %src0, i32 %src1, i32 %src2) + ret i32 %ret +} + +define <2 x i32> @v_fshr_v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) { +; GFX89-LABEL: v_fshr_v2i32: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_and_b32_e32 v4, 31, v4 +; GFX89-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX89-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX89-NEXT: v_and_b32_e32 v2, 31, v5 +; GFX89-NEXT: v_alignbit_b32 v1, v1, v3, v2 +; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX89-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_fshr_v2i32: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %ret = call <2 x i32> @llvm.fshr.v2i32(<2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2) + ret <2 x i32> %ret +} + +define <3 x i32> @v_fshr_v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) { +; GFX89-LABEL: v_fshr_v3i32: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_and_b32_e32 v6, 31, v6 +; GFX89-NEXT: v_alignbit_b32 v0, v0, v3, v6 +; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6 +; GFX89-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; GFX89-NEXT: v_and_b32_e32 v3, 31, v7 +; GFX89-NEXT: v_alignbit_b32 v1, v1, v4, v3 +; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX89-NEXT: v_and_b32_e32 v3, 31, v8 +; GFX89-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; GFX89-NEXT: v_alignbit_b32 v2, v2, v5, v3 +; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX89-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_fshr_v3i32: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %ret = call <3 x i32> @llvm.fshr.v3i32(<3 x i32> %src0, <3 x i32> %src1, <3 x i32> %src2) + ret <3 x i32> %ret +} + +define <4 x i32> @v_fshr_v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) { +; GFX89-LABEL: v_fshr_v4i32: +; GFX89: ; %bb.0: +; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX89-NEXT: v_and_b32_e32 v8, 31, v8 +; GFX89-NEXT: v_alignbit_b32 v0, v0, v4, v8 +; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8 +; GFX89-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX89-NEXT: v_and_b32_e32 v4, 31, v9 +; GFX89-NEXT: v_alignbit_b32 v1, v1, v5, v4 +; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX89-NEXT: v_and_b32_e32 v4, 31, v10 +; GFX89-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX89-NEXT: v_alignbit_b32 v2, v2, v6, v4 +; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX89-NEXT: v_and_b32_e32 v4, 31, v11 +; GFX89-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX89-NEXT: 
v_alignbit_b32 v3, v3, v7, v4 +; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX89-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX89-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_fshr_v4i32: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %ret = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2) + ret <4 x i32> %ret +} + +define i16 @v_fshr_i16(i16 %src0, i16 %src1, i16 %src2) { +; SI-LABEL: v_fshr_i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v2, 15, v2 +; SI-NEXT: v_and_b32_e32 v3, 0xffff, v1 +; SI-NEXT: v_sub_i32_e32 v4, vcc, 16, v2 +; SI-NEXT: v_lshr_b32_e32 v3, v3, v2 +; SI-NEXT: v_lshl_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_fshr_i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v2, 15, v2 +; VI-NEXT: v_sub_u16_e32 v4, 16, v2 +; VI-NEXT: v_lshrrev_b16_e32 v3, v2, v1 +; VI-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_sub_u16_e32 v4, 16, v2 +; GFX9-NEXT: v_lshrrev_b16_e32 v3, v2, v1 +; GFX9-NEXT: v_lshlrev_b16_e32 v0, v4, v0 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_fshr_i16: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %ret = call i16 @llvm.fshr.i16(i16 %src0, i16 %src1, i16 %src2) + ret i16 %ret +} + +define <2 x i16> @v_fshr_v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) { +; SI-LABEL: v_fshr_v2i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_and_b32_e32 v5, 15, v5 +; SI-NEXT: v_and_b32_e32 v7, s4, v3 +; SI-NEXT: v_sub_i32_e32 v8, vcc, 16, v5 +; SI-NEXT: v_lshr_b32_e32 v7, v7, v5 +; SI-NEXT: v_lshl_b32_e32 v1, v1, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5 +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; SI-NEXT: v_and_b32_e32 v3, 15, v4 +; SI-NEXT: v_sub_i32_e32 v5, vcc, 16, v3 +; SI-NEXT: v_and_b32_e32 v6, s4, v2 +; SI-NEXT: v_lshr_b32_e32 v4, v6, v3 +; SI-NEXT: v_lshl_b32_e32 v0, v0, v5 +; SI-NEXT: v_or_b32_e32 v0, v0, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_fshr_v2i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v3, 0xf000f, v2 +; VI-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; VI-NEXT: v_bfe_u32 v2, v2, 16, 4 +; VI-NEXT: v_lshrrev_b16_e32 v4, v3, v1 +; VI-NEXT: v_lshrrev_b16_sdwa v6, v2, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; VI-NEXT: v_or_b32_e32 v4, v4, v6 +; VI-NEXT: v_sub_u16_e32 v6, 16, v2 +; VI-NEXT: v_sub_u16_e32 v7, 16, v3 +; VI-NEXT: v_lshlrev_b16_sdwa v6, v6, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_lshlrev_b16_e32 v0, v7, v0 +; VI-NEXT: 
v_or_b32_e32 v0, v0, v6 +; VI-NEXT: v_or_b32_e32 v0, v0, v4 +; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3 +; VI-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; VI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; VI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; VI-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v2, 0xf000f, v2 +; GFX9-NEXT: v_pk_sub_i16 v4, 16, v2 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v4, v0 +; GFX9-NEXT: v_pk_lshrrev_b16 v3, v2, v1 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_cmp_eq_u16_sdwa s[4:5], v2, v4 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v3 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_fshr_v2i16: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %ret = call <2 x i16> @llvm.fshr.v2i16(<2 x i16> %src0, <2 x i16> %src1, <2 x i16> %src2) + ret <2 x i16> %ret +} + +define <3 x i16> @v_fshr_v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) { +; SI-LABEL: v_fshr_v3i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_and_b32_e32 v7, 15, v7 +; SI-NEXT: v_and_b32_e32 v12, s4, v4 +; SI-NEXT: v_sub_i32_e32 v13, vcc, 16, v7 +; SI-NEXT: v_lshr_b32_e32 v12, v12, v7 +; SI-NEXT: v_lshl_b32_e32 v1, v1, v13 +; SI-NEXT: v_or_b32_e32 v1, v1, v12 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc +; SI-NEXT: v_and_b32_e32 v4, 15, v6 +; SI-NEXT: v_sub_i32_e32 v7, vcc, 16, v4 +; SI-NEXT: v_and_b32_e32 v11, s4, v3 +; SI-NEXT: v_lshr_b32_e32 v6, v11, v4 +; SI-NEXT: v_lshl_b32_e32 v0, v0, v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v6 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; SI-NEXT: v_mov_b32_e32 v9, 0xffff +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, v9, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v1, 15, v8 +; SI-NEXT: v_sub_i32_e32 v4, vcc, 16, v1 +; SI-NEXT: v_and_b32_e32 v10, s4, v5 +; SI-NEXT: v_lshr_b32_e32 v3, v10, v1 +; SI-NEXT: v_lshl_b32_e32 v2, v2, v4 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc +; SI-NEXT: v_and_b32_e32 v2, v9, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_fshr_v3i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v6, 15 +; VI-NEXT: v_and_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; VI-NEXT: v_lshrrev_b16_e32 v8, v6, v7 +; VI-NEXT: v_sub_u16_e32 v6, 16, v6 +; VI-NEXT: v_lshlrev_b16_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v6, v6, v8 +; VI-NEXT: v_bfe_u32 v8, v4, 16, 4 +; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v8 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v7, vcc +; VI-NEXT: v_and_b32_e32 v7, 15, v5 +; VI-NEXT: 
v_lshrrev_b16_e32 v8, v7, v3 +; VI-NEXT: v_sub_u16_e32 v7, 16, v7 +; VI-NEXT: s_mov_b32 s4, 0xf000f +; VI-NEXT: v_lshlrev_b16_e32 v1, v7, v1 +; VI-NEXT: v_and_b32_e32 v5, s4, v5 +; VI-NEXT: v_or_b32_e32 v1, v1, v8 +; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_and_b32_e32 v3, 15, v4 +; VI-NEXT: v_lshrrev_b16_e32 v5, v3, v2 +; VI-NEXT: v_sub_u16_e32 v3, 16, v3 +; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; VI-NEXT: v_and_b32_e32 v3, s4, v4 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_v3i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, 15 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v4 +; GFX9-NEXT: v_mov_b32_e32 v8, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v7, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v6, v8, v6 +; GFX9-NEXT: v_lshl_or_b32 v6, v7, 16, v6 +; GFX9-NEXT: v_pk_lshrrev_b16 v7, v6, v2 +; GFX9-NEXT: v_pk_sub_i16 v6, 16, v6 op_sel_hi:[0,1] +; GFX9-NEXT: s_mov_b32 s6, 0xf000f +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v6, v0 +; GFX9-NEXT: v_and_b32_e32 v4, s6, v4 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v7 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v4 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v0, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_cmp_eq_u16_sdwa s[4:5], v4, v7 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GFX9-NEXT: v_and_b32_e32 v2, 15, v5 +; GFX9-NEXT: v_and_b32_e32 v2, v8, v2 +; GFX9-NEXT: v_pk_lshrrev_b16 v4, v2, v3 +; GFX9-NEXT: v_pk_sub_i16 v2, 16, v2 +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v2, v1 +; GFX9-NEXT: v_and_b32_e32 v2, s6, v5 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v2 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v4 +; GFX9-NEXT: v_and_b32_e32 v2, v8, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_fshr_v3i16: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %ret = call <3 x i16> @llvm.fshr.v3i16(<3 x i16> %src0, <3 x i16> %src1, <3 x i16> %src2) + ret <3 x i16> %ret +} + +define <4 x i16> @v_fshr_v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) { +; SI-LABEL: v_fshr_v4i16: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0xffff +; SI-NEXT: v_and_b32_e32 v11, 15, v11 +; SI-NEXT: v_and_b32_e32 v16, s4, v7 +; SI-NEXT: v_sub_i32_e32 v17, vcc, 16, v11 +; SI-NEXT: v_lshr_b32_e32 v16, v16, v11 +; SI-NEXT: v_lshl_b32_e32 v3, v3, v17 +; SI-NEXT: v_or_b32_e32 v3, v3, v16 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v11 +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; SI-NEXT: v_and_b32_e32 v7, 15, v10 +; SI-NEXT: v_sub_i32_e32 v11, vcc, 16, v7 +; SI-NEXT: v_and_b32_e32 v15, s4, v6 +; SI-NEXT: v_lshr_b32_e32 v10, v15, v7 +; SI-NEXT: v_lshl_b32_e32 v2, v2, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7 +; SI-NEXT: v_mov_b32_e32 v12, 0xffff +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; SI-NEXT: v_and_b32_e32 v2, v12, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v3, 15, v9 +; SI-NEXT: v_sub_i32_e32 
v7, vcc, 16, v3 +; SI-NEXT: v_and_b32_e32 v14, s4, v5 +; SI-NEXT: v_lshr_b32_e32 v6, v14, v3 +; SI-NEXT: v_lshl_b32_e32 v1, v1, v7 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_and_b32_e32 v3, 15, v8 +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; SI-NEXT: v_sub_i32_e32 v6, vcc, 16, v3 +; SI-NEXT: v_and_b32_e32 v13, s4, v4 +; SI-NEXT: v_lshr_b32_e32 v5, v13, v3 +; SI-NEXT: v_lshl_b32_e32 v0, v0, v6 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_and_b32_e32 v0, v12, v0 +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_alignbit_b32 v1, v2, v0, 16 +; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_fshr_v4i16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v6, 15 +; VI-NEXT: v_and_b32_sdwa v7, v5, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; VI-NEXT: v_lshrrev_b16_e32 v9, v7, v8 +; VI-NEXT: v_sub_u16_e32 v7, 16, v7 +; VI-NEXT: v_lshlrev_b16_sdwa v7, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v7, v7, v9 +; VI-NEXT: v_bfe_u32 v9, v5, 16, 4 +; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9 +; VI-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc +; VI-NEXT: v_and_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; VI-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; VI-NEXT: v_lshrrev_b16_e32 v9, v6, v8 +; VI-NEXT: v_sub_u16_e32 v6, 16, v6 +; VI-NEXT: v_lshlrev_b16_sdwa v6, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_or_b32_e32 v6, v6, v9 +; VI-NEXT: v_bfe_u32 v9, v4, 16, 4 +; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v9 +; VI-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; VI-NEXT: v_and_b32_e32 v8, 15, v5 +; VI-NEXT: v_lshrrev_b16_e32 v9, v8, v3 +; VI-NEXT: v_sub_u16_e32 v8, 16, v8 +; VI-NEXT: s_mov_b32 s4, 0xf000f +; VI-NEXT: v_lshlrev_b16_e32 v1, v8, v1 +; VI-NEXT: v_and_b32_e32 v5, s4, v5 +; VI-NEXT: v_or_b32_e32 v1, v1, v9 +; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5 +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: v_and_b32_e32 v3, 15, v4 +; VI-NEXT: v_lshrrev_b16_e32 v5, v3, v2 +; VI-NEXT: v_sub_u16_e32 v3, 16, v3 +; VI-NEXT: v_lshlrev_b16_e32 v0, v3, v0 +; VI-NEXT: v_and_b32_e32 v3, s4, v4 +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 +; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, 15 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v5 +; GFX9-NEXT: v_mov_b32_e32 v9, 0xffff +; GFX9-NEXT: v_and_b32_sdwa v8, v5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v6, v9, v6 +; GFX9-NEXT: v_lshl_or_b32 v6, v8, 16, v6 +; GFX9-NEXT: v_pk_lshrrev_b16 v8, v6, v3 +; GFX9-NEXT: v_pk_sub_i16 v6, 16, v6 op_sel_hi:[0,1] +; GFX9-NEXT: s_mov_b32 s6, 0xf000f +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v6, v1 +; GFX9-NEXT: v_and_b32_e32 v5, s6, v5 +; GFX9-NEXT: v_or_b32_e32 v1, v1, 
v8 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v5 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v6, v1, v3, vcc +; GFX9-NEXT: v_cmp_eq_u16_sdwa s[4:5], v5, v8 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5] +; GFX9-NEXT: v_and_b32_e32 v3, 15, v4 +; GFX9-NEXT: v_and_b32_sdwa v5, v4, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v3, v9, v3 +; GFX9-NEXT: v_lshl_or_b32 v3, v5, 16, v3 +; GFX9-NEXT: v_pk_lshrrev_b16 v5, v3, v2 +; GFX9-NEXT: v_pk_sub_i16 v3, 16, v3 op_sel_hi:[0,1] +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v3, v0 +; GFX9-NEXT: v_and_b32_e32 v3, s6, v4 +; GFX9-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX9-NEXT: v_cmp_eq_u16_e32 vcc, 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_cmp_eq_u16_sdwa s[4:5], v3, v8 src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5] +; GFX9-NEXT: v_and_b32_e32 v2, v9, v4 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v2 +; GFX9-NEXT: v_and_b32_e32 v2, v9, v6 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_fshr_v4i16: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %ret = call <4 x i16> @llvm.fshr.v4i16(<4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2) + ret <4 x i16> %ret +} + +define i64 @v_fshr_i64(i64 %src0, i64 %src1, i64 %src2) { +; SI-LABEL: v_fshr_i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_and_b32_e32 v4, 63, v4 +; SI-NEXT: v_sub_i32_e32 v7, vcc, 64, v4 +; SI-NEXT: v_lshr_b64 v[5:6], v[2:3], v4 +; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v7 +; SI-NEXT: v_or_b32_e32 v0, v0, v5 +; SI-NEXT: v_mov_b32_e32 v5, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_fshr_i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v4, 63, v4 +; VI-NEXT: v_sub_u32_e32 v7, vcc, 64, v4 +; VI-NEXT: v_lshrrev_b64 v[5:6], v4, v[2:3] +; VI-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; VI-NEXT: v_or_b32_e32 v0, v0, v5 +; VI-NEXT: v_mov_b32_e32 v5, 0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; VI-NEXT: v_or_b32_e32 v1, v1, v6 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, 63, v4 +; GFX9-NEXT: v_sub_u32_e32 v7, 64, v4 +; GFX9-NEXT: v_lshrrev_b64 v[5:6], v4, v[2:3] +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1] +; GFX9-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GFX9-NEXT: v_or_b32_e32 v1, v1, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_fshr_i64: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %ret = call i64 @llvm.fshr.i64(i64 %src0, i64 %src1, i64 %src2) + ret i64 %ret +} + +define <2 x i64> @v_fshr_v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) { +; SI-LABEL: v_fshr_v2i64: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: 
v_and_b32_e32 v8, 63, v8 +; SI-NEXT: v_sub_i32_e32 v9, vcc, 64, v8 +; SI-NEXT: v_lshl_b64 v[0:1], v[0:1], v9 +; SI-NEXT: v_lshr_b64 v[11:12], v[4:5], v8 +; SI-NEXT: v_mov_b32_e32 v9, 0 +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; SI-NEXT: v_or_b32_e32 v0, v0, v11 +; SI-NEXT: v_and_b32_e32 v8, 63, v10 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; SI-NEXT: v_sub_i32_e64 v4, s[4:5], 64, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v12 +; SI-NEXT: v_lshr_b64 v[10:11], v[6:7], v8 +; SI-NEXT: v_lshl_b64 v[2:3], v[2:3], v4 +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; SI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; SI-NEXT: v_or_b32_e32 v3, v3, v11 +; SI-NEXT: v_or_b32_e32 v2, v2, v10 +; SI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; SI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_fshr_v2i64: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_and_b32_e32 v8, 63, v8 +; VI-NEXT: v_sub_u32_e32 v9, vcc, 64, v8 +; VI-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; VI-NEXT: v_lshrrev_b64 v[11:12], v8, v[4:5] +; VI-NEXT: v_mov_b32_e32 v9, 0 +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; VI-NEXT: v_or_b32_e32 v0, v0, v11 +; VI-NEXT: v_and_b32_e32 v8, 63, v10 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; VI-NEXT: v_sub_u32_e64 v4, s[4:5], 64, v8 +; VI-NEXT: v_or_b32_e32 v1, v1, v12 +; VI-NEXT: v_lshrrev_b64 v[10:11], v8, v[6:7] +; VI-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; VI-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; VI-NEXT: v_or_b32_e32 v3, v3, v11 +; VI-NEXT: v_or_b32_e32 v2, v2, v10 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; VI-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v8, 63, v8 +; GFX9-NEXT: v_sub_u32_e32 v9, 64, v8 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1] +; GFX9-NEXT: v_lshrrev_b64 v[11:12], v8, v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_or_b32_e32 v0, v0, v11 +; GFX9-NEXT: v_and_b32_e32 v8, 63, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; GFX9-NEXT: v_sub_u32_e32 v4, 64, v8 +; GFX9-NEXT: v_or_b32_e32 v1, v1, v12 +; GFX9-NEXT: v_lshrrev_b64 v[10:11], v8, v[6:7] +; GFX9-NEXT: v_lshlrev_b64 v[2:3], v4, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] +; GFX9-NEXT: v_or_b32_e32 v3, v3, v11 +; GFX9-NEXT: v_or_b32_e32 v2, v2, v10 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_fshr_v2i64: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %ret = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2) + ret <2 x i64> %ret +} + +define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) { +; SI-LABEL: v_fshr_i24: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, 0xffffff +; SI-NEXT: v_and_b32_e32 v2, s4, v2 +; SI-NEXT: s_mov_b32 s5, 0xaaaaaaab +; SI-NEXT: v_mul_hi_u32 v3, v2, s5 +; SI-NEXT: v_and_b32_e32 v4, s4, v1 +; SI-NEXT: v_lshrrev_b32_e32 v3, 4, v3 +; SI-NEXT: v_mul_lo_u32 v3, v3, 24 +; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 +; SI-NEXT: v_lshr_b32_e32 v3, v4, v2 +; SI-NEXT: v_sub_i32_e32 v4, vcc, 24, v2 +; SI-NEXT: v_and_b32_e32 v4, s4, v4 +; SI-NEXT: v_lshl_b32_e32 v0, v0, v4 +; SI-NEXT: v_or_b32_e32 v0, v0, v3 +; 
SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_fshr_i24: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: s_mov_b32 s4, 0xffffff +; VI-NEXT: v_and_b32_e32 v2, s4, v2 +; VI-NEXT: s_mov_b32 s5, 0xaaaaaaab +; VI-NEXT: v_mul_hi_u32 v3, v2, s5 +; VI-NEXT: v_and_b32_e32 v4, s4, v1 +; VI-NEXT: v_lshrrev_b32_e32 v3, 4, v3 +; VI-NEXT: v_mul_lo_u32 v3, v3, 24 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v3 +; VI-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; VI-NEXT: v_sub_u32_e32 v4, vcc, 24, v2 +; VI-NEXT: v_and_b32_e32 v4, s4, v4 +; VI-NEXT: v_lshlrev_b32_e32 v0, v4, v0 +; VI-NEXT: v_or_b32_e32 v0, v0, v3 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_i24: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s4, 0xffffff +; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: s_mov_b32 s5, 0xaaaaaaab +; GFX9-NEXT: v_mul_hi_u32 v3, v2, s5 +; GFX9-NEXT: v_and_b32_e32 v4, s4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 4, v3 +; GFX9-NEXT: v_mul_lo_u32 v3, v3, 24 +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v3 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, v2, v4 +; GFX9-NEXT: v_sub_u32_e32 v4, 24, v2 +; GFX9-NEXT: v_and_b32_e32 v4, s4, v4 +; GFX9-NEXT: v_lshl_or_b32 v0, v0, v4, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_fshr_i24: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2) + ret i24 %ret +} + +define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) { +; SI-LABEL: v_fshr_v2i24: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 +; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; SI-NEXT: buffer_load_dword v5, off, s[0:3], s32 +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; SI-NEXT: s_mov_b32 s4, 0xffffff +; SI-NEXT: s_mov_b32 s5, 0xaaaaaaab +; SI-NEXT: v_add_i32_e32 v7, vcc, 3, v0 +; SI-NEXT: v_add_i32_e32 v8, vcc, 4, v0 +; SI-NEXT: v_add_i32_e32 v9, vcc, 5, v0 +; SI-NEXT: v_add_i32_e32 v10, vcc, 2, v0 +; SI-NEXT: s_waitcnt vmcnt(5) +; SI-NEXT: v_and_b32_e32 v14, s4, v1 +; SI-NEXT: s_waitcnt vmcnt(4) +; SI-NEXT: v_and_b32_e32 v2, s4, v2 +; SI-NEXT: v_mul_hi_u32 v12, v2, s5 +; SI-NEXT: s_waitcnt vmcnt(3) +; SI-NEXT: v_and_b32_e32 v3, s4, v3 +; SI-NEXT: v_mul_hi_u32 v13, v3, s5 +; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: v_and_b32_e32 v11, s4, v4 +; SI-NEXT: v_lshrrev_b32_e32 v12, 4, v12 +; SI-NEXT: v_mul_lo_u32 v12, v12, 24 +; SI-NEXT: v_lshrrev_b32_e32 v13, 4, v13 +; SI-NEXT: v_mul_lo_u32 v13, v13, 24 +; SI-NEXT: v_sub_i32_e32 v2, vcc, v2, v12 +; SI-NEXT: v_lshr_b32_e32 v12, v14, v2 +; SI-NEXT: v_sub_i32_e32 v3, vcc, v3, v13 +; SI-NEXT: v_sub_i32_e32 v13, vcc, 24, v2 +; SI-NEXT: v_sub_i32_e32 v14, vcc, 24, v3 +; SI-NEXT: v_and_b32_e32 v13, s4, v13 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_lshl_b32_e32 v5, v5, v13 +; SI-NEXT: v_and_b32_e32 v14, 0xffffff, v14 +; SI-NEXT: v_lshr_b32_e32 v11, v11, v3 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshl_b32_e32 v6, v6, v14 +; SI-NEXT: v_or_b32_e32 v5, v5, v12 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SI-NEXT: 
v_cndmask_b32_e32 v1, v5, v1, vcc +; SI-NEXT: v_or_b32_e32 v6, v6, v11 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; SI-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc +; SI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen +; SI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; SI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; SI-NEXT: s_waitcnt expcnt(1) +; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; SI-NEXT: buffer_store_byte v0, v8, s[0:3], 0 offen +; SI-NEXT: buffer_store_byte v2, v9, s[0:3], 0 offen +; SI-NEXT: buffer_store_byte v1, v10, s[0:3], 0 offen +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] +; +; VI-LABEL: v_fshr_v2i24: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; VI-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; VI-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 +; VI-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; VI-NEXT: buffer_load_dword v5, off, s[0:3], s32 +; VI-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 +; VI-NEXT: s_mov_b32 s4, 0xffffff +; VI-NEXT: s_mov_b32 s5, 0xaaaaaaab +; VI-NEXT: v_add_u32_e32 v7, vcc, 3, v0 +; VI-NEXT: v_add_u32_e32 v8, vcc, 4, v0 +; VI-NEXT: v_add_u32_e32 v9, vcc, 5, v0 +; VI-NEXT: v_add_u32_e32 v10, vcc, 2, v0 +; VI-NEXT: s_waitcnt vmcnt(5) +; VI-NEXT: v_and_b32_e32 v14, s4, v1 +; VI-NEXT: s_waitcnt vmcnt(4) +; VI-NEXT: v_and_b32_e32 v2, s4, v2 +; VI-NEXT: v_mul_hi_u32 v12, v2, s5 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_and_b32_e32 v3, s4, v3 +; VI-NEXT: v_mul_hi_u32 v13, v3, s5 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: v_and_b32_e32 v11, s4, v4 +; VI-NEXT: v_lshrrev_b32_e32 v12, 4, v12 +; VI-NEXT: v_mul_lo_u32 v12, v12, 24 +; VI-NEXT: v_lshrrev_b32_e32 v13, 4, v13 +; VI-NEXT: v_mul_lo_u32 v13, v13, 24 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v2, v12 +; VI-NEXT: v_lshrrev_b32_e32 v12, v2, v14 +; VI-NEXT: v_sub_u32_e32 v3, vcc, v3, v13 +; VI-NEXT: v_sub_u32_e32 v13, vcc, 24, v2 +; VI-NEXT: v_sub_u32_e32 v14, vcc, 24, v3 +; VI-NEXT: v_and_b32_e32 v13, s4, v13 +; VI-NEXT: s_waitcnt vmcnt(1) +; VI-NEXT: v_lshlrev_b32_e32 v5, v13, v5 +; VI-NEXT: v_and_b32_e32 v14, 0xffffff, v14 +; VI-NEXT: v_lshrrev_b32_e32 v11, v3, v11 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v6, v14, v6 +; VI-NEXT: v_or_b32_e32 v5, v5, v12 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; VI-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; VI-NEXT: v_or_b32_e32 v6, v6, v11 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; VI-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc +; VI-NEXT: buffer_store_byte v2, v7, s[0:3], 0 offen +; VI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; VI-NEXT: v_lshrrev_b32_e32 v0, 8, v2 +; VI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VI-NEXT: buffer_store_byte v0, v8, s[0:3], 0 offen +; VI-NEXT: buffer_store_byte v2, v9, s[0:3], 0 offen +; VI-NEXT: buffer_store_byte v1, v10, s[0:3], 0 offen +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: v_fshr_v2i24: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s32 +; GFX9-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:4 +; 
GFX9-NEXT: s_mov_b32 s4, 0xffffff +; GFX9-NEXT: s_mov_b32 s5, 0xaaaaaaab +; GFX9-NEXT: s_waitcnt vmcnt(5) +; GFX9-NEXT: v_and_b32_e32 v10, s4, v1 +; GFX9-NEXT: s_waitcnt vmcnt(4) +; GFX9-NEXT: v_and_b32_e32 v2, s4, v2 +; GFX9-NEXT: v_mul_hi_u32 v6, v2, s5 +; GFX9-NEXT: s_waitcnt vmcnt(3) +; GFX9-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX9-NEXT: v_mul_hi_u32 v7, v3, s5 +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_and_b32_e32 v9, s4, v4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v6 +; GFX9-NEXT: v_mul_lo_u32 v6, v6, 24 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 4, v7 +; GFX9-NEXT: v_mul_lo_u32 v7, v7, 24 +; GFX9-NEXT: v_sub_u32_e32 v2, v2, v6 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, v2, v10 +; GFX9-NEXT: v_sub_u32_e32 v3, v3, v7 +; GFX9-NEXT: v_sub_u32_e32 v7, 24, v2 +; GFX9-NEXT: v_sub_u32_e32 v10, 24, v3 +; GFX9-NEXT: v_and_b32_e32 v7, s4, v7 +; GFX9-NEXT: v_lshrrev_b32_e32 v9, v3, v9 +; GFX9-NEXT: v_and_b32_e32 v10, 0xffffff, v10 +; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: v_lshl_or_b32 v5, v5, v7, v6 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshl_or_b32 v6, v8, v10, v9 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX9-NEXT: buffer_store_byte_d16_hi v2, v0, s[0:3], 0 offen offset:5 +; GFX9-NEXT: buffer_store_byte v3, v0, s[0:3], 0 offen offset:4 +; GFX9-NEXT: buffer_store_byte v2, v0, s[0:3], 0 offen offset:3 +; GFX9-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen offset:2 +; GFX9-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; R600-LABEL: v_fshr_v2i24: +; R600: ; %bb.0: +; R600-NEXT: CF_END +; R600-NEXT: PAD + %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) + ret <2 x i24> %ret +} diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll index 49a69386e632e..9c84996379481 100644 --- a/llvm/test/CodeGen/AMDGPU/function-returns.ll +++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll @@ -457,8 +457,8 @@ define {i8, i32} @struct_i8_i32_func_void() #0 { ; GCN-LABEL: {{^}}void_func_sret_struct_i8_i32: ; GCN: buffer_load_ubyte [[VAL0:v[0-9]+]] ; GCN: buffer_load_dword [[VAL1:v[0-9]+]] -; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], s33 offen{{$}} -; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], s33 offen offset:4{{$}} +; GCN: buffer_store_byte [[VAL0]], v0, s[0:3], 0 offen{{$}} +; GCN: buffer_store_dword [[VAL1]], v0, s[0:3], 0 offen offset:4{{$}} define void @void_func_sret_struct_i8_i32({ i8, i32 } addrspace(5)* sret %arg0) #0 { %val0 = load volatile i8, i8 addrspace(1)* undef %val1 = load volatile i32, i32 addrspace(1)* undef @@ -474,39 +474,39 @@ define void @void_func_sret_struct_i8_i32({ i8, i32 } addrspace(5)* sret %arg0) ; AssertZext inserted. Not using it introduces the spills. 
; GCN-LABEL: {{^}}v33i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:4{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:8{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:12{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:16{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:20{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:24{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:28{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:32{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:36{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:40{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:44{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:48{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:52{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:56{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:60{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:64{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:68{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:72{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:76{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:80{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:84{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:88{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:92{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:96{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:100{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:104{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:108{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:112{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:116{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:120{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:124{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:128{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:4{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:8{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:12{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:16{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:20{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:24{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:28{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:32{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:36{{$}} +; GFX9-DAG: 
buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:40{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:44{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:48{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:52{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:56{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:60{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:64{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:68{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:72{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:76{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:80{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:84{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:88{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:92{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:96{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:100{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:104{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:108{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:112{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:116{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}} ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define <33 x i32> @v33i32_func_void() #0 { @@ -516,39 +516,39 @@ define <33 x i32> @v33i32_func_void() #0 { } ; GCN-LABEL: {{^}}struct_v32i32_i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:4{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:8{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:12{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:16{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:20{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:24{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:28{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:32{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:36{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:40{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:44{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:48{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:52{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:56{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:60{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:64{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:68{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, 
v0, s[0:3], s33 offen offset:72{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:76{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:80{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:84{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:88{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:92{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:96{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:100{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:104{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:108{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:112{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:116{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:120{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:124{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:128{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:4{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:8{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:12{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:16{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:20{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:24{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:28{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:32{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:36{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:40{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:44{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:48{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:52{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:56{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:60{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:64{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:68{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:72{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:76{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:80{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:84{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:88{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:92{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:96{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:100{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:104{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:108{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:112{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, 
s[0:3], 0 offen offset:116{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:120{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:124{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}} ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { @@ -558,39 +558,39 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 { } ; GCN-LABEL: {{^}}struct_i32_v32i32_func_void: -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:128{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:132{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:136{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:140{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:144{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:148{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:152{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:156{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:160{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:164{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:168{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:172{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:176{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:180{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:184{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:188{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:192{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:196{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:200{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:204{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:208{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:212{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:216{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:220{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:224{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:228{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:232{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:236{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:240{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:244{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:248{{$}} -; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s33 offen offset:252{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:128{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:132{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen 
offset:136{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:140{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:144{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:148{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:152{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:156{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:160{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:164{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:168{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:172{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:176{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:180{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:184{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:188{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:192{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:196{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:200{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:204{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:208{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:212{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:216{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:220{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:224{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:228{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:232{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:236{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:240{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:244{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:248{{$}} +; GFX9-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], 0 offen offset:252{{$}} ; GFX9: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll index 14e8e609e5f3a..858bd54123679 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -48,10 +48,10 @@ entry: } ; CHECK: .name: num_spilled_sgprs -; GFX700: .sgpr_spill_count: 40 -; GFX803: .sgpr_spill_count: 24 -; GFX900: .sgpr_spill_count: 24 -; GFX1010: .sgpr_spill_count: 24 +; GFX700: .sgpr_spill_count: 38 +; GFX803: .sgpr_spill_count: 22 +; GFX900: .sgpr_spill_count: 22 +; GFX1010: .sgpr_spill_count: 22 ; CHECK: .symbol: num_spilled_sgprs.kd define amdgpu_kernel void @num_spilled_sgprs( i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32], diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll index 11dc60a5e2a2d..12d8973333f0e 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ 
b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -57,9 +57,9 @@ entry: ; CHECK-LABEL: - Name: num_spilled_sgprs ; CHECK: SymbolName: 'num_spilled_sgprs@kd' ; CHECK: CodeProps: -; GFX700: NumSpilledSGPRs: 40 -; GFX803: NumSpilledSGPRs: 24 -; GFX900: NumSpilledSGPRs: 24 +; GFX700: NumSpilledSGPRs: 38 +; GFX803: NumSpilledSGPRs: 22 +; GFX900: NumSpilledSGPRs: 22 define amdgpu_kernel void @num_spilled_sgprs( i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32], i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32], diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll index 4ec5d77b64f78..a54b2f59e475d 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -9,49 +9,49 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s2, s0, 0x40000 -; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000 -; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mad_i32_i24 v0, s2, v0, v1 -; GFX7-NEXT: s_bfe_i32 s9, s0, 0x40004 -; GFX7-NEXT: v_mov_b32_e32 v1, s10 -; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008 -; GFX7-NEXT: v_mad_i32_i24 v0, s9, v1, v0 -; GFX7-NEXT: s_bfe_i32 s11, s0, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v1, s12 -; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c -; GFX7-NEXT: v_mad_i32_i24 v0, s11, v1, v0 -; GFX7-NEXT: s_bfe_i32 s13, s0, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010 -; GFX7-NEXT: v_mad_i32_i24 v0, s13, v1, v0 -; GFX7-NEXT: s_bfe_i32 s15, s0, 0x40010 -; GFX7-NEXT: v_mov_b32_e32 v1, s16 -; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014 -; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018 -; GFX7-NEXT: v_mad_i32_i24 v0, s15, v1, v0 -; GFX7-NEXT: s_bfe_i32 s17, s0, 0x40014 -; GFX7-NEXT: v_mov_b32_e32 v1, s18 -; GFX7-NEXT: s_bfe_i32 s19, s0, 0x40018 -; GFX7-NEXT: v_mad_i32_i24 v0, s17, v1, v0 +; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000 +; GFX7-NEXT: s_bfe_i32 s9, s5, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 ; GFX7-NEXT: v_mov_b32_e32 v1, s20 -; GFX7-NEXT: s_ashr_i32 s1, s1, 28 -; GFX7-NEXT: v_mad_i32_i24 v0, s19, v1, v0 -; GFX7-NEXT: s_ashr_i32 s0, s0, 28 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mad_i32_i24 v0, s0, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX7-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: s_bfe_i32 s11, s5, 0x40008 +; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v1, s11 +; GFX7-NEXT: s_bfe_i32 s13, s5, 0x4000c +; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX7-NEXT: s_bfe_i32 s12, s4, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40010 
+; GFX7-NEXT: v_mad_i32_i24 v0, s12, v1, v0 +; GFX7-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX7-NEXT: v_mov_b32_e32 v1, s15 +; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40014 +; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40018 +; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX7-NEXT: s_bfe_i32 s16, s4, 0x40014 +; GFX7-NEXT: v_mov_b32_e32 v1, s17 +; GFX7-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s19 +; GFX7-NEXT: s_ashr_i32 s5, s5, 28 +; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0 +; GFX7-NEXT: s_ashr_i32 s4, s4, 28 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mad_i32_i24 v0, s4, v1, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc32: @@ -60,41 +60,41 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX8-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v1, s8 -; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX8-NEXT: v_mad_i32_i24 v0, s7, v1, v0 -; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v1, s10 -; GFX8-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX8-NEXT: v_mad_i32_i24 v0, s9, v1, v0 -; GFX8-NEXT: s_bfe_i32 s11, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v1, s12 -; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40010 -; GFX8-NEXT: v_mad_i32_i24 v0, s11, v1, v0 -; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v1, s14 -; GFX8-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s18, s4, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v0, s13, v1, v0 -; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v1, s16 -; GFX8-NEXT: s_bfe_i32 s17, s2, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v0, s15, v1, v0 +; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40000 +; GFX8-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX8-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: s_ashr_i32 s4, s4, 28 -; GFX8-NEXT: v_mad_i32_i24 v0, s17, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v1, s11 +; GFX8-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v1, s13 +; GFX8-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX8-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s12, v1, v0 +; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v1, s15 +; GFX8-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s17 +; GFX8-NEXT: s_ashr_i32 s3, s3, 28 +; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; 
GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mad_i32_i24 v2, s2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -107,41 +107,41 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX9-NEXT: v_mad_i32_i24 v0, s7, v1, v0 -; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX9-NEXT: v_mad_i32_i24 v0, s9, v1, v0 -; GFX9-NEXT: s_bfe_i32 s11, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v1, s12 -; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40010 -; GFX9-NEXT: v_mad_i32_i24 v0, s11, v1, v0 -; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v1, s14 -; GFX9-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s18, s4, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v0, s13, v1, v0 -; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: s_bfe_i32 s17, s2, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v0, s15, v1, v0 +; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-NEXT: v_mad_i32_i24 v0, s17, v1, v0 +; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-NEXT: v_mad_i32_i24 v0, s6, v1, v0 +; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 +; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_mad_i32_i24 v2, s2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -154,11 +154,11 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: v_dot8_i32_i4 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -167,18 +167,18 @@ define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: idot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s1, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot8_i32_i4 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -254,66 +254,66 @@ entry: define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s0, 0xffff +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000 -; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000 -; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40004 -; GFX7-NEXT: s_and_b32 s9, s9, s0 -; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 -; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40008 -; GFX7-NEXT: s_and_b32 s11, s11, s0 -; GFX7-NEXT: s_and_b32 s8, s8, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008 -; GFX7-NEXT: s_bfe_i32 s15, s2, 0x4000c -; GFX7-NEXT: s_and_b32 s13, s13, s0 -; GFX7-NEXT: s_and_b32 s10, s10, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c -; GFX7-NEXT: s_bfe_i32 s17, s2, 0x40010 -; GFX7-NEXT: s_and_b32 s15, s15, s0 -; GFX7-NEXT: s_and_b32 s12, s12, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s13 -; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010 -; GFX7-NEXT: s_bfe_i32 s19, s2, 0x40014 -; GFX7-NEXT: s_and_b32 s17, s17, s0 -; GFX7-NEXT: s_and_b32 s14, s14, s0 -; GFX7-NEXT: v_mov_b32_e32 v4, s15 -; GFX7-NEXT: s_bfe_i32 s21, s2, 0x40018 -; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014 -; GFX7-NEXT: s_and_b32 s19, s19, s0 -; GFX7-NEXT: s_and_b32 s16, s16, s0 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018 -; GFX7-NEXT: s_ashr_i32 s2, s2, 28 -; 
GFX7-NEXT: s_and_b32 s21, s21, s0 -; GFX7-NEXT: s_and_b32 s18, s18, s0 -; GFX7-NEXT: v_mov_b32_e32 v6, s19 -; GFX7-NEXT: s_ashr_i32 s1, s1, 28 -; GFX7-NEXT: s_and_b32 s20, s20, s0 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: v_mov_b32_e32 v7, s21 -; GFX7-NEXT: s_and_b32 s0, s1, s0 +; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000 +; GFX7-NEXT: s_bfe_i32 s10, s5, 0x40004 +; GFX7-NEXT: s_and_b32 s7, s7, s8 +; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40004 +; GFX7-NEXT: s_bfe_i32 s12, s5, 0x40008 +; GFX7-NEXT: s_and_b32 s10, s10, s8 +; GFX7-NEXT: s_and_b32 s6, s6, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40008 +; GFX7-NEXT: s_bfe_i32 s14, s5, 0x4000c +; GFX7-NEXT: s_and_b32 s12, s12, s8 +; GFX7-NEXT: s_and_b32 s9, s9, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: s_bfe_i32 s13, s4, 0x4000c +; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40010 +; GFX7-NEXT: s_and_b32 s14, s14, s8 +; GFX7-NEXT: s_and_b32 s11, s11, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-NEXT: s_bfe_i32 s15, s4, 0x40010 +; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40014 +; GFX7-NEXT: s_and_b32 s16, s16, s8 +; GFX7-NEXT: s_and_b32 s13, s13, s8 +; GFX7-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40018 +; GFX7-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX7-NEXT: s_and_b32 s18, s18, s8 +; GFX7-NEXT: s_and_b32 s15, s15, s8 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: s_bfe_i32 s19, s4, 0x40018 +; GFX7-NEXT: s_ashr_i32 s5, s5, 28 +; GFX7-NEXT: s_and_b32 s20, s20, s8 +; GFX7-NEXT: s_and_b32 s17, s17, s8 +; GFX7-NEXT: v_mov_b32_e32 v6, s18 +; GFX7-NEXT: s_ashr_i32 s4, s4, 28 +; GFX7-NEXT: s_and_b32 s19, s19, s8 +; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v7, s20 +; GFX7-NEXT: s_and_b32 s4, s4, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s14, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s16, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s18, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s20, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s13, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s15, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s17, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s19, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc16: @@ -327,41 +327,41 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX8-NEXT: s_bfe_i32 s6, s1, 0x40000 -; GFX8-NEXT: s_bfe_i32 s8, s1, 0x40004 -; GFX8-NEXT: s_bfe_i32 s10, s1, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v6, s6 +; GFX8-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX8-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX8-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX8-NEXT: s_bfe_i32 s9, s1, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v6, s5 ; GFX8-NEXT: s_lshr_b32 s2, s0, 12 -; GFX8-NEXT: s_lshr_b32 s4, s1, 12 -; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004 -; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 -; GFX8-NEXT: v_mov_b32_e32 v7, s8 +; GFX8-NEXT: 
s_lshr_b32 s3, s1, 12 +; GFX8-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_mov_b32_e32 v7, s7 ; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3 -; GFX8-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s3 +; GFX8-NEXT: v_mul_i32_i24_e32 v3, s8, v3 +; GFX8-NEXT: s_bfe_i32 s11, s1, 0x40010 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: s_bfe_i32 s14, s1, 0x40014 -; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: s_bfe_i32 s16, s1, 0x40018 -; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v9, s14 -; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX8-NEXT: s_bfe_i32 s13, s1, 0x40014 +; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: s_bfe_i32 s15, s1, 0x40018 +; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 +; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40018 ; GFX8-NEXT: s_ashr_i32 s1, s1, 28 -; GFX8-NEXT: v_mov_b32_e32 v10, s16 +; GFX8-NEXT: v_mov_b32_e32 v10, s15 ; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s4, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v7, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s10, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s12, v9, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s14, v10, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 @@ -378,41 +378,41 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX9-NEXT: s_bfe_i32 s6, s1, 0x40000 -; GFX9-NEXT: s_bfe_i32 s8, s1, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s1, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX9-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s1, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v6, s5 ; GFX9-NEXT: s_lshr_b32 s2, s0, 12 -; GFX9-NEXT: s_lshr_b32 s4, s1, 12 -; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40004 -; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-NEXT: s_lshr_b32 s3, s1, 12 +; GFX9-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3 -; GFX9-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s3 +; GFX9-NEXT: v_mul_i32_i24_e32 v3, s8, v3 +; GFX9-NEXT: s_bfe_i32 s11, s1, 0x40010 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-NEXT: s_bfe_i32 s14, s1, 0x40014 -; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: s_bfe_i32 s16, s1, 0x40018 -; GFX9-NEXT: 
s_bfe_i32 s13, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s1, 0x40014 +; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: s_bfe_i32 s15, s1, 0x40018 +; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40018 ; GFX9-NEXT: s_ashr_i32 s1, s1, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-NEXT: v_mov_b32_e32 v10, s15 ; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s4, v6, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s6, v7, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s10, v8, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s12, v9, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s14, v10, v2 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off @@ -429,41 +429,41 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s8, s1, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s1, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s7, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s5 ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 12 -; GFX9-DL-NEXT: s_lshr_b32 s4, s1, 12 -; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 +; GFX9-DL-NEXT: s_lshr_b32 s3, s1, 12 +; GFX9-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s7 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s2 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3 -; GFX9-DL-NEXT: s_bfe_i32 s12, s1, 0x40010 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s3 +; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s8, v3 +; GFX9-DL-NEXT: s_bfe_i32 s11, s1, 0x40010 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-DL-NEXT: s_bfe_i32 s14, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-DL-NEXT: s_bfe_i32 s16, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14 -; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s13, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: s_bfe_i32 s15, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s13 +; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40018 ; GFX9-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s16 +; GFX9-DL-NEXT: v_mov_b32_e32 v10, s15 ; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s4, v6, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v7, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s10, v8, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v9, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v10, v2 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off @@ -471,49 +471,49 @@ define amdgpu_kernel void @idot8_acc16(<8 x i4> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: idot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12 -; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 -; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3 +; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s8, s1, 0x40008 ; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 -; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff +; GFX10-DL-NEXT: s_mov_b32 s3, 0xffff ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s8, s9 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s7, s8 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s3, v4 +; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s7, s2, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s2, v2 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, v3, v4, v2 -; 
GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -589,66 +589,66 @@ entry: define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_movk_i32 s0, 0xff +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s8, 0xff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s2, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000 -; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000 -; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40004 -; GFX7-NEXT: s_and_b32 s9, s9, s0 -; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 -; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40008 -; GFX7-NEXT: s_and_b32 s11, s11, s0 -; GFX7-NEXT: s_and_b32 s8, s8, s0 -; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008 -; GFX7-NEXT: s_bfe_i32 s15, s2, 0x4000c -; GFX7-NEXT: s_and_b32 s13, s13, s0 -; GFX7-NEXT: s_and_b32 s10, s10, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c -; GFX7-NEXT: s_bfe_i32 s17, s2, 0x40010 -; GFX7-NEXT: s_and_b32 s15, s15, s0 -; GFX7-NEXT: s_and_b32 s12, s12, s0 -; GFX7-NEXT: v_mov_b32_e32 v3, s13 -; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010 -; GFX7-NEXT: s_bfe_i32 s19, s2, 0x40014 -; GFX7-NEXT: s_and_b32 s17, s17, s0 -; GFX7-NEXT: s_and_b32 s14, s14, s0 -; GFX7-NEXT: v_mov_b32_e32 v4, s15 -; GFX7-NEXT: s_bfe_i32 s21, s2, 0x40018 -; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014 -; GFX7-NEXT: s_and_b32 s19, s19, s0 -; GFX7-NEXT: s_and_b32 s16, s16, s0 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018 -; GFX7-NEXT: s_ashr_i32 s2, s2, 28 -; GFX7-NEXT: s_and_b32 s21, s21, s0 -; GFX7-NEXT: s_and_b32 s18, s18, s0 -; GFX7-NEXT: v_mov_b32_e32 v6, s19 -; GFX7-NEXT: s_ashr_i32 s1, s1, 28 -; GFX7-NEXT: s_and_b32 s20, s20, s0 -; GFX7-NEXT: s_and_b32 s2, s2, s0 -; GFX7-NEXT: v_mov_b32_e32 v7, s21 -; GFX7-NEXT: s_and_b32 s0, s1, s0 +; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000 +; GFX7-NEXT: s_bfe_i32 s10, s5, 0x40004 +; GFX7-NEXT: s_and_b32 s7, s7, s8 +; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40004 +; GFX7-NEXT: s_bfe_i32 s12, s5, 0x40008 +; GFX7-NEXT: s_and_b32 s10, s10, s8 +; GFX7-NEXT: s_and_b32 s6, s6, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40008 +; GFX7-NEXT: s_bfe_i32 s14, s5, 0x4000c +; GFX7-NEXT: s_and_b32 s12, s12, s8 +; GFX7-NEXT: s_and_b32 s9, s9, s8 +; GFX7-NEXT: v_mov_b32_e32 v2, s10 +; GFX7-NEXT: s_bfe_i32 s13, s4, 
0x4000c +; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40010 +; GFX7-NEXT: s_and_b32 s14, s14, s8 +; GFX7-NEXT: s_and_b32 s11, s11, s8 +; GFX7-NEXT: v_mov_b32_e32 v3, s12 +; GFX7-NEXT: s_bfe_i32 s15, s4, 0x40010 +; GFX7-NEXT: s_bfe_i32 s18, s5, 0x40014 +; GFX7-NEXT: s_and_b32 s16, s16, s8 +; GFX7-NEXT: s_and_b32 s13, s13, s8 +; GFX7-NEXT: v_mov_b32_e32 v4, s14 +; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40018 +; GFX7-NEXT: s_bfe_i32 s17, s4, 0x40014 +; GFX7-NEXT: s_and_b32 s18, s18, s8 +; GFX7-NEXT: s_and_b32 s15, s15, s8 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: s_bfe_i32 s19, s4, 0x40018 +; GFX7-NEXT: s_ashr_i32 s5, s5, 28 +; GFX7-NEXT: s_and_b32 s20, s20, s8 +; GFX7-NEXT: s_and_b32 s17, s17, s8 +; GFX7-NEXT: v_mov_b32_e32 v6, s18 +; GFX7-NEXT: s_ashr_i32 s4, s4, 28 +; GFX7-NEXT: s_and_b32 s19, s19, s8 +; GFX7-NEXT: s_and_b32 s5, s5, s8 +; GFX7-NEXT: v_mov_b32_e32 v7, s20 +; GFX7-NEXT: s_and_b32 s4, s4, s8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s14, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s16, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s18, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s20, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s2 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s13, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s15, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s17, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s19, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc8: @@ -657,50 +657,50 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_movk_i32 s2, 0xff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX8-NEXT: s_lshr_b32 s4, s6, 12 -; GFX8-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX8-NEXT: s_bfe_i32 s11, s6, 0x40008 +; GFX8-NEXT: s_bfe_i32 s6, s3, 0x40000 +; GFX8-NEXT: s_lshr_b32 s4, s3, 12 +; GFX8-NEXT: s_bfe_i32 s8, s3, 0x40004 +; GFX8-NEXT: s_bfe_i32 s10, s3, 0x40008 ; GFX8-NEXT: s_lshr_b32 s1, s0, 12 ; GFX8-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v6, s6 ; GFX8-NEXT: v_lshlrev_b16_e64 v4, 12, s1 ; GFX8-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX8-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX8-NEXT: s_bfe_i32 s10, s0, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v3, s11 -; GFX8-NEXT: v_mov_b32_e32 v7, s9 +; GFX8-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX8-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mov_b32_e32 v7, s8 ; GFX8-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX8-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX8-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX8-NEXT: s_bfe_i32 s13, s6, 0x40010 +; GFX8-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX8-NEXT: s_bfe_i32 s12, s3, 0x40010 ; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX8-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX8-NEXT: s_bfe_i32 s15, s6, 0x40014 
-; GFX8-NEXT: s_bfe_i32 s12, s0, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v8, s13 -; GFX8-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX8-NEXT: s_bfe_i32 s14, s0, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v9, s15 -; GFX8-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX8-NEXT: s_ashr_i32 s6, s6, 28 -; GFX8-NEXT: v_mov_b32_e32 v10, s17 +; GFX8-NEXT: s_bfe_i32 s14, s3, 0x40014 +; GFX8-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v8, s12 +; GFX8-NEXT: s_bfe_i32 s16, s3, 0x40018 +; GFX8-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v9, s14 +; GFX8-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX8-NEXT: s_ashr_i32 s3, s3, 28 +; GFX8-NEXT: v_mov_b32_e32 v10, s16 ; GFX8-NEXT: s_ashr_i32 s0, s0, 28 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v2, s5, v6, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s7, v7, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX8-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v8, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s14, v9, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 +; GFX8-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -711,50 +711,50 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_movk_i32 s2, 0xff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX9-NEXT: s_lshr_b32 s4, s6, 12 -; GFX9-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX9-NEXT: s_bfe_i32 s11, s6, 0x40008 +; GFX9-NEXT: s_bfe_i32 s6, s3, 0x40000 +; GFX9-NEXT: s_lshr_b32 s4, s3, 12 +; GFX9-NEXT: s_bfe_i32 s8, s3, 0x40004 +; GFX9-NEXT: s_bfe_i32 s10, s3, 0x40008 ; GFX9-NEXT: s_lshr_b32 s1, s0, 12 ; GFX9-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 ; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s0, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mov_b32_e32 v7, s8 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX9-NEXT: s_bfe_i32 s13, s6, 0x40010 +; GFX9-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX9-NEXT: s_bfe_i32 s12, s3, 0x40010 ; GFX9-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-NEXT: s_bfe_i32 s15, s6, 0x40014 -; GFX9-NEXT: s_bfe_i32 s12, s0, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX9-NEXT: s_bfe_i32 s14, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v9, s15 -; GFX9-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX9-NEXT: s_ashr_i32 s6, s6, 28 -; GFX9-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-NEXT: s_bfe_i32 s14, s3, 0x40014 +; GFX9-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX9-NEXT: 
v_mov_b32_e32 v8, s12 +; GFX9-NEXT: s_bfe_i32 s16, s3, 0x40018 +; GFX9-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: v_mov_b32_e32 v10, s16 ; GFX9-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_i32_i24 v2, s5, v6, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s7, v7, v2 ; GFX9-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s12, v8, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s14, v9, v2 -; GFX9-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX9-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -765,99 +765,99 @@ define amdgpu_kernel void @idot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_movk_i32 s2, 0xff ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s7, s6, 0x40000 -; GFX9-DL-NEXT: s_lshr_b32 s4, s6, 12 -; GFX9-DL-NEXT: s_bfe_i32 s9, s6, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s11, s6, 0x40008 +; GFX9-DL-NEXT: s_bfe_i32 s6, s3, 0x40000 +; GFX9-DL-NEXT: s_lshr_b32 s4, s3, 12 +; GFX9-DL-NEXT: s_bfe_i32 s8, s3, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s10, s3, 0x40008 ; GFX9-DL-NEXT: s_lshr_b32 s1, s0, 12 ; GFX9-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s6 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s4 -; GFX9-DL-NEXT: s_bfe_i32 s8, s0, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s0, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s9 +; GFX9-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s0, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s8 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v5, 12, v5 -; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s10, v3 -; GFX9-DL-NEXT: s_bfe_i32 s13, s6, 0x40010 +; GFX9-DL-NEXT: v_mul_i32_i24_e32 v3, s9, v3 +; GFX9-DL-NEXT: s_bfe_i32 s12, s3, 0x40010 ; GFX9-DL-NEXT: v_and_b32_e32 v4, s2, v4 ; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 -; GFX9-DL-NEXT: s_bfe_i32 s15, s6, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s12, s0, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s13 -; GFX9-DL-NEXT: s_bfe_i32 s17, s6, 0x40018 -; GFX9-DL-NEXT: s_bfe_i32 s14, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s15 -; GFX9-DL-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX9-DL-NEXT: s_ashr_i32 s6, s6, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v10, s17 +; GFX9-DL-NEXT: s_bfe_i32 s14, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s11, s0, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-DL-NEXT: s_bfe_i32 s16, s3, 0x40018 +; GFX9-DL-NEXT: s_bfe_i32 s13, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s14 +; GFX9-DL-NEXT: s_bfe_i32 s15, s0, 0x40018 +; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: 
v_mov_b32_e32 v10, s16 ; GFX9-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s5, v6, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s7, v7, v2 ; GFX9-DL-NEXT: v_add_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, v4, v5, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s12, v8, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s14, v9, v2 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s16, v10, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s6 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s11, v8, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s13, v9, v2 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s15, v10, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-DL-NEXT: v_mad_i32_i24 v2, s0, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: idot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_lshr_b32 s2, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 12 -; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40004 +; GFX10-DL-NEXT: s_lshr_b32 s3, s1, 12 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x40004 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s2 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s4 -; GFX10-DL-NEXT: s_bfe_i32 s8, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s9, s1, 0x40008 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s3 +; GFX10-DL-NEXT: s_bfe_i32 s7, s0, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s8, s1, 0x40008 ; GFX10-DL-NEXT: s_bfe_i32 s2, s1, 0x40004 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 -; GFX10-DL-NEXT: s_movk_i32 s4, 0xff +; GFX10-DL-NEXT: s_movk_i32 s3, 0xff ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s8, s9 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3 -; GFX10-DL-NEXT: v_and_b32_e32 v4, s4, v4 -; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40010 +; GFX10-DL-NEXT: v_mul_i32_i24_e64 v5, s7, s8 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s3, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v4, s3, v4 +; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40010 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_i32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s7, s2, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s6, s2, v2 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40010 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, 
v3, v4, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_i32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 28 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_i32_i24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -934,51 +934,51 @@ entry: define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_multiuses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s2, s0, 0x40000 -; GFX7-NEXT: s_bfe_i32 s8, s1, 0x40000 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mad_i32_i24 v1, s2, v0, v1 -; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40004 -; GFX7-NEXT: s_bfe_i32 s9, s0, 0x40004 -; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40008 -; GFX7-NEXT: v_mad_i32_i24 v0, s2, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s10 -; GFX7-NEXT: v_mad_i32_i24 v0, s9, v2, v0 -; GFX7-NEXT: s_bfe_i32 s11, s0, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v2, s12 -; GFX7-NEXT: s_bfe_i32 s14, s1, 0x4000c -; GFX7-NEXT: v_mad_i32_i24 v0, s11, v2, v0 -; GFX7-NEXT: s_bfe_i32 s13, s0, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v2, s14 -; GFX7-NEXT: s_bfe_i32 s16, s1, 0x40010 -; GFX7-NEXT: v_mad_i32_i24 v0, s13, v2, v0 -; GFX7-NEXT: s_bfe_i32 s15, s0, 0x40010 -; GFX7-NEXT: v_mov_b32_e32 v2, s16 -; GFX7-NEXT: s_bfe_i32 s18, s1, 0x40014 -; GFX7-NEXT: s_bfe_i32 s20, s1, 0x40018 -; GFX7-NEXT: v_mad_i32_i24 v0, s15, v2, v0 -; GFX7-NEXT: s_bfe_i32 s17, s0, 0x40014 -; GFX7-NEXT: v_mov_b32_e32 v2, s18 -; GFX7-NEXT: s_bfe_i32 s19, s0, 0x40018 -; GFX7-NEXT: v_mad_i32_i24 v0, s17, v2, v0 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: s_ashr_i32 s1, s1, 28 -; GFX7-NEXT: v_mad_i32_i24 v0, s19, v2, v0 -; GFX7-NEXT: s_ashr_i32 s0, s0, 28 -; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: v_mad_i32_i24 v0, s0, v2, v0 +; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX7-NEXT: s_bfe_i32 s7, s5, 0x40000 +; GFX7-NEXT: v_mov_b32_e32 v0, s7 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: v_mad_i32_i24 v1, s6, v0, v1 +; GFX7-NEXT: s_bfe_i32 s9, s5, 0x40004 +; GFX7-NEXT: s_bfe_i32 s8, s4, 0x40004 +; GFX7-NEXT: s_bfe_i32 s11, s5, 0x40008 +; GFX7-NEXT: v_mad_i32_i24 v0, s6, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s9 +; GFX7-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v2, s11 +; GFX7-NEXT: s_bfe_i32 s13, s5, 0x4000c +; GFX7-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX7-NEXT: s_bfe_i32 s12, s4, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v2, s13 +; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40010 +; GFX7-NEXT: v_mad_i32_i24 
v0, s12, v2, v0 +; GFX7-NEXT: s_bfe_i32 s14, s4, 0x40010 +; GFX7-NEXT: v_mov_b32_e32 v2, s15 +; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40014 +; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40018 +; GFX7-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX7-NEXT: s_bfe_i32 s16, s4, 0x40014 +; GFX7-NEXT: v_mov_b32_e32 v2, s17 +; GFX7-NEXT: s_bfe_i32 s18, s4, 0x40018 +; GFX7-NEXT: v_mad_i32_i24 v0, s16, v2, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: s_ashr_i32 s5, s5, 28 +; GFX7-NEXT: v_mad_i32_i24 v0, s18, v2, v0 +; GFX7-NEXT: s_ashr_i32 s4, s4, 28 +; GFX7-NEXT: v_mov_b32_e32 v2, s5 +; GFX7-NEXT: v_mad_i32_i24 v0, s4, v2, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_multiuses_mul1: @@ -987,42 +987,42 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX8-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: v_mad_i32_i24 v1, s5, v0, v1 -; GFX8-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX8-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX8-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX8-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s8 -; GFX8-NEXT: v_mad_i32_i24 v0, s7, v2, v0 -; GFX8-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX8-NEXT: v_mov_b32_e32 v2, s10 -; GFX8-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX8-NEXT: v_mad_i32_i24 v0, s9, v2, v0 -; GFX8-NEXT: s_bfe_i32 s11, s2, 0x4000c -; GFX8-NEXT: v_mov_b32_e32 v2, s12 -; GFX8-NEXT: s_bfe_i32 s14, s4, 0x40010 -; GFX8-NEXT: v_mad_i32_i24 v0, s11, v2, v0 -; GFX8-NEXT: s_bfe_i32 s13, s2, 0x40010 -; GFX8-NEXT: v_mov_b32_e32 v2, s14 -; GFX8-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX8-NEXT: s_bfe_i32 s18, s4, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v0, s13, v2, v0 -; GFX8-NEXT: s_bfe_i32 s15, s2, 0x40014 -; GFX8-NEXT: v_mov_b32_e32 v2, s16 -; GFX8-NEXT: s_bfe_i32 s17, s2, 0x40018 -; GFX8-NEXT: v_mad_i32_i24 v0, s15, v2, v0 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: s_ashr_i32 s4, s4, 28 -; GFX8-NEXT: v_mad_i32_i24 v0, s17, v2, v0 +; GFX8-NEXT: s_bfe_i32 s4, s2, 0x40000 +; GFX8-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX8-NEXT: v_mov_b32_e32 v0, s5 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: v_mad_i32_i24 v1, s4, v0, v1 +; GFX8-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX8-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX8-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mad_i32_i24 v0, s6, v2, v0 +; GFX8-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX8-NEXT: v_mov_b32_e32 v2, s9 +; GFX8-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX8-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX8-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX8-NEXT: v_mov_b32_e32 v2, s11 +; GFX8-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX8-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX8-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX8-NEXT: v_mov_b32_e32 v2, s13 +; GFX8-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX8-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX8-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX8-NEXT: v_mov_b32_e32 v2, s15 +; GFX8-NEXT: s_bfe_i32 s16, 
s2, 0x40018 +; GFX8-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: s_ashr_i32 s3, s3, 28 +; GFX8-NEXT: v_mad_i32_i24 v0, s16, v2, v0 ; GFX8-NEXT: s_ashr_i32 s2, s2, 28 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v2, s3 ; GFX8-NEXT: v_mad_i32_i24 v0, s2, v2, v0 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1036,42 +1036,42 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX9-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mad_i32_i24 v1, s5, v0, v1 -; GFX9-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX9-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX9-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX9-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-NEXT: v_mad_i32_i24 v0, s7, v2, v0 -; GFX9-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX9-NEXT: v_mad_i32_i24 v0, s9, v2, v0 -; GFX9-NEXT: s_bfe_i32 s11, s2, 0x4000c -; GFX9-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-NEXT: s_bfe_i32 s14, s4, 0x40010 -; GFX9-NEXT: v_mad_i32_i24 v0, s11, v2, v0 -; GFX9-NEXT: s_bfe_i32 s13, s2, 0x40010 -; GFX9-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX9-NEXT: s_bfe_i32 s18, s4, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v0, s13, v2, v0 -; GFX9-NEXT: s_bfe_i32 s15, s2, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-NEXT: s_bfe_i32 s17, s2, 0x40018 -; GFX9-NEXT: v_mad_i32_i24 v0, s15, v2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-NEXT: v_mad_i32_i24 v0, s17, v2, v0 +; GFX9-NEXT: s_bfe_i32 s4, s2, 0x40000 +; GFX9-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX9-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-NEXT: v_mad_i32_i24 v1, s4, v0, v1 +; GFX9-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX9-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX9-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mad_i32_i24 v0, s6, v2, v0 +; GFX9-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX9-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX9-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX9-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX9-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-NEXT: v_mad_i32_i24 v0, s16, v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_mad_i32_i24 v0, s2, v2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 @@ -1085,42 +1085,42 @@ 
define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_i32 s5, s2, 0x40000 -; GFX9-DL-NEXT: s_bfe_i32 s6, s4, 0x40000 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-DL-NEXT: v_mad_i32_i24 v1, s5, v0, v1 -; GFX9-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s7, s2, 0x40004 -; GFX9-DL-NEXT: s_bfe_i32 s10, s4, 0x40008 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s5, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s7, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s9, s2, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s10 -; GFX9-DL-NEXT: s_bfe_i32 s12, s4, 0x4000c -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s9, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s11, s2, 0x4000c -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 -; GFX9-DL-NEXT: s_bfe_i32 s14, s4, 0x40010 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s11, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s13, s2, 0x40010 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s14 -; GFX9-DL-NEXT: s_bfe_i32 s16, s4, 0x40014 -; GFX9-DL-NEXT: s_bfe_i32 s18, s4, 0x40018 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s13, v2, v0 -; GFX9-DL-NEXT: s_bfe_i32 s15, s2, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s16 -; GFX9-DL-NEXT: s_bfe_i32 s17, s2, 0x40018 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s15, v2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s17, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s4, s2, 0x40000 +; GFX9-DL-NEXT: s_bfe_i32 s5, s3, 0x40000 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, s4, v0, v1 +; GFX9-DL-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s6, s2, 0x40004 +; GFX9-DL-NEXT: s_bfe_i32 s9, s3, 0x40008 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s6, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s8, s2, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s9 +; GFX9-DL-NEXT: s_bfe_i32 s11, s3, 0x4000c +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s10, s2, 0x4000c +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s11 +; GFX9-DL-NEXT: s_bfe_i32 s13, s3, 0x40010 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s12, s2, 0x40010 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13 +; GFX9-DL-NEXT: s_bfe_i32 s15, s3, 0x40014 +; GFX9-DL-NEXT: s_bfe_i32 s17, s3, 0x40018 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v2, v0 +; GFX9-DL-NEXT: s_bfe_i32 s14, s2, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s15 +; GFX9-DL-NEXT: s_bfe_i32 s16, s2, 0x40018 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v2, v0 ; GFX9-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s2, v2, v0 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 @@ -1135,35 +1135,35 @@ define amdgpu_kernel void @idot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s2, 
s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40000 -; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40000 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40004 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v0 -; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40008 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 -; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x4000c -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 -; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40010 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 -; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_i32 s8, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 -; GFX10-DL-NEXT: s_bfe_i32 s6, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_i32 s7, s4, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40000 +; GFX10-DL-NEXT: s_bfe_i32 s6, s3, 0x40000 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x40004 +; GFX10-DL-NEXT: s_bfe_i32 s7, s3, 0x40004 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s6, v0 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_i32 s6, s3, 0x40008 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s7, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_i32 s7, s3, 0x4000c +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s6, v1 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40010 +; GFX10-DL-NEXT: s_bfe_i32 s6, s3, 0x40010 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s7, v1 +; GFX10-DL-NEXT: s_bfe_i32 s4, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_i32 s7, s3, 0x40014 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s6, v1 +; GFX10-DL-NEXT: s_bfe_i32 s5, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_i32 s6, s3, 0x40018 ; GFX10-DL-NEXT: s_ashr_i32 s2, s2, 28 -; GFX10-DL-NEXT: s_ashr_i32 s4, s4, 28 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s8, v1 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s6, s7, v1 -; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s4, v1 +; GFX10-DL-NEXT: s_ashr_i32 s3, s3, 28 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s4, s7, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s5, s6, v1 +; GFX10-DL-NEXT: v_mad_i32_i24 v1, s2, s3, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -1243,64 +1243,64 @@ entry: define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s9, s[10:11], 0x0 +; GFX7-NEXT: s_load_dword s5, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s7, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s11, s1, 4 -; GFX7-NEXT: s_ashr_i64 s[16:17], s[10:11], 60 -; GFX7-NEXT: s_lshl_b32 s11, s1, 16 -; GFX7-NEXT: 
s_ashr_i64 s[18:19], s[10:11], 60 -; GFX7-NEXT: s_lshl_b32 s11, s1, 20 -; GFX7-NEXT: s_lshl_b32 s13, s1, 8 -; GFX7-NEXT: s_lshl_b32 s15, s1, 12 -; GFX7-NEXT: s_ashr_i64 s[20:21], s[10:11], 60 -; GFX7-NEXT: s_lshl_b32 s11, s1, 24 -; GFX7-NEXT: s_lshl_b32 s1, s1, 28 -; GFX7-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s9, 4 -; GFX7-NEXT: s_ashr_i64 s[26:27], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s9, 8 -; GFX7-NEXT: s_ashr_i64 s[28:29], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s9, 12 -; GFX7-NEXT: s_ashr_i64 s[30:31], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s9, 16 -; GFX7-NEXT: s_ashr_i64 s[32:33], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s9, 20 -; GFX7-NEXT: s_ashr_i64 s[34:35], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s9, 24 -; GFX7-NEXT: s_ashr_i64 s[36:37], s[0:1], 60 -; GFX7-NEXT: s_lshl_b32 s1, s9, 28 -; GFX7-NEXT: s_ashr_i64 s[24:25], s[8:9], 60 -; GFX7-NEXT: s_ashr_i64 s[8:9], s[0:1], 60 -; GFX7-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: s_ashr_i64 s[22:23], s[10:11], 60 -; GFX7-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX7-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s9, s5, 4 +; GFX7-NEXT: s_ashr_i64 s[14:15], s[8:9], 60 +; GFX7-NEXT: s_lshl_b32 s9, s5, 16 +; GFX7-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 +; GFX7-NEXT: s_lshl_b32 s9, s5, 20 +; GFX7-NEXT: s_lshl_b32 s11, s5, 8 +; GFX7-NEXT: s_lshl_b32 s13, s5, 12 +; GFX7-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 +; GFX7-NEXT: s_lshl_b32 s9, s5, 24 +; GFX7-NEXT: s_lshl_b32 s5, s5, 28 +; GFX7-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s5, s7, 4 +; GFX7-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s5, s7, 8 +; GFX7-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s5, s7, 12 +; GFX7-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s5, s7, 16 +; GFX7-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s5, s7, 20 +; GFX7-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s5, s7, 24 +; GFX7-NEXT: s_ashr_i64 s[36:37], s[4:5], 60 +; GFX7-NEXT: s_lshl_b32 s5, s7, 28 +; GFX7-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 +; GFX7-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 +; GFX7-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 ; GFX7-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX7-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mad_i32_i24 v0, s0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mad_i32_i24 v0, s4, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s36 -; GFX7-NEXT: v_mad_i32_i24 v0, s22, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s34 ; GFX7-NEXT: v_mad_i32_i24 v0, s20, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s32 +; GFX7-NEXT: v_mov_b32_e32 v1, s34 ; GFX7-NEXT: v_mad_i32_i24 v0, s18, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s30 -; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s28 ; GFX7-NEXT: v_mad_i32_i24 v0, s12, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s26 -; GFX7-NEXT: v_mad_i32_i24 v0, s16, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s24 ; GFX7-NEXT: v_mad_i32_i24 v0, s10, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_mov_b32_e32 v1, s24 +; GFX7-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s22 +; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc32_vecMul: 
@@ -1308,57 +1308,58 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s5, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0 -; GFX8-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX8-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s9, s5, 4 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s5, 20 -; GFX8-NEXT: s_lshl_b32 s11, s5, 8 -; GFX8-NEXT: s_lshl_b32 s13, s5, 12 -; GFX8-NEXT: s_lshl_b32 s15, s5, 16 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 -; GFX8-NEXT: s_lshl_b32 s9, s5, 24 -; GFX8-NEXT: s_lshl_b32 s5, s5, 28 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 4 -; GFX8-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 8 -; GFX8-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 12 -; GFX8-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 16 -; GFX8-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 20 -; GFX8-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 24 -; GFX8-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s5, s7, 28 -; GFX8-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 -; GFX8-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX8-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s7, s3, 4 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[6:7], 60 +; GFX8-NEXT: s_lshl_b32 s7, s3, 20 +; GFX8-NEXT: s_lshl_b32 s9, s3, 8 +; GFX8-NEXT: s_lshl_b32 s11, s3, 12 +; GFX8-NEXT: s_lshl_b32 s13, s3, 16 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[6:7], 60 +; GFX8-NEXT: s_lshl_b32 s7, s3, 24 +; GFX8-NEXT: s_lshl_b32 s3, s3, 28 +; GFX8-NEXT: s_ashr_i64 s[2:3], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s3, s5, 4 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s3, s5, 8 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s3, s5, 12 +; GFX8-NEXT: s_ashr_i64 s[26:27], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s3, s5, 16 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s3, s5, 20 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s3, s5, 24 +; GFX8-NEXT: s_ashr_i64 s[34:35], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s3, s5, 28 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[4:5], 60 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[2:3], 60 +; GFX8-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[6:7], 60 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mad_i32_i24 v0, s2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s34 -; GFX8-NEXT: v_mad_i32_i24 v0, s20, v1, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, s32 ; GFX8-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 ; GFX8-NEXT: v_mov_b32_e32 v1, s30 -; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s28 ; GFX8-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX8-NEXT: s_ashr_i64 
s[10:11], s[10:11], 60 ; GFX8-NEXT: v_mov_b32_e32 v1, s26 ; GFX8-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 ; GFX8-NEXT: v_mov_b32_e32 v1, s24 -; GFX8-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, s8, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s22 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX8-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s20 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1369,57 +1370,58 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_load_dword s5, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s7, s[6:7], 0x0 -; GFX9-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s9, s5, 4 -; GFX9-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s5, 20 -; GFX9-NEXT: s_lshl_b32 s11, s5, 8 -; GFX9-NEXT: s_lshl_b32 s13, s5, 12 -; GFX9-NEXT: s_lshl_b32 s15, s5, 16 -; GFX9-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 -; GFX9-NEXT: s_lshl_b32 s9, s5, 24 -; GFX9-NEXT: s_lshl_b32 s5, s5, 28 -; GFX9-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 4 -; GFX9-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 8 -; GFX9-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 12 -; GFX9-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 16 -; GFX9-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 20 -; GFX9-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 24 -; GFX9-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 -; GFX9-NEXT: s_lshl_b32 s5, s7, 28 -; GFX9-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 -; GFX9-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX9-NEXT: s_ashr_i64 s[6:7], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s7, s3, 4 +; GFX9-NEXT: s_ashr_i64 s[14:15], s[6:7], 60 +; GFX9-NEXT: s_lshl_b32 s7, s3, 20 +; GFX9-NEXT: s_lshl_b32 s9, s3, 8 +; GFX9-NEXT: s_lshl_b32 s11, s3, 12 +; GFX9-NEXT: s_lshl_b32 s13, s3, 16 +; GFX9-NEXT: s_ashr_i64 s[16:17], s[6:7], 60 +; GFX9-NEXT: s_lshl_b32 s7, s3, 24 +; GFX9-NEXT: s_lshl_b32 s3, s3, 28 +; GFX9-NEXT: s_ashr_i64 s[2:3], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s3, s5, 4 +; GFX9-NEXT: s_ashr_i64 s[22:23], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s3, s5, 8 +; GFX9-NEXT: s_ashr_i64 s[24:25], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s3, s5, 12 +; GFX9-NEXT: s_ashr_i64 s[26:27], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s3, s5, 16 +; GFX9-NEXT: s_ashr_i64 s[28:29], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s3, s5, 20 +; GFX9-NEXT: s_ashr_i64 s[30:31], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s3, s5, 24 +; GFX9-NEXT: s_ashr_i64 s[34:35], s[2:3], 60 +; GFX9-NEXT: s_lshl_b32 s3, s5, 28 +; GFX9-NEXT: s_ashr_i64 s[20:21], s[4:5], 60 +; GFX9-NEXT: s_ashr_i64 s[4:5], s[2:3], 60 +; GFX9-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: s_ashr_i64 s[18:19], s[6:7], 60 +; GFX9-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_mad_i32_i24 v0, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-NEXT: v_mad_i32_i24 v0, s20, v1, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, s32 ; GFX9-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 ; GFX9-NEXT: v_mov_b32_e32 v1, s30 -; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s28 ; GFX9-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX9-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 ; GFX9-NEXT: v_mov_b32_e32 v1, s26 ; GFX9-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 ; GFX9-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX9-NEXT: v_mad_i32_i24 v0, s8, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX9-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-NEXT: v_mad_i32_i24 v2, s6, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -1430,57 +1432,58 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s5, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s7, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 4 -; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 20 -; GFX9-DL-NEXT: s_lshl_b32 s11, s5, 8 -; GFX9-DL-NEXT: s_lshl_b32 s13, s5, 12 -; GFX9-DL-NEXT: s_lshl_b32 s15, s5, 16 -; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[8:9], 60 -; GFX9-DL-NEXT: s_lshl_b32 s9, s5, 24 -; GFX9-DL-NEXT: s_lshl_b32 s5, s5, 28 -; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 4 -; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 8 -; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 12 -; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 16 -; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 20 -; GFX9-DL-NEXT: s_ashr_i64 s[32:33], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 24 -; GFX9-DL-NEXT: s_ashr_i64 s[34:35], s[4:5], 60 -; GFX9-DL-NEXT: s_lshl_b32 s5, s7, 28 -; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[6:7], 60 -; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[4:5], 60 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s4, v0, v1 -; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[8:9], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[6:7], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s7, s3, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[6:7], 60 +; GFX9-DL-NEXT: s_lshl_b32 s7, s3, 20 +; GFX9-DL-NEXT: s_lshl_b32 s9, s3, 8 +; GFX9-DL-NEXT: s_lshl_b32 s11, s3, 12 +; GFX9-DL-NEXT: s_lshl_b32 s13, s3, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[16:17], s[6:7], 60 +; GFX9-DL-NEXT: s_lshl_b32 s7, s3, 24 +; GFX9-DL-NEXT: s_lshl_b32 s3, s3, 28 +; GFX9-DL-NEXT: s_ashr_i64 s[2:3], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s3, s5, 4 +; GFX9-DL-NEXT: s_ashr_i64 s[22:23], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s3, 
s5, 8 +; GFX9-DL-NEXT: s_ashr_i64 s[24:25], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s3, s5, 12 +; GFX9-DL-NEXT: s_ashr_i64 s[26:27], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s3, s5, 16 +; GFX9-DL-NEXT: s_ashr_i64 s[28:29], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s3, s5, 20 +; GFX9-DL-NEXT: s_ashr_i64 s[30:31], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s3, s5, 24 +; GFX9-DL-NEXT: s_ashr_i64 s[34:35], s[2:3], 60 +; GFX9-DL-NEXT: s_lshl_b32 s3, s5, 28 +; GFX9-DL-NEXT: s_ashr_i64 s[20:21], s[4:5], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[4:5], s[2:3], 60 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: s_ashr_i64 s[18:19], s[6:7], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s2, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s34 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s20, v1, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s32 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s18, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s30 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s28 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s12, v1, v0 -; GFX9-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s26 ; GFX9-DL-NEXT: v_mad_i32_i24 v0, s10, v1, v0 +; GFX9-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s24 -; GFX9-DL-NEXT: v_mad_i32_i24 v0, s16, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s8, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s22 -; GFX9-DL-NEXT: v_mad_i32_i24 v2, s8, v1, v0 +; GFX9-DL-NEXT: v_mad_i32_i24 v0, s14, v1, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-DL-NEXT: v_mad_i32_i24 v2, s6, v1, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: global_store_dword v[0:1], v2, off @@ -1492,49 +1495,49 @@ define amdgpu_kernel void @idot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s5, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s7, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX10-DL-NEXT: s_load_dword s2, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-DL-NEXT: s_lshl_b32 s7, s3, 28 ; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 28 -; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 28 +; GFX10-DL-NEXT: s_lshl_b32 s11, s3, 24 ; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 24 -; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 24 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX10-DL-NEXT: s_lshl_b32 s7, s3, 20 ; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 20 -; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 20 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s8, v0 +; GFX10-DL-NEXT: s_lshl_b32 s11, s3, 16 ; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 16 -; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 16 +; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s10, s12, v0 
; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX10-DL-NEXT: s_lshl_b32 s7, s3, 12 ; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 12 -; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 12 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s8, v0 +; GFX10-DL-NEXT: s_lshl_b32 s11, s3, 8 ; GFX10-DL-NEXT: s_lshl_b32 s13, s5, 8 -; GFX10-DL-NEXT: s_lshl_b32 s15, s7, 8 +; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s10, s12, v0 +; GFX10-DL-NEXT: s_lshl_b32 s7, s3, 4 ; GFX10-DL-NEXT: s_lshl_b32 s9, s5, 4 -; GFX10-DL-NEXT: s_lshl_b32 s11, s7, 4 +; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s8, v0 +; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX10-DL-NEXT: s_ashr_i64 s[2:3], s[2:3], 60 ; GFX10-DL-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX10-DL-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s12, s14, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s8, s10, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v2, s4, s6, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s10, s12, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s8, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v2, s2, s4, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off @@ -1576,62 +1579,62 @@ entry: define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_mov_b32 s2, 0xffff +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_mov_b32 s8, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s0, s[10:11], 0x0 -; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s1, s[8:9], 0x0 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s16, s0, 0x40018 -; GFX7-NEXT: s_bfe_i32 s17, s0, 0x40014 -; GFX7-NEXT: s_bfe_i32 s18, s0, 0x40010 -; GFX7-NEXT: s_bfe_i32 s19, s0, 0x40000 -; GFX7-NEXT: s_bfe_i32 s20, s0, 0x40004 -; GFX7-NEXT: s_bfe_i32 s21, s0, 0x40008 -; GFX7-NEXT: s_ashr_i32 s15, s0, 28 -; GFX7-NEXT: s_bfe_i32 s0, s0, 0x4000c -; GFX7-NEXT: s_ashr_i32 s8, s1, 28 -; GFX7-NEXT: s_bfe_i32 s9, s1, 0x40018 -; GFX7-NEXT: s_bfe_i32 s10, s1, 0x40014 -; GFX7-NEXT: s_bfe_i32 s11, s1, 0x40010 -; GFX7-NEXT: s_bfe_i32 s12, s1, 0x40000 -; GFX7-NEXT: v_mov_b32_e32 v4, s19 -; GFX7-NEXT: s_bfe_i32 s13, s1, 0x40004 -; GFX7-NEXT: v_mov_b32_e32 v3, s20 -; GFX7-NEXT: s_bfe_i32 s14, s1, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v2, s21 -; GFX7-NEXT: s_bfe_i32 s1, s1, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v1, s0 
-; GFX7-NEXT: v_mul_i32_i24_e32 v1, s1, v1 -; GFX7-NEXT: v_mul_i32_i24_e32 v2, s14, v2 -; GFX7-NEXT: v_mul_i32_i24_e32 v3, s13, v3 -; GFX7-NEXT: v_mul_i32_i24_e32 v4, s12, v4 +; GFX7-NEXT: s_bfe_i32 s15, s6, 0x40018 +; GFX7-NEXT: s_bfe_i32 s16, s6, 0x40014 +; GFX7-NEXT: s_bfe_i32 s17, s6, 0x40010 +; GFX7-NEXT: s_bfe_i32 s18, s6, 0x40000 +; GFX7-NEXT: s_bfe_i32 s19, s6, 0x40004 +; GFX7-NEXT: s_bfe_i32 s20, s6, 0x40008 +; GFX7-NEXT: s_ashr_i32 s14, s6, 28 +; GFX7-NEXT: s_bfe_i32 s6, s6, 0x4000c +; GFX7-NEXT: s_ashr_i32 s5, s4, 28 +; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40018 +; GFX7-NEXT: s_bfe_i32 s9, s4, 0x40014 +; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40010 +; GFX7-NEXT: s_bfe_i32 s11, s4, 0x40000 +; GFX7-NEXT: v_mov_b32_e32 v4, s18 +; GFX7-NEXT: s_bfe_i32 s12, s4, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v3, s19 +; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v2, s20 +; GFX7-NEXT: s_bfe_i32 s4, s4, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v1, s6 +; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1 +; GFX7-NEXT: v_mul_i32_i24_e32 v2, s13, v2 +; GFX7-NEXT: v_mul_i32_i24_e32 v3, s12, v3 +; GFX7-NEXT: v_mul_i32_i24_e32 v4, s11, v4 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s2, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX7-NEXT: v_and_b32_e32 v4, s8, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_or_b32_e32 v2, v4, v3 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s18 -; GFX7-NEXT: v_mov_b32_e32 v6, s17 -; GFX7-NEXT: v_mov_b32_e32 v7, s16 +; GFX7-NEXT: v_mov_b32_e32 v5, s17 +; GFX7-NEXT: v_mov_b32_e32 v6, s16 +; GFX7-NEXT: v_mov_b32_e32 v7, s15 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, s11, v5, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, s10, v6, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, s9, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-NEXT: v_mad_i32_i24 v0, s8, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_i32_i24 v0, s10, v5, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s9, v6, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s7, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: v_mad_i32_i24 v0, s5, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc16_vecMul: @@ -1639,59 +1642,59 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_load_dword s7, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ushort v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b32 s29, s7, 28 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[6:7], 60 -; GFX8-NEXT: s_lshl_b32 s21, s7, 8 -; GFX8-NEXT: s_lshl_b32 s23, s7, 12 -; GFX8-NEXT: s_lshl_b32 s17, s1, 28 -; GFX8-NEXT: s_lshl_b32 s25, s7, 16 -; GFX8-NEXT: s_lshl_b32 s27, s7, 24 -; GFX8-NEXT: s_lshl_b32 s19, s7, 4 -; GFX8-NEXT: s_lshl_b32 s7, s7, 20 +; GFX8-NEXT: s_lshl_b32 s27, s3, 28 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s19, s3, 8 +; GFX8-NEXT: s_lshl_b32 s21, s3, 12 +; GFX8-NEXT: 
s_lshl_b32 s15, s1, 28 +; GFX8-NEXT: s_lshl_b32 s23, s3, 16 +; GFX8-NEXT: s_lshl_b32 s25, s3, 24 +; GFX8-NEXT: s_lshl_b32 s17, s3, 4 +; GFX8-NEXT: s_lshl_b32 s3, s3, 20 ; GFX8-NEXT: s_ashr_i64 s[4:5], s[0:1], 60 -; GFX8-NEXT: s_ashr_i64 s[28:29], s[28:29], 60 -; GFX8-NEXT: s_lshl_b32 s9, s1, 8 -; GFX8-NEXT: s_lshl_b32 s11, s1, 12 -; GFX8-NEXT: s_lshl_b32 s13, s1, 16 -; GFX8-NEXT: s_lshl_b32 s15, s1, 24 +; GFX8-NEXT: s_ashr_i64 s[26:27], s[26:27], 60 +; GFX8-NEXT: s_lshl_b32 s7, s1, 8 +; GFX8-NEXT: s_lshl_b32 s9, s1, 12 +; GFX8-NEXT: s_lshl_b32 s11, s1, 16 +; GFX8-NEXT: s_lshl_b32 s13, s1, 24 ; GFX8-NEXT: s_lshl_b32 s5, s1, 4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 20 -; GFX8-NEXT: s_ashr_i64 s[26:27], s[26:27], 60 -; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 -; GFX8-NEXT: v_mov_b32_e32 v4, s28 +; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60 +; GFX8-NEXT: s_ashr_i64 s[2:3], s[2:3], 60 ; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX8-NEXT: v_mov_b32_e32 v4, s26 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 ; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX8-NEXT: v_mov_b32_e32 v3, s6 -; GFX8-NEXT: v_mov_b32_e32 v5, s26 -; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60 -; GFX8-NEXT: v_mul_i32_i24_e32 v3, s0, v3 +; GFX8-NEXT: v_mov_b32_e32 v3, s2 +; GFX8-NEXT: v_mov_b32_e32 v5, s24 ; GFX8-NEXT: s_ashr_i64 s[22:23], s[22:23], 60 -; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX8-NEXT: v_mov_b32_e32 v6, s24 +; GFX8-NEXT: v_mul_i32_i24_e32 v3, s0, v3 ; GFX8-NEXT: s_ashr_i64 s[20:21], s[20:21], 60 ; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 -; GFX8-NEXT: v_mov_b32_e32 v7, s22 -; GFX8-NEXT: s_ashr_i64 s[32:33], s[18:19], 60 +; GFX8-NEXT: v_mov_b32_e32 v6, s22 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 ; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX8-NEXT: v_mov_b32_e32 v8, s20 -; GFX8-NEXT: s_ashr_i64 s[30:31], s[4:5], 60 -; GFX8-NEXT: v_mov_b32_e32 v9, s32 +; GFX8-NEXT: v_mov_b32_e32 v7, s20 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[16:17], 60 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX8-NEXT: v_mov_b32_e32 v8, s18 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[4:5], 60 +; GFX8-NEXT: v_mov_b32_e32 v9, s30 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v2, s16, v4, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s14, v5, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s14, v4, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s12, v5, v2 ; GFX8-NEXT: v_add_u32_sdwa v2, vcc, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX8-NEXT: v_mad_i32_i24 v2, s12, v6, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s10, v7, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s8, v8, v2 -; GFX8-NEXT: v_mad_i32_i24 v2, s30, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s18 +; GFX8-NEXT: v_mad_i32_i24 v2, s10, v6, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s8, v7, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s6, v8, v2 +; GFX8-NEXT: v_mad_i32_i24 v2, s28, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s16 ; GFX8-NEXT: v_mad_i32_i24 v2, s4, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -1704,35 +1707,35 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s5, s2, 28 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x4000c -; GFX9-NEXT: s_and_b32 s12, s2, 15 +; GFX9-NEXT: s_bfe_u32 s3, 
s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-NEXT: s_and_b32 s11, s2, 15 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s2 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s10, s11 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s9, s10 ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s8, s9 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s5, s8 ; GFX9-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s5 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 -; GFX9-NEXT: s_lshr_b32 s13, s6, 28 -; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40014 -; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 -; GFX9-NEXT: s_bfe_u32 s17, s6, 0x4000c -; GFX9-NEXT: s_and_b32 s18, s6, 15 +; GFX9-NEXT: s_lshr_b32 s12, s6, 28 +; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX9-NEXT: s_and_b32 s17, s6, 15 ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 ; GFX9-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s18, s6 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s17, s6 ; GFX9-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s16, s17 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s15, s16 ; GFX9-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s14, s15 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s13, s14 ; GFX9-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] @@ -1746,7 +1749,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_pk_mul_lo_u16 v2, v2, v6 ; GFX9-NEXT: global_load_ushort v6, v[0:1], off -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s13 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s7, s12 ; GFX9-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] @@ -1771,35 +1774,35 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x4000c -; GFX9-DL-NEXT: s_and_b32 s12, s2, 15 +; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 ; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s12, s2 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v0, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s10, s11 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s9, s10 ; 
GFX9-DL-NEXT: v_pk_lshlrev_b16 v1, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s8, s9 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s5, s8 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v2, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s4, s5 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s3, s4 ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s13, s6, 28 -; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x4000c -; GFX9-DL-NEXT: s_and_b32 s18, s6, 15 +; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 +; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s18, s6 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s17, s6 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s16, s17 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s15, s16 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s2 op_sel_hi:[0,1] -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s14, s15 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s13, s14 ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v0, 12, v0 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v1, 12, v1 op_sel_hi:[0,1] @@ -1813,7 +1816,7 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v2, v6 ; GFX9-DL-NEXT: global_load_ushort v6, v[0:1], off -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s13 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s7, s12 ; GFX9-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s2 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] ; GFX9-DL-NEXT: v_pk_ashrrev_i16 v7, 12, v7 op_sel_hi:[0,1] @@ -1832,53 +1835,53 @@ define amdgpu_kernel void @idot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: idot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s5, s0, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s7, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s4, s0, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s6, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40004 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s6 -; GFX10-DL-NEXT: s_bfe_u32 s9, s0, 0x40010 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s8 -; GFX10-DL-NEXT: 
s_bfe_u32 s10, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 28 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x40010 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s6, s6, s7 +; GFX10-DL-NEXT: s_bfe_u32 s9, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40008 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v3, 12, s4 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s7 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v4, 12, s6 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x4000c ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v3, 12, v3 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s0 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v4 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s8, s5 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40010 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s7, s4 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40018 ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v5, 12, s0 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x40014 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, v3, v4 -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s9, s10 +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s4 op_sel_hi:[0,1] +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s8, s9 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v4, 12, v5 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s6, s0 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s5, s0 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v6 op_sel_hi:[0,1] -; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s5 op_sel_hi:[0,1] +; GFX10-DL-NEXT: v_pk_lshlrev_b16 v6, 12, s4 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s0 op_sel_hi:[0,1] -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s4 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s2, s3 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s7, s1 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s1 ; GFX10-DL-NEXT: v_pk_ashrrev_i16 v5, 12, v7 op_sel_hi:[0,1] ; GFX10-DL-NEXT: v_pk_lshlrev_b16 v7, 12, s1 op_sel_hi:[0,1] ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) @@ -1935,65 +1938,65 @@ entry: define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: idot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 -; GFX7-NEXT: s_movk_i32 s0, 0xff -; GFX7-NEXT: s_mov_b32 s1, 0xffff +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_movk_i32 s8, 0xff +; GFX7-NEXT: s_mov_b32 s9, 0xffff ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s8, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_i32 s9, s2, 0x40000 -; GFX7-NEXT: s_bfe_i32 s16, s8, 0x40000 -; 
GFX7-NEXT: s_bfe_i32 s17, s8, 0x40004 -; GFX7-NEXT: s_bfe_i32 s18, s8, 0x40008 -; GFX7-NEXT: s_bfe_i32 s19, s8, 0x4000c -; GFX7-NEXT: s_bfe_i32 s20, s8, 0x40010 -; GFX7-NEXT: s_bfe_i32 s21, s8, 0x40014 -; GFX7-NEXT: s_bfe_i32 s22, s8, 0x40018 -; GFX7-NEXT: s_ashr_i32 s8, s8, 28 -; GFX7-NEXT: v_mov_b32_e32 v8, s16 -; GFX7-NEXT: s_bfe_i32 s10, s2, 0x40004 -; GFX7-NEXT: v_mov_b32_e32 v7, s17 -; GFX7-NEXT: s_bfe_i32 s11, s2, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v6, s18 -; GFX7-NEXT: s_bfe_i32 s12, s2, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v5, s19 -; GFX7-NEXT: s_bfe_i32 s13, s2, 0x40010 -; GFX7-NEXT: v_mov_b32_e32 v4, s20 -; GFX7-NEXT: s_bfe_i32 s14, s2, 0x40014 -; GFX7-NEXT: v_mov_b32_e32 v3, s21 -; GFX7-NEXT: s_bfe_i32 s15, s2, 0x40018 -; GFX7-NEXT: v_mov_b32_e32 v2, s22 -; GFX7-NEXT: s_ashr_i32 s2, s2, 28 -; GFX7-NEXT: v_mov_b32_e32 v1, s8 -; GFX7-NEXT: v_mul_i32_i24_e32 v1, s2, v1 -; GFX7-NEXT: v_mul_i32_i24_e32 v2, s15, v2 -; GFX7-NEXT: v_mul_i32_i24_e32 v3, s14, v3 -; GFX7-NEXT: v_mul_i32_i24_e32 v9, s13, v4 -; GFX7-NEXT: v_mul_i32_i24_e32 v5, s12, v5 -; GFX7-NEXT: v_mul_i32_i24_e32 v6, s11, v6 -; GFX7-NEXT: v_mul_i32_i24_e32 v7, s10, v7 -; GFX7-NEXT: v_mul_i32_i24_e32 v8, s9, v8 +; GFX7-NEXT: s_bfe_i32 s6, s4, 0x40000 +; GFX7-NEXT: s_bfe_i32 s15, s5, 0x40000 +; GFX7-NEXT: s_bfe_i32 s16, s5, 0x40004 +; GFX7-NEXT: s_bfe_i32 s17, s5, 0x40008 +; GFX7-NEXT: s_bfe_i32 s18, s5, 0x4000c +; GFX7-NEXT: s_bfe_i32 s19, s5, 0x40010 +; GFX7-NEXT: s_bfe_i32 s20, s5, 0x40014 +; GFX7-NEXT: s_bfe_i32 s21, s5, 0x40018 +; GFX7-NEXT: s_ashr_i32 s5, s5, 28 +; GFX7-NEXT: v_mov_b32_e32 v8, s15 +; GFX7-NEXT: s_bfe_i32 s7, s4, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v7, s16 +; GFX7-NEXT: s_bfe_i32 s10, s4, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v6, s17 +; GFX7-NEXT: s_bfe_i32 s11, s4, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v5, s18 +; GFX7-NEXT: s_bfe_i32 s12, s4, 0x40010 +; GFX7-NEXT: v_mov_b32_e32 v4, s19 +; GFX7-NEXT: s_bfe_i32 s13, s4, 0x40014 +; GFX7-NEXT: v_mov_b32_e32 v3, s20 +; GFX7-NEXT: s_bfe_i32 s14, s4, 0x40018 +; GFX7-NEXT: v_mov_b32_e32 v2, s21 +; GFX7-NEXT: s_ashr_i32 s4, s4, 28 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mul_i32_i24_e32 v1, s4, v1 +; GFX7-NEXT: v_mul_i32_i24_e32 v2, s14, v2 +; GFX7-NEXT: v_mul_i32_i24_e32 v3, s13, v3 +; GFX7-NEXT: v_mul_i32_i24_e32 v9, s12, v4 +; GFX7-NEXT: v_mul_i32_i24_e32 v5, s11, v5 +; GFX7-NEXT: v_mul_i32_i24_e32 v6, s10, v6 +; GFX7-NEXT: v_mul_i32_i24_e32 v7, s7, v7 +; GFX7-NEXT: v_mul_i32_i24_e32 v8, s6, v8 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s0, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s8, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX7-NEXT: v_and_b32_e32 v9, s0, v9 +; GFX7-NEXT: v_and_b32_e32 v9, s8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5 -; GFX7-NEXT: v_and_b32_e32 v6, s0, v6 +; GFX7-NEXT: v_and_b32_e32 v6, s8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v7 -; GFX7-NEXT: v_and_b32_e32 v8, s0, v8 +; GFX7-NEXT: v_and_b32_e32 v8, s8, v8 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_or_b32_e32 v2, v9, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v6, v5 ; GFX7-NEXT: v_or_b32_e32 v5, v8, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_and_b32_e32 v2, s1, v2 +; GFX7-NEXT: v_and_b32_e32 v2, s9, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_and_b32_e32 v5, s1, v5 +; GFX7-NEXT: v_and_b32_e32 v5, s9, v5 ; GFX7-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX7-NEXT: v_or_b32_e32 v2, v5, v3 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 8 @@ -2007,83 +2010,83 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* 
%src1, ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GFX7-NEXT: v_mad_i32_i24 v0, s13, v4, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, s12, v4, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: idot8_acc8_vecMul: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b32 s2, 0xffff +; GFX8-NEXT: s_mov_b32 s33, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s5, s[6:7], 0x0 +; GFX8-NEXT: s_load_dword s3, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_lshl_b32 s13, s1, 24 -; GFX8-NEXT: s_lshl_b32 s17, s1, 16 -; GFX8-NEXT: s_ashr_i64 s[22:23], s[4:5], 60 -; GFX8-NEXT: s_lshl_b32 s25, s5, 24 -; GFX8-NEXT: s_lshl_b32 s27, s5, 28 -; GFX8-NEXT: s_lshl_b32 s29, s5, 16 -; GFX8-NEXT: s_ashr_i64 s[10:11], s[0:1], 60 -; GFX8-NEXT: s_lshl_b32 s15, s1, 28 -; GFX8-NEXT: s_lshl_b32 s19, s5, 8 -; GFX8-NEXT: s_lshl_b32 s21, s5, 12 -; GFX8-NEXT: s_lshl_b32 s23, s5, 4 -; GFX8-NEXT: s_lshl_b32 s5, s5, 20 -; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 -; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 +; GFX8-NEXT: s_lshl_b32 s11, s1, 24 +; GFX8-NEXT: s_lshl_b32 s15, s1, 16 +; GFX8-NEXT: s_ashr_i64 s[20:21], s[2:3], 60 +; GFX8-NEXT: s_lshl_b32 s23, s3, 24 +; GFX8-NEXT: s_lshl_b32 s25, s3, 28 +; GFX8-NEXT: s_lshl_b32 s27, s3, 16 +; GFX8-NEXT: s_ashr_i64 s[8:9], s[0:1], 60 +; GFX8-NEXT: s_lshl_b32 s13, s1, 28 +; GFX8-NEXT: s_lshl_b32 s17, s3, 8 +; GFX8-NEXT: s_lshl_b32 s19, s3, 12 +; GFX8-NEXT: s_lshl_b32 s21, s3, 4 +; GFX8-NEXT: s_lshl_b32 s3, s3, 20 +; GFX8-NEXT: s_ashr_i64 s[10:11], s[10:11], 60 +; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 +; GFX8-NEXT: s_ashr_i64 s[22:23], s[22:23], 60 ; GFX8-NEXT: s_ashr_i64 s[24:25], s[24:25], 60 ; GFX8-NEXT: s_ashr_i64 s[26:27], s[26:27], 60 -; GFX8-NEXT: s_ashr_i64 s[28:29], s[28:29], 60 -; GFX8-NEXT: s_lshl_b32 s7, s1, 8 -; GFX8-NEXT: s_lshl_b32 s9, s1, 12 -; GFX8-NEXT: s_lshl_b32 s11, s1, 4 +; GFX8-NEXT: s_lshl_b32 s5, s1, 8 +; GFX8-NEXT: s_lshl_b32 s7, s1, 12 +; GFX8-NEXT: s_lshl_b32 s9, s1, 4 ; GFX8-NEXT: s_lshl_b32 s1, s1, 20 -; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 -; GFX8-NEXT: s_ashr_i64 s[14:15], s[14:15], 60 -; GFX8-NEXT: v_mov_b32_e32 v6, s28 -; GFX8-NEXT: v_mov_b32_e32 v7, s16 -; GFX8-NEXT: v_mov_b32_e32 v8, s26 -; GFX8-NEXT: v_mov_b32_e32 v9, s24 -; GFX8-NEXT: v_mov_b32_e32 v10, s12 +; GFX8-NEXT: s_ashr_i64 s[2:3], s[2:3], 60 +; GFX8-NEXT: s_ashr_i64 s[12:13], s[12:13], 60 +; GFX8-NEXT: v_mov_b32_e32 v6, s26 +; GFX8-NEXT: v_mov_b32_e32 v7, s14 +; GFX8-NEXT: v_mov_b32_e32 v8, s24 +; GFX8-NEXT: v_mov_b32_e32 v9, s22 +; GFX8-NEXT: v_mov_b32_e32 v10, s10 ; GFX8-NEXT: v_mul_i32_i24_sdwa v6, v7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_i32_i24_e32 v7, s14, v8 +; GFX8-NEXT: v_mul_i32_i24_e32 v7, s12, v8 ; GFX8-NEXT: v_mul_i32_i24_sdwa v8, v10, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: s_ashr_i64 s[0:1], s[0:1], 60 -; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s2 ; GFX8-NEXT: 
v_mul_i32_i24_e32 v5, s0, v5 ; GFX8-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 -; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 +; GFX8-NEXT: s_ashr_i64 s[4:5], s[4:5], 60 +; GFX8-NEXT: s_ashr_i64 s[16:17], s[16:17], 60 ; GFX8-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v6, s2, v7 -; GFX8-NEXT: s_ashr_i64 s[20:21], s[20:21], 60 -; GFX8-NEXT: v_mov_b32_e32 v3, s22 -; GFX8-NEXT: v_mov_b32_e32 v4, s10 -; GFX8-NEXT: s_ashr_i64 s[32:33], s[22:23], 60 +; GFX8-NEXT: v_and_b32_e32 v6, s33, v7 +; GFX8-NEXT: s_ashr_i64 s[18:19], s[18:19], 60 +; GFX8-NEXT: v_mov_b32_e32 v3, s20 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 +; GFX8-NEXT: s_ashr_i64 s[30:31], s[20:21], 60 ; GFX8-NEXT: v_mul_i32_i24_sdwa v3, v4, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_or_b32_e32 v5, v6, v5 -; GFX8-NEXT: s_ashr_i64 s[8:9], s[8:9], 60 -; GFX8-NEXT: v_mov_b32_e32 v4, s20 -; GFX8-NEXT: v_mov_b32_e32 v12, s18 -; GFX8-NEXT: v_mov_b32_e32 v13, s6 -; GFX8-NEXT: s_ashr_i64 s[30:31], s[10:11], 60 -; GFX8-NEXT: v_mov_b32_e32 v11, s32 -; GFX8-NEXT: v_mul_i32_i24_e32 v4, s8, v4 +; GFX8-NEXT: s_ashr_i64 s[6:7], s[6:7], 60 +; GFX8-NEXT: v_mov_b32_e32 v4, s18 +; GFX8-NEXT: v_mov_b32_e32 v12, s16 +; GFX8-NEXT: v_mov_b32_e32 v13, s4 +; GFX8-NEXT: s_ashr_i64 s[28:29], s[8:9], 60 +; GFX8-NEXT: v_mov_b32_e32 v11, s30 +; GFX8-NEXT: v_mul_i32_i24_e32 v4, s6, v4 ; GFX8-NEXT: v_mul_i32_i24_sdwa v10, v13, v12 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v5 ; GFX8-NEXT: v_or_b32_sdwa v4, v4, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mul_i32_i24_e32 v9, s30, v11 +; GFX8-NEXT: v_mul_i32_i24_e32 v9, s28, v11 ; GFX8-NEXT: v_or_b32_sdwa v3, v9, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_and_b32_e32 v4, s2, v4 +; GFX8-NEXT: v_and_b32_e32 v4, s33, v4 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -2110,20 +2113,20 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshr_b32 s8, s0, 4 -; GFX9-NEXT: s_lshr_b32 s15, s1, 4 +; GFX9-NEXT: s_lshr_b32 s7, s0, 4 +; GFX9-NEXT: s_lshr_b32 s14, s1, 4 ; GFX9-NEXT: v_lshlrev_b16_e64 v3, 12, s0 ; GFX9-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s8 -; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s15 -; GFX9-NEXT: s_lshr_b32 s9, s0, 12 -; GFX9-NEXT: s_lshr_b32 s10, s0, 8 -; GFX9-NEXT: s_lshr_b32 s16, s1, 12 -; GFX9-NEXT: s_lshr_b32 s17, s1, 8 -; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s10 -; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s9 -; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s17 -; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s16 +; GFX9-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX9-NEXT: v_lshlrev_b16_e64 v14, 12, s14 +; GFX9-NEXT: s_lshr_b32 s8, s0, 12 +; GFX9-NEXT: s_lshr_b32 s9, s0, 8 +; GFX9-NEXT: s_lshr_b32 s15, s1, 12 +; GFX9-NEXT: s_lshr_b32 s16, s1, 8 +; GFX9-NEXT: v_lshlrev_b16_e64 v5, 12, s9 +; GFX9-NEXT: v_lshlrev_b16_e64 v6, 12, s8 +; GFX9-NEXT: v_lshlrev_b16_e64 v12, 12, s16 +; GFX9-NEXT: v_lshlrev_b16_e64 v13, 12, s15 ; GFX9-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-NEXT: 
v_ashrrev_i16_e32 v7, 12, v7 @@ -2135,26 +2138,26 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mul_lo_u16_e32 v3, v3, v4 ; GFX9-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: s_lshr_b32 s4, s0, 20 -; GFX9-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-NEXT: s_lshr_b32 s11, s1, 20 -; GFX9-NEXT: s_lshr_b32 s12, s1, 16 +; GFX9-NEXT: s_lshr_b32 s3, s0, 20 +; GFX9-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-NEXT: s_lshr_b32 s10, s1, 20 +; GFX9-NEXT: s_lshr_b32 s11, s1, 16 ; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12 -; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s5 -; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s4 -; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s12 -; GFX9-NEXT: v_lshlrev_b16_e64 v18, 12, s11 -; GFX9-NEXT: s_lshr_b32 s6, s0, 28 -; GFX9-NEXT: s_lshr_b32 s7, s0, 24 -; GFX9-NEXT: s_lshr_b32 s13, s1, 28 -; GFX9-NEXT: s_lshr_b32 s14, s1, 24 +; GFX9-NEXT: v_lshlrev_b16_e64 v10, 12, s4 +; GFX9-NEXT: v_lshlrev_b16_e64 v11, 12, s3 +; GFX9-NEXT: v_lshlrev_b16_e64 v17, 12, s11 +; GFX9-NEXT: v_lshlrev_b16_e64 v18, 12, s10 +; GFX9-NEXT: s_lshr_b32 s5, s0, 28 +; GFX9-NEXT: s_lshr_b32 s6, s0, 24 +; GFX9-NEXT: s_lshr_b32 s12, s1, 28 +; GFX9-NEXT: s_lshr_b32 s13, s1, 24 ; GFX9-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX9-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s7 -; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s14 -; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s13 +; GFX9-NEXT: v_lshlrev_b16_e64 v8, 12, s6 +; GFX9-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX9-NEXT: v_lshlrev_b16_e64 v15, 12, s13 +; GFX9-NEXT: v_lshlrev_b16_e64 v16, 12, s12 ; GFX9-NEXT: v_or_b32_e32 v5, v3, v5 ; GFX9-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-NEXT: v_ashrrev_i16_e32 v17, 12, v17 @@ -2198,20 +2201,20 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s8, s0, 4 -; GFX9-DL-NEXT: s_lshr_b32 s15, s1, 4 +; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 4 +; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 4 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 ; GFX9-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 -; GFX9-DL-NEXT: s_lshr_b32 s9, s0, 12 -; GFX9-DL-NEXT: s_lshr_b32 s10, s0, 8 -; GFX9-DL-NEXT: s_lshr_b32 s16, s1, 12 -; GFX9-DL-NEXT: s_lshr_b32 s17, s1, 8 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s17 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s16 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s14 +; GFX9-DL-NEXT: s_lshr_b32 s8, s0, 12 +; GFX9-DL-NEXT: s_lshr_b32 s9, s0, 8 +; GFX9-DL-NEXT: s_lshr_b32 s15, s1, 12 +; GFX9-DL-NEXT: s_lshr_b32 s16, s1, 8 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s16 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s15 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v3, 12, v3 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v4, 12, v4 ; GFX9-DL-NEXT: 
v_ashrrev_i16_e32 v7, 12, v7 @@ -2223,26 +2226,26 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, v3, v4 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_sdwa v3, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 20 -; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 16 -; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 20 -; GFX9-DL-NEXT: s_lshr_b32 s12, s1, 16 +; GFX9-DL-NEXT: s_lshr_b32 s3, s0, 20 +; GFX9-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 20 +; GFX9-DL-NEXT: s_lshr_b32 s11, s1, 16 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s4 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s12 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s11 -; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 28 -; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 24 -; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 28 -; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 24 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v17, 12, s11 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v18, 12, s10 +; GFX9-DL-NEXT: s_lshr_b32 s5, s0, 28 +; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 24 +; GFX9-DL-NEXT: s_lshr_b32 s12, s1, 28 +; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 24 ; GFX9-DL-NEXT: v_and_b32_e32 v3, s2, v3 ; GFX9-DL-NEXT: v_or_b32_sdwa v5, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14 -; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s13 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s13 +; GFX9-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s12 ; GFX9-DL-NEXT: v_or_b32_e32 v5, v3, v5 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v10, 12, v10 ; GFX9-DL-NEXT: v_ashrrev_i16_e32 v17, 12, v17 @@ -2276,70 +2279,70 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: idot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 -; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s2, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 4 -; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 4 -; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 12 -; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 12 +; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 4 +; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 4 +; GFX10-DL-NEXT: s_lshr_b32 s8, s0, 12 +; GFX10-DL-NEXT: s_lshr_b32 s15, s1, 12 
; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 12, s0 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s15 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s7 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v12, 12, s14 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 12, s1 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s16 -; GFX10-DL-NEXT: s_lshr_b32 s10, s0, 8 -; GFX10-DL-NEXT: s_lshr_b32 s17, s1, 8 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s9 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v14, 12, s15 +; GFX10-DL-NEXT: s_lshr_b32 s9, s0, 8 +; GFX10-DL-NEXT: s_lshr_b32 s16, s1, 8 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 12, s8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v12 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s10 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 12, s9 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v3, 12, v3 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v4, 12, v4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s17 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s16 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, v7, v12 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v19, 12, v6 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v14, 12, v14 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 20 -; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 28 -; GFX10-DL-NEXT: s_lshr_b32 s7, s0, 24 +; GFX10-DL-NEXT: s_lshr_b32 s3, s0, 20 +; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 +; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 28 +; GFX10-DL-NEXT: s_lshr_b32 s6, s0, 24 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, v3, v4 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, v19, v14 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v6, 8, v7 -; GFX10-DL-NEXT: s_lshr_b32 s11, s1, 20 +; GFX10-DL-NEXT: s_lshr_b32 s10, s1, 20 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v12, 12, v13 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v5 -; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 16 +; GFX10-DL-NEXT: s_lshr_b32 s11, s1, 16 ; GFX10-DL-NEXT: v_or_b32_sdwa v3, v3, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 28 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s7 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s6 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s5 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s11 +; GFX10-DL-NEXT: s_lshr_b32 s12, s1, 28 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v8, 12, s6 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v9, 12, s5 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v10, 12, s4 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v11, 12, s3 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v13, 12, s10 ; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, v5, v12 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v4, 8, v4 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s12 -; GFX10-DL-NEXT: s_lshr_b32 s14, s1, 24 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 12, s11 +; GFX10-DL-NEXT: s_lshr_b32 s13, s1, 24 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v6, 12, v8 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v8, 12, v9 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v9, 12, v10 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-DL-NEXT: v_and_b32_e32 v3, s2, v3 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s13 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v16, 12, s12 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v5, 12, v11 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v10, 12, v13 -; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s14 +; GFX10-DL-NEXT: v_lshlrev_b16_e64 v15, 12, s13 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v7, 12, v7 ; GFX10-DL-NEXT: v_ashrrev_i16_e64 v11, 12, v16 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 diff --git 
a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 543b55e8e261a..21eede8df373a 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -9,49 +9,49 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc32: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s10, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s11, s10, 28 -; GFX7-NEXT: s_bfe_u32 s15, s10, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s10, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s10, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s10, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s10, 0x40008 -; GFX7-NEXT: s_bfe_u32 s20, s10, 0x40004 -; GFX7-NEXT: s_and_b32 s10, s10, 15 -; GFX7-NEXT: s_lshr_b32 s1, s0, 28 -; GFX7-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s14, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v0, v1 +; GFX7-NEXT: s_lshr_b32 s7, s6, 28 +; GFX7-NEXT: s_bfe_u32 s14, s6, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s6, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s6, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s6, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s6, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s6, 0x40004 +; GFX7-NEXT: s_and_b32 s6, s6, 15 +; GFX7-NEXT: s_lshr_b32 s5, s4, 28 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s13, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s20 -; GFX7-NEXT: v_mad_u32_u24 v0, s14, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-NEXT: v_mad_u32_u24 v0, s13, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s18 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s16 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc32: @@ -61,29 +61,27 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 -; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40018 -; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX8-NEXT: s_bfe_u32 s15, s6, 0x40010 -; GFX8-NEXT: s_bfe_u32 s16, s6, 0x4000c -; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 -; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX8-NEXT: s_bfe_u32 s12, s6, 0x40018 +; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40014 +; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX8-NEXT: s_bfe_u32 s15, s6, 0x4000c +; GFX8-NEXT: s_bfe_u32 s16, s6, 0x40008 +; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX8-NEXT: s_and_b32 s6, s6, 15 -; GFX8-NEXT: s_lshr_b32 s4, s2, 28 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s3, s2, 28 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 @@ -94,8 +92,10 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -108,29 +108,27 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 -; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40018 -; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40010 -; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c -; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 -; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x4000c +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 +; GFX9-NEXT: s_lshr_b32 s3, s2, 28 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX9-NEXT: 
s_bfe_u32 s8, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 @@ -141,8 +139,10 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX9-NEXT: v_mad_u32_u24 v2, s3, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -154,11 +154,11 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -167,18 +167,18 @@ define amdgpu_kernel void @udot8_acc32(<8 x i4> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: udot8_acc32: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -254,49 +254,49 @@ entry: define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc16: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; 
GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008 -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 -; GFX7-NEXT: s_lshr_b32 s14, s1, 28 -; GFX7-NEXT: s_and_b32 s1, s1, 15 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_lshr_b32 s6, s4, 28 +; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 +; GFX7-NEXT: s_lshr_b32 s13, s5, 28 +; GFX7-NEXT: s_and_b32 s5, s5, 15 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-NEXT: v_mov_b32_e32 v4, s17 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: v_mov_b32_e32 v6, s15 +; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc16: @@ -311,38 +311,38 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 -; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX8-NEXT: 
s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s9, s1, 28 ; GFX8-NEXT: s_and_b32 s1, s1, 15 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: v_mov_b32_e32 v5, s15 -; GFX8-NEXT: v_mov_b32_e32 v6, s14 -; GFX8-NEXT: v_mov_b32_e32 v7, s13 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 +; GFX8-NEXT: v_mov_b32_e32 v6, s13 +; GFX8-NEXT: v_mov_b32_e32 v7, s12 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mov_b32_e32 v9, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -359,38 +359,38 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-NEXT: s_and_b32 s1, s1, 15 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: v_mov_b32_e32 
v5, s14 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_short v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -407,80 +407,80 @@ define amdgpu_kernel void @udot8_acc16(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; 
GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_short v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_short v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -557,49 +557,49 @@ entry: define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc8: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte 
v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008 -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 -; GFX7-NEXT: s_lshr_b32 s14, s1, 28 -; GFX7-NEXT: s_and_b32 s1, s1, 15 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_lshr_b32 s6, s4, 28 +; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 +; GFX7-NEXT: s_lshr_b32 s13, s5, 28 +; GFX7-NEXT: s_and_b32 s5, s5, 15 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-NEXT: v_mov_b32_e32 v4, s17 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: v_mov_b32_e32 v6, s15 +; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc8: @@ -614,38 +614,38 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 -; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s11, s1, 
0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s9, s1, 28 ; GFX8-NEXT: s_and_b32 s1, s1, 15 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: v_mov_b32_e32 v5, s15 -; GFX8-NEXT: v_mov_b32_e32 v6, s14 -; GFX8-NEXT: v_mov_b32_e32 v7, s13 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 +; GFX8-NEXT: v_mov_b32_e32 v6, s13 +; GFX8-NEXT: v_mov_b32_e32 v7, s12 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mov_b32_e32 v9, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -662,38 +662,38 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-NEXT: s_and_b32 s1, s1, 15 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX9-NEXT: s_and_b32 s0, s0, 15 ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 
-; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off ; GFX9-NEXT: s_endpgm @@ -710,80 +710,80 @@ define amdgpu_kernel void @udot8_acc8(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-DL-NEXT: s_and_b32 s1, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX9-DL-NEXT: s_and_b32 s0, s0, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; 
GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c ; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off ; GFX10-DL-NEXT: s_endpgm @@ -860,50 +860,50 @@ entry: define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc4: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; 
GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008 -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 -; GFX7-NEXT: s_lshr_b32 s14, s1, 28 -; GFX7-NEXT: s_and_b32 s1, s1, 15 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_lshr_b32 s6, s4, 28 +; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 +; GFX7-NEXT: s_lshr_b32 s13, s5, 28 +; GFX7-NEXT: s_and_b32 s5, s5, 15 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-NEXT: v_mov_b32_e32 v4, s17 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: v_mov_b32_e32 v6, s15 +; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc4: @@ -917,41 +917,41 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s9, s0, 15 -; GFX8-NEXT: s_and_b32 s16, s1, 15 -; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, 
s1, 0x40014 -; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s8, s0, 15 +; GFX8-NEXT: s_and_b32 s15, s1, 15 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s9, s1, 28 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v6, s13 ; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, s13 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_mov_b32_e32 v7, s12 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mov_b32_e32 v9, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -968,41 +968,41 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s0, 15 -; GFX9-NEXT: s_and_b32 s16, s1, 15 -; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_and_b32 s8, s0, 15 +; GFX9-NEXT: s_and_b32 s15, s1, 15 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-NEXT: 
s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off @@ -1019,41 +1019,41 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 -; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, 
s9, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1061,44 +1061,44 @@ define amdgpu_kernel void @udot8_acc4(<8 x i4> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: udot8_acc4: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 +; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s6, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s3, s4 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: 
v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1160,50 +1160,50 @@ entry: define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_CommutationInsideMAD: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008 -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 -; GFX7-NEXT: s_lshr_b32 s14, s1, 28 -; GFX7-NEXT: s_and_b32 s1, s1, 15 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_lshr_b32 s6, s4, 28 +; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 +; GFX7-NEXT: s_lshr_b32 s13, s5, 28 +; GFX7-NEXT: s_and_b32 s5, s5, 15 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-NEXT: v_mov_b32_e32 v4, s17 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: v_mov_b32_e32 v6, s15 +; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, 
v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_CommutationInsideMAD: @@ -1217,41 +1217,41 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s9, s0, 15 -; GFX8-NEXT: s_and_b32 s16, s1, 15 -; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s8, s0, 15 +; GFX8-NEXT: s_and_b32 s15, s1, 15 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s9, s1, 28 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v6, s13 ; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, s13 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_mov_b32_e32 v7, s12 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mov_b32_e32 v9, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v3 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -1268,41 +1268,41 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s0, 15 -; GFX9-NEXT: s_and_b32 s16, s1, 15 -; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s13, s1, 
0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_and_b32 s8, s0, 15 +; GFX9-NEXT: s_and_b32 s15, s1, 15 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off @@ -1319,41 +1319,41 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 -; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 
0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v3, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1361,44 +1361,44 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(<8 x i4> addrspace(1)* %sr ; ; GFX10-DL-LABEL: udot8_CommutationInsideMAD: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c +; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s8 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: 
v_mad_u32_u24 v2, s2, s7, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s3, s7 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s6, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off @@ -1458,51 +1458,51 @@ entry: define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_multiuses_mul1: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s10, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s20, s10, 0x40004 -; GFX7-NEXT: s_lshr_b32 s11, s10, 28 -; GFX7-NEXT: s_bfe_u32 s15, s10, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s10, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s10, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s10, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s10, 0x40008 -; GFX7-NEXT: s_and_b32 s10, s10, 15 -; GFX7-NEXT: s_lshr_b32 s1, s0, 28 -; GFX7-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s14, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mad_u32_u24 v1, s0, v0, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v0, v1 -; GFX7-NEXT: v_mad_u32_u24 v1, s14, v2, v1 +; GFX7-NEXT: s_bfe_u32 s19, s6, 0x40004 +; GFX7-NEXT: s_lshr_b32 s7, s6, 28 +; GFX7-NEXT: s_bfe_u32 s14, s6, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s6, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s6, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s6, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s6, 0x40008 +; GFX7-NEXT: s_and_b32 s6, s6, 15 +; GFX7-NEXT: s_lshr_b32 s5, s4, 28 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s13, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s20 +; GFX7-NEXT: v_mad_u32_u24 v1, s4, 
v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v0, v1 ; GFX7-NEXT: v_mad_u32_u24 v1, s13, v2, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, s18 ; GFX7-NEXT: v_mad_u32_u24 v1, s12, v2, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, s17 -; GFX7-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, s16 -; GFX7-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, s10, v2, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, s15 -; GFX7-NEXT: v_mad_u32_u24 v1, s2, v2, v1 -; GFX7-NEXT: v_mov_b32_e32 v2, s11 -; GFX7-NEXT: v_mad_u32_u24 v1, s1, v2, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, s9, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s14 +; GFX7-NEXT: v_mad_u32_u24 v1, s8, v2, v1 +; GFX7-NEXT: v_mov_b32_e32 v2, s7 +; GFX7-NEXT: v_mad_u32_u24 v1, s5, v2, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_multiuses_mul1: @@ -1512,31 +1512,29 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 -; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40018 -; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX8-NEXT: s_bfe_u32 s15, s6, 0x40010 -; GFX8-NEXT: s_bfe_u32 s16, s6, 0x4000c -; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 +; GFX8-NEXT: s_bfe_u32 s12, s6, 0x40018 +; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40014 +; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX8-NEXT: s_bfe_u32 s15, s6, 0x4000c +; GFX8-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX8-NEXT: s_and_b32 s6, s6, 15 -; GFX8-NEXT: s_lshr_b32 s4, s2, 28 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s3, s2, 28 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 +; GFX8-NEXT: v_mov_b32_e32 v1, s18 ; GFX8-NEXT: v_mad_u32_u24 v1, s2, v0, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s18 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 -; GFX8-NEXT: v_mad_u32_u24 v1, s12, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX8-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mad_u32_u24 v1, s10, v2, v1 @@ -1546,8 +1544,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: v_mad_u32_u24 v1, s8, v2, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, s13 ; GFX8-NEXT: v_mad_u32_u24 v1, s5, v2, v1 -; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, s7 +; GFX8-NEXT: v_mad_u32_u24 v1, s3, v2, v1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -1561,31 +1561,29 @@ define amdgpu_kernel void 
@udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 -; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40018 -; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40010 -; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c -; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x4000c +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 +; GFX9-NEXT: s_lshr_b32 s3, s2, 28 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-NEXT: v_mad_u32_u24 v1, s2, v0, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 -; GFX9-NEXT: v_mad_u32_u24 v1, s12, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX9-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mad_u32_u24 v1, s10, v2, v1 @@ -1595,8 +1593,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mad_u32_u24 v1, s8, v2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, s13 ; GFX9-NEXT: v_mad_u32_u24 v1, s5, v2, v1 -; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-NEXT: v_mad_u32_u24 v1, s3, v2, v1 ; GFX9-NEXT: v_add_u32_e32 v2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -1610,31 +1610,29 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-DL-NEXT: s_lshr_b32 s7, s6, 28 -; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s12, s6, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008 ; GFX9-DL-NEXT: s_and_b32 s6, s6, 15 -; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 
0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s12, s2, 0x40004 +; GFX9-DL-NEXT: s_lshr_b32 s3, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX9-DL-NEXT: s_and_b32 s2, s2, 15 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s2, v0, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s18 -; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v0, v1 -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s12, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-DL-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s11, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s10, v2, v1 @@ -1644,8 +1642,10 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s8, v2, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v2, s13 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s5, v2, v1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, v2, v1 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s7 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s3, v2, v1 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -1659,35 +1659,35 @@ define amdgpu_kernel void @udot8_multiuses_mul1(<8 x i4> addrspace(1)* %src1, ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s4, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s3, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s4, s[0:1], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s6, s2, 15 -; GFX10-DL-NEXT: s_and_b32 s7, s4, 15 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v0 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v1 -; GFX10-DL-NEXT: s_bfe_u32 s5, s2, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s8, s4, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 -; GFX10-DL-NEXT: s_bfe_u32 s9, s2, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s10, s4, 0x40018 +; GFX10-DL-NEXT: s_and_b32 s5, s2, 15 +; GFX10-DL-NEXT: s_and_b32 s6, s3, 15 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s9, s3, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s7, v0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s9, v1 +; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX10-DL-NEXT: 
s_bfe_u32 s9, s3, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s7, v1 +; GFX10-DL-NEXT: s_bfe_u32 s4, s2, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s7, s3, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s9, v1 +; GFX10-DL-NEXT: s_bfe_u32 s8, s2, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s9, s3, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s2, s2, 28 -; GFX10-DL-NEXT: s_lshr_b32 s4, s4, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s8, v1 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s9, s10, v1 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s4, v1 +; GFX10-DL-NEXT: s_lshr_b32 s3, s3, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s7, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s8, s9, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s2, s3, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v0, v1 ; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -1766,49 +1766,49 @@ entry: define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc32_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_load_dword s10, s[10:11], 0x0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s21, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s6, s[6:7], 0x0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s20, s[0:1], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s11, s10, 28 -; GFX7-NEXT: s_bfe_u32 s15, s10, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s10, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s10, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s10, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s10, 0x40008 -; GFX7-NEXT: s_bfe_u32 s20, s10, 0x40004 -; GFX7-NEXT: s_and_b32 s10, s10, 15 -; GFX7-NEXT: s_lshr_b32 s1, s0, 28 -; GFX7-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s14, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v0, s10 -; GFX7-NEXT: v_mov_b32_e32 v1, s21 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v0, v1 +; GFX7-NEXT: s_lshr_b32 s7, s6, 28 +; GFX7-NEXT: s_bfe_u32 s14, s6, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s6, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s6, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s6, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s6, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s6, 0x40004 +; GFX7-NEXT: s_and_b32 s6, s6, 15 +; GFX7-NEXT: s_lshr_b32 s5, s4, 28 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s13, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s20 -; GFX7-NEXT: v_mad_u32_u24 v0, s14, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v0, v1 ; GFX7-NEXT: v_mov_b32_e32 v1, s19 ; GFX7-NEXT: v_mad_u32_u24 v0, s13, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s18 ; GFX7-NEXT: v_mad_u32_u24 v0, s12, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s17 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s16 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; 
GFX7-NEXT: v_mad_u32_u24 v0, s10, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v1, s15 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-NEXT: v_mad_u32_u24 v0, s1, v1, v0 -; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s14 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v1, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc32_vecMul: @@ -1818,29 +1818,27 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX8-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX8-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX8-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s7, s6, 28 -; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40018 -; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX8-NEXT: s_bfe_u32 s15, s6, 0x40010 -; GFX8-NEXT: s_bfe_u32 s16, s6, 0x4000c -; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40008 -; GFX8-NEXT: s_bfe_u32 s18, s6, 0x40004 +; GFX8-NEXT: s_bfe_u32 s12, s6, 0x40018 +; GFX8-NEXT: s_bfe_u32 s13, s6, 0x40014 +; GFX8-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX8-NEXT: s_bfe_u32 s15, s6, 0x4000c +; GFX8-NEXT: s_bfe_u32 s16, s6, 0x40008 +; GFX8-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX8-NEXT: s_and_b32 s6, s6, 15 -; GFX8-NEXT: s_lshr_b32 s4, s2, 28 -; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX8-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40004 +; GFX8-NEXT: s_lshr_b32 s3, s2, 28 +; GFX8-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX8-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX8-NEXT: s_and_b32 s2, s2, 15 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s19 -; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s18 -; GFX8-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s17 ; GFX8-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 @@ -1851,8 +1849,10 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s13 ; GFX8-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, s12 +; GFX8-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1865,29 +1865,27 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_load_dword s6, s[6:7], 0x0 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 -; GFX9-NEXT: s_load_dword s19, s[0:1], 0x0 +; GFX9-NEXT: s_load_dword s18, s[0:1], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_lshr_b32 s7, s6, 28 -; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40018 -; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 -; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40010 -; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c -; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40008 -; GFX9-NEXT: s_bfe_u32 s18, s6, 0x40004 +; 
GFX9-NEXT: s_bfe_u32 s12, s6, 0x40018 +; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40014 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x4000c +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s17, s6, 0x40004 ; GFX9-NEXT: s_and_b32 s6, s6, 15 -; GFX9-NEXT: s_lshr_b32 s4, s2, 28 -; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40018 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s12, s2, 0x40004 +; GFX9-NEXT: s_lshr_b32 s3, s2, 28 +; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x4000c +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s11, s2, 0x40004 ; GFX9-NEXT: s_and_b32 s2, s2, 15 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s19 -; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s18 -; GFX9-NEXT: v_mad_u32_u24 v0, s12, v1, v0 +; GFX9-NEXT: v_mad_u32_u24 v0, s2, v0, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-NEXT: v_mad_u32_u24 v0, s11, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 @@ -1898,8 +1896,10 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: v_mad_u32_u24 v0, s8, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: v_mad_u32_u24 v0, s5, v1, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v1, v0 +; GFX9-NEXT: v_mad_u32_u24 v2, s3, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -1911,11 +1911,11 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s2, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX9-DL-NEXT: s_load_dword s3, s[0:1], 0x0 ; GFX9-DL-NEXT: s_load_dword s4, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s6 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-DL-NEXT: v_dot8_u32_u4 v2, s4, v0, v1 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -1924,18 +1924,18 @@ define amdgpu_kernel void @udot8_acc32_vecMul(<8 x i4> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: udot8_acc32_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s2, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s1, s2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s6 +; GFX10-DL-NEXT: v_dot8_u32_u4 v2, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-DL-NEXT: global_store_dword v[0:1], v2, off 
; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -1976,59 +1976,59 @@ entry: define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc16_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ushort v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s11, s0, 0x40004 -; GFX7-NEXT: s_bfe_u32 s18, s1, 0x40004 -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 -; GFX7-NEXT: s_and_b32 s19, s1, 15 -; GFX7-NEXT: s_lshr_b32 s14, s1, 28 -; GFX7-NEXT: s_bfe_u32 s1, s1, 0x40008 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x4000c -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mul_u32_u24_e32 v2, s13, v2 -; GFX7-NEXT: v_mul_u32_u24_e32 v4, s11, v4 -; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 -; GFX7-NEXT: s_and_b32 s12, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mul_u32_u24_e32 v1, s0, v1 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x40004 +; GFX7-NEXT: s_bfe_u32 s17, s5, 0x40004 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v4, s17 +; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 +; GFX7-NEXT: s_and_b32 s18, s5, 15 +; GFX7-NEXT: s_lshr_b32 s13, s5, 28 +; GFX7-NEXT: s_bfe_u32 s5, s5, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x4000c +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mul_u32_u24_e32 v2, s12, v2 +; GFX7-NEXT: v_mul_u32_u24_e32 v4, s10, v4 +; GFX7-NEXT: s_lshr_b32 s6, s4, 28 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX7-NEXT: s_and_b32 s11, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-NEXT: s_bfe_u32 s4, s4, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mul_u32_u24_e32 v1, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: v_mul_u32_u24_e32 v3, s12, v3 +; GFX7-NEXT: v_mul_u32_u24_e32 v3, s11, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4 ; GFX7-NEXT: v_or_b32_e32 v1, v1, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v4 ; GFX7-NEXT: v_alignbit_b32 v3, v1, v2, 16 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: v_mov_b32_e32 v6, s15 +; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v3, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v6, v0 -; 
GFX7-NEXT: v_mad_u32_u24 v0, s8, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 -; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 +; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc16_vecMul: @@ -2043,38 +2043,38 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 -; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX8-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 +; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX8-NEXT: s_lshr_b32 s9, s1, 28 ; GFX8-NEXT: s_and_b32 s1, s1, 15 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x4000c -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX8-NEXT: s_bfe_u32 s9, s0, 0x40004 +; GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x4000c +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 ; GFX8-NEXT: s_and_b32 s0, s0, 15 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: v_mov_b32_e32 v5, s15 -; GFX8-NEXT: v_mov_b32_e32 v6, s14 -; GFX8-NEXT: v_mov_b32_e32 v7, s13 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 +; GFX8-NEXT: v_mov_b32_e32 v6, s13 +; GFX8-NEXT: v_mov_b32_e32 v7, s12 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mov_b32_e32 v9, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v2, s0, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2088,36 +2088,36 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s7, s6, 0x40018 -; GFX9-NEXT: s_lshr_b32 s13, s6, 28 -; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s13 -; GFX9-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-NEXT: s_lshr_b32 s5, s2, 28 -; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40010 -; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40014 -; GFX9-NEXT: 
s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-NEXT: s_lshr_b32 s12, s6, 28 +; GFX9-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX9-NEXT: s_bfe_u32 s3, s2, 0x40018 +; GFX9-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-NEXT: s_bfe_u32 s13, s6, 0x40010 +; GFX9-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, s4, v0 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s14, s15 -; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-NEXT: s_bfe_u32 s16, s6, 0x40008 -; GFX9-NEXT: s_bfe_u32 s17, s6, 0x4000c -; GFX9-NEXT: s_and_b32 s18, s6, 15 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s8, s9 -; GFX9-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX9-NEXT: v_pk_mul_lo_u16 v2, s3, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s13, s14 +; GFX9-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX9-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-NEXT: s_bfe_u32 s15, s6, 0x40008 +; GFX9-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX9-NEXT: s_and_b32 s17, s6, 15 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s5, s8 +; GFX9-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-NEXT: s_bfe_u32 s10, s2, 0x4000c ; GFX9-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s16, s17 -; GFX9-NEXT: v_pk_mul_lo_u16 v3, s5, v0 -; GFX9-NEXT: s_and_b32 s12, s2, 15 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s15, s16 +; GFX9-NEXT: v_pk_mul_lo_u16 v3, s4, v0 +; GFX9-NEXT: s_and_b32 s11, s2, 15 ; GFX9-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: s_pack_ll_b32_b16 s5, s10, s11 -; GFX9-NEXT: s_pack_ll_b32_b16 s4, s18, s6 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, s5, v0 -; GFX9-NEXT: s_pack_ll_b32_b16 s2, s12, s2 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_pack_ll_b32_b16 s4, s9, s10 +; GFX9-NEXT: s_pack_ll_b32_b16 s3, s17, s6 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, s4, v0 +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_pk_mul_lo_u16 v5, s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 @@ -2143,36 +2143,36 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s2, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_bfe_u32 s7, s6, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s13, s6, 28 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s13 -; GFX9-DL-NEXT: s_bfe_u32 s4, s2, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s5, s2, 28 -; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40014 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 +; GFX9-DL-NEXT: s_lshr_b32 s12, s6, 28 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s7, s7, s12 +; GFX9-DL-NEXT: s_bfe_u32 s3, s2, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s4, s2, 28 +; GFX9-DL-NEXT: s_bfe_u32 s13, s6, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s14, s6, 0x40014 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s4, v0 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s14, s15 -; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s17, s6, 0x4000c -; GFX9-DL-NEXT: s_and_b32 s18, s6, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s8, s9 -; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x40008 -; GFX9-DL-NEXT: s_bfe_u32 s11, s2, 0x4000c +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, s3, v0 +; GFX9-DL-NEXT: 
s_pack_ll_b32_b16 s3, s13, s14 +; GFX9-DL-NEXT: s_bfe_u32 s5, s2, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s8, s2, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s15, s6, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s16, s6, 0x4000c +; GFX9-DL-NEXT: s_and_b32 s17, s6, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s5, s8 +; GFX9-DL-NEXT: s_bfe_u32 s9, s2, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s10, s2, 0x4000c ; GFX9-DL-NEXT: s_bfe_u32 s6, s6, 0x40004 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s16, s17 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s5, v0 -; GFX9-DL-NEXT: s_and_b32 s12, s2, 15 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s15, s16 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v3, s4, v0 +; GFX9-DL-NEXT: s_and_b32 s11, s2, 15 ; GFX9-DL-NEXT: s_bfe_u32 s2, s2, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s5, s10, s11 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s18, s6 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s5, v0 -; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s12, s2 -; GFX9-DL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s4, s9, s10 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s3, s17, s6 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, s4, v0 +; GFX9-DL-NEXT: s_pack_ll_b32_b16 s2, s11, s2 +; GFX9-DL-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, s2, v0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 @@ -2191,47 +2191,47 @@ define amdgpu_kernel void @udot8_acc16_vecMul(<8 x i4> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ushort v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s8, s0, 0x4000c -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s6 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s5 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40008 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s4 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s6, s7 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s5, s8 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s7, s0, 0x4000c +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s5 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40008 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s3, s4 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40008 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s3 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s3, s5, s6 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s4, s7 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 -; GFX10-DL-NEXT: 
s_bfe_u32 s7, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40014 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s5, s4 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s6 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s4, s3 +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s2, s2, s5 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s5, s7, s8 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40018 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s4, s6, s7 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s4, s0 -; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s6, s1 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX10-DL-NEXT: s_pack_ll_b32_b16 s1, s5, s1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s5 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v3, s2, s4 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:BYTE_0 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, s0, s1 @@ -2278,53 +2278,53 @@ entry: define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc8_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_bfe_u32 s2, s0, 0x4000c -; GFX7-NEXT: s_bfe_u32 s14, s1, 0x4000c -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX7-NEXT: s_lshr_b32 s18, s1, 28 -; GFX7-NEXT: v_mov_b32_e32 v8, s14 -; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40008 -; GFX7-NEXT: s_and_b32 s17, s1, 15 -; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40004 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 -; GFX7-NEXT: s_lshr_b32 s11, s0, 28 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mul_u32_u24_e32 v4, s11, v4 -; GFX7-NEXT: v_mul_u32_u24_e32 v6, s9, v6 -; GFX7-NEXT: v_mul_u32_u24_e32 v8, s2, v8 -; GFX7-NEXT: s_bfe_u32 s1, s1, 0x40010 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40008 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 -; GFX7-NEXT: s_and_b32 s10, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40018 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40014 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mul_u32_u24_e32 v2, s13, v2 -; GFX7-NEXT: s_bfe_u32 s0, s0, 0x40010 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mul_u32_u24_e32 v3, s12, v3 +; GFX7-NEXT: s_bfe_u32 s6, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s13, s5, 0x4000c +; GFX7-NEXT: 
s_bfe_u32 s15, s5, 0x40004 +; GFX7-NEXT: s_lshr_b32 s17, s5, 28 +; GFX7-NEXT: v_mov_b32_e32 v8, s13 +; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40008 +; GFX7-NEXT: s_and_b32 s16, s5, 15 +; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40018 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40004 +; GFX7-NEXT: v_mov_b32_e32 v6, s15 +; GFX7-NEXT: s_lshr_b32 s10, s4, 28 +; GFX7-NEXT: v_mov_b32_e32 v4, s17 +; GFX7-NEXT: v_mul_u32_u24_e32 v4, s10, v4 +; GFX7-NEXT: v_mul_u32_u24_e32 v6, s8, v6 +; GFX7-NEXT: v_mul_u32_u24_e32 v8, s6, v8 +; GFX7-NEXT: s_bfe_u32 s5, s5, 0x40010 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40008 +; GFX7-NEXT: v_mov_b32_e32 v7, s14 +; GFX7-NEXT: s_and_b32 s9, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40018 +; GFX7-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40014 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mul_u32_u24_e32 v2, s12, v2 +; GFX7-NEXT: s_bfe_u32 s4, s4, 0x40010 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mul_u32_u24_e32 v3, s11, v3 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 8, v4 -; GFX7-NEXT: v_mul_u32_u24_e32 v5, s10, v5 -; GFX7-NEXT: v_mul_u32_u24_e32 v7, s8, v7 +; GFX7-NEXT: v_mul_u32_u24_e32 v5, s9, v5 +; GFX7-NEXT: v_mul_u32_u24_e32 v7, s7, v7 ; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v6 ; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v8 ; GFX7-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX7-NEXT: v_or_b32_e32 v4, v5, v6 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX7-NEXT: v_mul_u32_u24_e32 v9, s0, v1 +; GFX7-NEXT: v_mul_u32_u24_e32 v9, s4, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX7-NEXT: v_or_b32_e32 v2, v9, v2 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3 @@ -2342,11 +2342,11 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v4, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v5, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v7 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v8 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc8_vecMul: @@ -2361,42 +2361,42 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s2, s[6:7], 0x0 ; GFX8-NEXT: s_mov_b32 s0, 0xffff ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s8, s1, 0x40004 -; GFX8-NEXT: s_bfe_u32 s10, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s15, s2, 0x40004 -; GFX8-NEXT: s_and_b32 s16, s2, 15 -; GFX8-NEXT: s_bfe_u32 s17, s2, 0x4000c -; GFX8-NEXT: s_bfe_u32 s4, s1, 0x40014 -; GFX8-NEXT: s_lshr_b32 s6, s1, 28 -; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40014 -; GFX8-NEXT: s_bfe_u32 s12, s2, 0x40010 -; GFX8-NEXT: s_lshr_b32 s13, s2, 28 -; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40018 +; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40004 +; GFX8-NEXT: s_bfe_u32 s9, s1, 0x4000c +; GFX8-NEXT: s_bfe_u32 s14, s2, 0x40004 +; GFX8-NEXT: s_and_b32 s15, s2, 15 +; GFX8-NEXT: s_bfe_u32 s16, s2, 0x4000c +; GFX8-NEXT: s_bfe_u32 s3, s1, 0x40014 +; GFX8-NEXT: s_lshr_b32 s5, s1, 28 +; GFX8-NEXT: s_bfe_u32 s10, s2, 0x40014 +; GFX8-NEXT: s_bfe_u32 s11, s2, 0x40010 +; GFX8-NEXT: s_lshr_b32 s12, s2, 28 +; GFX8-NEXT: s_bfe_u32 s13, s2, 0x40018 ; GFX8-NEXT: s_bfe_u32 s2, s2, 0x40008 -; GFX8-NEXT: s_and_b32 s9, s1, 15 -; GFX8-NEXT: v_mov_b32_e32 v4, s17 -; GFX8-NEXT: v_mov_b32_e32 v5, s10 -; GFX8-NEXT: v_mov_b32_e32 v6, s16 -; GFX8-NEXT: v_mov_b32_e32 v7, s15 
-; GFX8-NEXT: v_mov_b32_e32 v8, s8 +; GFX8-NEXT: s_and_b32 s8, s1, 15 +; GFX8-NEXT: v_mov_b32_e32 v4, s16 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_mov_b32_e32 v6, s15 +; GFX8-NEXT: v_mov_b32_e32 v7, s14 +; GFX8-NEXT: v_mov_b32_e32 v8, s7 ; GFX8-NEXT: v_mul_u32_u24_sdwa v4, v5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_u32_u24_e32 v5, s9, v6 +; GFX8-NEXT: v_mul_u32_u24_e32 v5, s8, v6 ; GFX8-NEXT: v_mul_u32_u24_sdwa v6, v8, v7 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: s_bfe_u32 s5, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s7, s1, 0x40018 -; GFX8-NEXT: v_mov_b32_e32 v9, s14 +; GFX8-NEXT: s_bfe_u32 s4, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s1, 0x40018 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x40008 ; GFX8-NEXT: v_mov_b32_e32 v3, s2 -; GFX8-NEXT: v_mov_b32_e32 v10, s13 -; GFX8-NEXT: v_mov_b32_e32 v11, s6 -; GFX8-NEXT: v_mov_b32_e32 v12, s12 -; GFX8-NEXT: v_mov_b32_e32 v13, s11 -; GFX8-NEXT: v_mov_b32_e32 v14, s4 +; GFX8-NEXT: v_mov_b32_e32 v10, s12 +; GFX8-NEXT: v_mov_b32_e32 v11, s5 +; GFX8-NEXT: v_mov_b32_e32 v12, s11 +; GFX8-NEXT: v_mov_b32_e32 v13, s10 +; GFX8-NEXT: v_mov_b32_e32 v14, s3 ; GFX8-NEXT: v_mul_u32_u24_e32 v3, s1, v3 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v6 -; GFX8-NEXT: v_mul_u32_u24_e32 v7, s7, v9 +; GFX8-NEXT: v_mul_u32_u24_e32 v7, s6, v9 ; GFX8-NEXT: v_mul_u32_u24_sdwa v8, v11, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_u32_u24_e32 v9, s5, v12 +; GFX8-NEXT: v_mul_u32_u24_e32 v9, s4, v12 ; GFX8-NEXT: v_mul_u32_u24_sdwa v10, v14, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX8-NEXT: v_and_b32_e32 v5, s0, v5 ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -2431,40 +2431,40 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40018 -; GFX9-NEXT: s_lshr_b32 s14, s1, 28 -; GFX9-NEXT: s_and_b32 s15, s1, 15 -; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-NEXT: s_bfe_u32 s17, s1, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-NEXT: s_lshr_b32 s13, s1, 28 +; GFX9-NEXT: s_and_b32 s14, s1, 15 +; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40018 -; GFX9-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-NEXT: s_lshr_b32 s7, s0, 28 -; GFX9-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-NEXT: s_and_b32 s8, s0, 15 -; GFX9-NEXT: v_mov_b32_e32 v7, s15 -; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-NEXT: s_bfe_u32 s10, s0, 0x40008 -; GFX9-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-NEXT: s_lshr_b32 s6, s0, 28 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-NEXT: s_and_b32 s7, s0, 15 +; GFX9-NEXT: v_mov_b32_e32 v7, s14 
+; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v8, s15 +; GFX9-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-NEXT: v_mov_b32_e32 v9, s16 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v10, s1 -; GFX9-NEXT: v_mul_lo_u16_e32 v3, s4, v3 -; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v5, s6, v5 -; GFX9-NEXT: v_mul_lo_u16_sdwa v6, s7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v7, s8, v7 -; GFX9-NEXT: v_mul_lo_u16_sdwa v8, s9, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v3, s3, v3 +; GFX9-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v5, s5, v5 +; GFX9-NEXT: v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v7, s7, v7 +; GFX9-NEXT: v_mul_lo_u16_sdwa v8, s8, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX9-NEXT: v_mul_lo_u16_e32 v9, s10, v9 +; GFX9-NEXT: v_mul_lo_u16_e32 v9, s9, v9 ; GFX9-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_and_b32_e32 v5, s2, v5 ; GFX9-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -2497,40 +2497,40 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40018 -; GFX9-DL-NEXT: s_lshr_b32 s14, s1, 28 -; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40004 -; GFX9-DL-NEXT: s_bfe_u32 s17, s1, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s11 +; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40018 +; GFX9-DL-NEXT: s_lshr_b32 s13, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s14, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 +; GFX9-DL-NEXT: s_bfe_u32 s16, s1, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 ; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40018 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s13 -; GFX9-DL-NEXT: s_lshr_b32 s7, s0, 28 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 -; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s15 -; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s16 -; GFX9-DL-NEXT: s_bfe_u32 s10, s0, 0x40008 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s17 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s12 +; GFX9-DL-NEXT: s_lshr_b32 s6, s0, 28 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 +; GFX9-DL-NEXT: s_and_b32 s7, s0, 15 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s14 +; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 +; GFX9-DL-NEXT: 
v_mov_b32_e32 v8, s15 +; GFX9-DL-NEXT: s_bfe_u32 s9, s0, 0x40008 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s16 ; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v10, s1 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s4, v3 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s5, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s6, v5 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, s7, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, s8, v7 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, s9, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v3, s3, v3 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, s4, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, s5, v5 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, s6, v6 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, s7, v7 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, s8, v8 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-DL-NEXT: v_or_b32_sdwa v4, v5, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_or_b32_e32 v5, v7, v8 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s10, v9 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v9, s9, v9 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, s0, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_and_b32_e32 v5, s2, v5 ; GFX9-DL-NEXT: v_or_b32_sdwa v6, v9, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD @@ -2553,58 +2553,58 @@ define amdgpu_kernel void @udot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40004 -; GFX10-DL-NEXT: s_and_b32 s5, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s7, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x4000c -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s2, s4 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40004 +; GFX10-DL-NEXT: s_and_b32 s4, s0, 15 +; GFX10-DL-NEXT: s_and_b32 s6, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v3, s2, s3 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s5, s7 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40008 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s8 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v4, s4, s6 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40008 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s5, s7 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v3 -; GFX10-DL-NEXT: s_mov_b32 
s5, 0xffff -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s4 +; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s3 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v5, 8, v5 ; GFX10-DL-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x40014 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40018 ; GFX10-DL-NEXT: v_or_b32_sdwa v4, v6, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_and_b32_e32 v3, s5, v3 -; GFX10-DL-NEXT: s_bfe_u32 s8, s1, 0x40010 -; GFX10-DL-NEXT: s_lshr_b32 s9, s1, 28 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3 +; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 +; GFX10-DL-NEXT: s_lshr_b32 s8, s1, 28 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s4, s7 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s3, s6 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v4 ; GFX10-DL-NEXT: s_bfe_u32 s1, s1, 0x40018 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s8 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s9 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v6, s2, s7 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v7, s0, s8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v4 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v7, 8, v7 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v3, v2 ; GFX10-DL-NEXT: v_lshlrev_b16_e64 v3, 8, v5 -; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s6, s1 +; GFX10-DL-NEXT: v_mul_lo_u16_e64 v5, s5, s1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v8 ; GFX10-DL-NEXT: v_or_b32_e32 v3, v6, v3 ; GFX10-DL-NEXT: v_or_b32_sdwa v5, v5, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_2 -; GFX10-DL-NEXT: v_and_b32_e32 v3, s5, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, s4, v3 ; GFX10-DL-NEXT: v_add_nc_u32_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 ; GFX10-DL-NEXT: v_or_b32_e32 v4, v3, v5 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 @@ -2651,50 +2651,50 @@ entry: define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX7-LABEL: udot8_acc4_vecMul: ; GFX7: ; %bb.0: ; %entry -; GFX7-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 -; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd -; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_load_ubyte v0, off, s[4:7], 0 -; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0 -; GFX7-NEXT: s_load_dword s1, s[10:11], 0x0 +; GFX7-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 +; GFX7-NEXT: s_load_dword s4, s[4:5], 0x0 +; GFX7-NEXT: s_load_dword s5, s[6:7], 0x0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_lshr_b32 s2, s0, 28 -; GFX7-NEXT: s_bfe_u32 s15, s1, 0x40018 -; GFX7-NEXT: s_bfe_u32 s16, s1, 0x40014 -; GFX7-NEXT: s_bfe_u32 s17, s1, 0x40010 -; GFX7-NEXT: s_bfe_u32 s18, s1, 0x4000c -; GFX7-NEXT: s_bfe_u32 s19, s1, 0x40008 -; GFX7-NEXT: s_bfe_u32 s20, s1, 0x40004 -; GFX7-NEXT: s_lshr_b32 s14, s1, 28 -; GFX7-NEXT: s_and_b32 s1, s1, 15 -; GFX7-NEXT: s_bfe_u32 s8, s0, 0x40018 -; GFX7-NEXT: s_bfe_u32 s9, s0, 0x40014 -; GFX7-NEXT: s_bfe_u32 s10, s0, 0x40010 -; GFX7-NEXT: s_bfe_u32 s11, s0, 0x4000c 
-; GFX7-NEXT: s_bfe_u32 s12, s0, 0x40008 -; GFX7-NEXT: s_bfe_u32 s13, s0, 0x40004 -; GFX7-NEXT: s_and_b32 s0, s0, 15 -; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: v_mov_b32_e32 v2, s20 -; GFX7-NEXT: v_mov_b32_e32 v3, s19 -; GFX7-NEXT: v_mov_b32_e32 v4, s18 -; GFX7-NEXT: v_mov_b32_e32 v5, s17 -; GFX7-NEXT: v_mov_b32_e32 v6, s16 -; GFX7-NEXT: v_mov_b32_e32 v7, s15 +; GFX7-NEXT: s_lshr_b32 s6, s4, 28 +; GFX7-NEXT: s_bfe_u32 s14, s5, 0x40018 +; GFX7-NEXT: s_bfe_u32 s15, s5, 0x40014 +; GFX7-NEXT: s_bfe_u32 s16, s5, 0x40010 +; GFX7-NEXT: s_bfe_u32 s17, s5, 0x4000c +; GFX7-NEXT: s_bfe_u32 s18, s5, 0x40008 +; GFX7-NEXT: s_bfe_u32 s19, s5, 0x40004 +; GFX7-NEXT: s_lshr_b32 s13, s5, 28 +; GFX7-NEXT: s_and_b32 s5, s5, 15 +; GFX7-NEXT: s_bfe_u32 s7, s4, 0x40018 +; GFX7-NEXT: s_bfe_u32 s8, s4, 0x40014 +; GFX7-NEXT: s_bfe_u32 s9, s4, 0x40010 +; GFX7-NEXT: s_bfe_u32 s10, s4, 0x4000c +; GFX7-NEXT: s_bfe_u32 s11, s4, 0x40008 +; GFX7-NEXT: s_bfe_u32 s12, s4, 0x40004 +; GFX7-NEXT: s_and_b32 s4, s4, 15 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s19 +; GFX7-NEXT: v_mov_b32_e32 v3, s18 +; GFX7-NEXT: v_mov_b32_e32 v4, s17 +; GFX7-NEXT: v_mov_b32_e32 v5, s16 +; GFX7-NEXT: v_mov_b32_e32 v6, s15 +; GFX7-NEXT: v_mov_b32_e32 v7, s14 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v0, s0, v1, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s13, v2, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s12, v3, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s11, v4, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s10, v5, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s9, v6, v0 -; GFX7-NEXT: v_mad_u32_u24 v0, s8, v7, v0 -; GFX7-NEXT: v_mov_b32_e32 v1, s14 -; GFX7-NEXT: v_mad_u32_u24 v0, s2, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s4, v1, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s12, v2, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s11, v3, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s10, v4, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s9, v5, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s8, v6, v0 +; GFX7-NEXT: v_mad_u32_u24 v0, s7, v7, v0 +; GFX7-NEXT: v_mov_b32_e32 v1, s13 +; GFX7-NEXT: v_mad_u32_u24 v0, s6, v1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX7-NEXT: buffer_store_byte v0, off, s[4:7], 0 +; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX7-NEXT: s_endpgm ; ; GFX8-LABEL: udot8_acc4_vecMul: @@ -2708,41 +2708,41 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX8-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX8-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s9, s0, 15 -; GFX8-NEXT: s_and_b32 s16, s1, 15 -; GFX8-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v4, s16 -; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX8-NEXT: s_lshr_b32 s10, s1, 28 +; GFX8-NEXT: s_and_b32 s8, s0, 15 +; GFX8-NEXT: s_and_b32 s15, s1, 15 +; GFX8-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v4, s15 +; GFX8-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX8-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX8-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX8-NEXT: s_bfe_u32 s13, s1, 0x40008 +; GFX8-NEXT: s_lshr_b32 s9, s1, 28 ; GFX8-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX8-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX8-NEXT: v_mov_b32_e32 v5, s15 +; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX8-NEXT: v_mov_b32_e32 v5, s14 ; GFX8-NEXT: s_lshr_b32 s2, s0, 28 -; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX8-NEXT: s_bfe_u32 s7, s0, 0x40008 +; 
GFX8-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX8-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX8-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX8-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX8-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v6, s14 +; GFX8-NEXT: v_mov_b32_e32 v6, s13 ; GFX8-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX8-NEXT: v_mov_b32_e32 v7, s13 -; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s11 +; GFX8-NEXT: v_mov_b32_e32 v7, s12 +; GFX8-NEXT: v_mov_b32_e32 v8, s11 +; GFX8-NEXT: v_mov_b32_e32 v9, s10 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v2, s9, v4, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s8, v4, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX8-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, s10 +; GFX8-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX8-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 @@ -2759,41 +2759,41 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_and_b32 s9, s0, 15 -; GFX9-NEXT: s_and_b32 s16, s1, 15 -; GFX9-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX9-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-NEXT: s_and_b32 s8, s0, 15 +; GFX9-NEXT: s_and_b32 s15, s1, 15 +; GFX9-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-NEXT: s_bfe_u32 s13, s1, 0x40008 +; GFX9-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX9-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX9-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX9-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v2, s9, v4, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s8, v4, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, 
s7, v5, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-NEXT: global_store_byte v[0:1], v2, off @@ -2810,41 +2810,41 @@ define amdgpu_kernel void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 ; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_and_b32 s9, s0, 15 -; GFX9-DL-NEXT: s_and_b32 s16, s1, 15 -; GFX9-DL-NEXT: s_bfe_u32 s15, s1, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v4, s16 -; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40008 -; GFX9-DL-NEXT: s_lshr_b32 s10, s1, 28 +; GFX9-DL-NEXT: s_and_b32 s8, s0, 15 +; GFX9-DL-NEXT: s_and_b32 s15, s1, 15 +; GFX9-DL-NEXT: s_bfe_u32 s14, s1, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, s15 +; GFX9-DL-NEXT: s_bfe_u32 s10, s1, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s11, s1, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s12, s1, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s13, s1, 0x40008 +; GFX9-DL-NEXT: s_lshr_b32 s9, s1, 28 ; GFX9-DL-NEXT: s_bfe_u32 s1, s1, 0x4000c -; GFX9-DL-NEXT: s_bfe_u32 s8, s0, 0x40004 -; GFX9-DL-NEXT: v_mov_b32_e32 v5, s15 +; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40004 +; GFX9-DL-NEXT: v_mov_b32_e32 v5, s14 ; GFX9-DL-NEXT: s_lshr_b32 s2, s0, 28 -; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40018 -; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 -; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX9-DL-NEXT: s_bfe_u32 s7, s0, 0x40008 +; GFX9-DL-NEXT: s_bfe_u32 s3, s0, 0x40018 +; GFX9-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 +; GFX9-DL-NEXT: s_bfe_u32 s5, s0, 0x40010 +; GFX9-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 ; GFX9-DL-NEXT: s_bfe_u32 s0, s0, 0x4000c ; GFX9-DL-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v6, s14 +; GFX9-DL-NEXT: v_mov_b32_e32 v6, s13 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v3, s0, v3 ; GFX9-DL-NEXT: v_and_b32_e32 v3, 15, v3 -; GFX9-DL-NEXT: v_mov_b32_e32 v7, s13 -; GFX9-DL-NEXT: v_mov_b32_e32 v8, s12 -; GFX9-DL-NEXT: v_mov_b32_e32 v9, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v7, s12 +; GFX9-DL-NEXT: v_mov_b32_e32 v8, s11 +; GFX9-DL-NEXT: v_mov_b32_e32 v9, s10 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s9, v4, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v5, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v6, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s8, v4, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s7, v5, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v6, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: v_add_u32_e32 v2, v2, v3 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s6, v7, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v8, v2 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v9, v2 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, s10 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s5, v7, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s4, v8, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v2, s3, v9, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-DL-NEXT: v_mad_u32_u24 v2, s2, v3, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX9-DL-NEXT: global_store_byte v[0:1], v2, off @@ -2852,44 +2852,44 @@ define amdgpu_kernel 
void @udot8_acc4_vecMul(<8 x i4> addrspace(1)* %src1, ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: ; implicit-def: $vcc_hi ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-DL-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: global_load_ubyte v2, v[0:1], off ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_and_b32 s2, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s4, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 +; GFX10-DL-NEXT: s_and_b32 s3, s1, 15 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40008 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s6, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s7, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s4, s5 +; GFX10-DL-NEXT: s_bfe_u32 s3, s0, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s6, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e64 v3, s3, s4 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s0, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x40014 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v2, v2, v3 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: s_bfe_u32 s2, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s3, s1, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s5, s6, v2 -; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s4, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s4, s5, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v2, s2, s3, v2 ; GFX10-DL-NEXT: v_mad_u32_u24 v2, s0, s1, v2 ; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX10-DL-NEXT: global_store_byte v[0:1], v2, off diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll index 2d85df7c2b94c..f004bbe500936 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -16,40 +16,40 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) { ; GCN: renamable $sgpr2 = COPY renamable $sgpr1 ; GCN: renamable $sgpr0 = COPY renamable $sgpr0, implicit killed $sgpr0_sgpr1 ; GCN: renamable $sgpr1 = S_MOV_B32 61440 - ; GCN: renamable $sgpr4 = S_MOV_B32 -1 - ; 
GCN: undef renamable $sgpr8 = COPY killed renamable $sgpr0, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN: renamable $sgpr9 = COPY killed renamable $sgpr2 - ; GCN: renamable $sgpr10 = COPY killed renamable $sgpr4 - ; GCN: renamable $sgpr11 = COPY killed renamable $sgpr1 + ; GCN: renamable $sgpr3 = S_MOV_B32 -1 + ; GCN: undef renamable $sgpr4 = COPY killed renamable $sgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 + ; GCN: renamable $sgpr5 = COPY killed renamable $sgpr2 + ; GCN: renamable $sgpr6 = COPY killed renamable $sgpr3 + ; GCN: renamable $sgpr7 = COPY killed renamable $sgpr1 ; GCN: renamable $sgpr0 = S_MOV_B32 16 ; GCN: renamable $sgpr1 = S_MOV_B32 15 ; GCN: renamable $sgpr2 = S_MOV_B32 14 - ; GCN: renamable $sgpr4 = S_MOV_B32 13 - ; GCN: renamable $sgpr5 = S_MOV_B32 12 - ; GCN: renamable $sgpr6 = S_MOV_B32 11 - ; GCN: renamable $sgpr7 = S_MOV_B32 10 - ; GCN: renamable $sgpr12 = S_MOV_B32 9 - ; GCN: renamable $sgpr13 = S_MOV_B32 8 - ; GCN: renamable $sgpr14 = S_MOV_B32 7 - ; GCN: renamable $sgpr15 = S_MOV_B32 6 - ; GCN: renamable $sgpr16 = S_MOV_B32 5 - ; GCN: renamable $sgpr17 = S_MOV_B32 3 - ; GCN: renamable $sgpr18 = S_MOV_B32 2 - ; GCN: renamable $sgpr19 = S_MOV_B32 1 - ; GCN: renamable $sgpr20 = S_MOV_B32 0 - ; GCN: renamable $vgpr1 = COPY killed renamable $sgpr20 - ; GCN: renamable $vgpr2 = COPY killed renamable $sgpr19 - ; GCN: renamable $vgpr3 = COPY killed renamable $sgpr18 - ; GCN: renamable $vgpr4 = COPY killed renamable $sgpr17 - ; GCN: renamable $vgpr5 = COPY killed renamable $sgpr16 - ; GCN: renamable $vgpr6 = COPY killed renamable $sgpr15 - ; GCN: renamable $vgpr7 = COPY killed renamable $sgpr14 - ; GCN: renamable $vgpr8 = COPY killed renamable $sgpr13 - ; GCN: renamable $vgpr9 = COPY killed renamable $sgpr12 - ; GCN: renamable $vgpr10 = COPY killed renamable $sgpr7 - ; GCN: renamable $vgpr11 = COPY killed renamable $sgpr6 - ; GCN: renamable $vgpr12 = COPY killed renamable $sgpr5 - ; GCN: renamable $vgpr13 = COPY killed renamable $sgpr4 + ; GCN: renamable $sgpr3 = S_MOV_B32 13 + ; GCN: renamable $sgpr8 = S_MOV_B32 12 + ; GCN: renamable $sgpr9 = S_MOV_B32 11 + ; GCN: renamable $sgpr10 = S_MOV_B32 10 + ; GCN: renamable $sgpr11 = S_MOV_B32 9 + ; GCN: renamable $sgpr12 = S_MOV_B32 8 + ; GCN: renamable $sgpr13 = S_MOV_B32 7 + ; GCN: renamable $sgpr14 = S_MOV_B32 6 + ; GCN: renamable $sgpr15 = S_MOV_B32 5 + ; GCN: renamable $sgpr16 = S_MOV_B32 3 + ; GCN: renamable $sgpr17 = S_MOV_B32 2 + ; GCN: renamable $sgpr18 = S_MOV_B32 1 + ; GCN: renamable $sgpr19 = S_MOV_B32 0 + ; GCN: renamable $vgpr1 = COPY killed renamable $sgpr19 + ; GCN: renamable $vgpr2 = COPY killed renamable $sgpr18 + ; GCN: renamable $vgpr3 = COPY killed renamable $sgpr17 + ; GCN: renamable $vgpr4 = COPY killed renamable $sgpr16 + ; GCN: renamable $vgpr5 = COPY killed renamable $sgpr15 + ; GCN: renamable $vgpr6 = COPY killed renamable $sgpr14 + ; GCN: renamable $vgpr7 = COPY killed renamable $sgpr13 + ; GCN: renamable $vgpr8 = COPY killed renamable $sgpr12 + ; GCN: renamable $vgpr9 = COPY killed renamable $sgpr11 + ; GCN: renamable $vgpr10 = COPY killed renamable $sgpr10 + ; GCN: renamable $vgpr11 = COPY killed renamable $sgpr9 + ; GCN: renamable $vgpr12 = COPY killed renamable $sgpr8 + ; GCN: renamable $vgpr13 = COPY killed renamable $sgpr3 ; GCN: renamable $vgpr14 = COPY killed renamable $sgpr2 ; GCN: renamable $vgpr15 = COPY killed renamable $sgpr1 ; GCN: renamable $vgpr16 = COPY killed renamable $sgpr0 @@ -69,44 +69,44 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) { ; GCN: 
renamable $vgpr30 = COPY killed renamable $vgpr14 ; GCN: renamable $vgpr31 = COPY killed renamable $vgpr15 ; GCN: renamable $vgpr32 = COPY killed renamable $vgpr16 - ; GCN: renamable $sgpr22_sgpr23 = S_MOV_B64 $exec + ; GCN: renamable $sgpr20_sgpr21 = S_MOV_B64 $exec ; GCN: renamable $vgpr1 = IMPLICIT_DEF - ; GCN: renamable $sgpr24_sgpr25 = IMPLICIT_DEF - ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) - ; GCN: SI_SPILL_S128_SAVE killed $sgpr8_sgpr9_sgpr10_sgpr11, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 16 into %stack.1, align 4, addrspace 5) - ; GCN: SI_SPILL_V512_SAVE killed $vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32, %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 64 into %stack.2, align 4, addrspace 5) - ; GCN: SI_SPILL_S64_SAVE killed $sgpr22_sgpr23, %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.3, align 4, addrspace 5) - ; GCN: SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5) - ; GCN: SI_SPILL_S64_SAVE killed $sgpr24_sgpr25, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.5, align 4, addrspace 5) + ; GCN: renamable $sgpr22_sgpr23 = IMPLICIT_DEF + ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) + ; GCN: SI_SPILL_S128_SAVE killed $sgpr4_sgpr5_sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 16 into %stack.1, align 4, addrspace 5) + ; GCN: SI_SPILL_V512_SAVE killed $vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32, %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 64 into %stack.2, align 4, addrspace 5) + ; GCN: SI_SPILL_S64_SAVE killed $sgpr20_sgpr21, %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.3, align 4, addrspace 5) + ; GCN: SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5) + ; GCN: SI_SPILL_S64_SAVE killed $sgpr22_sgpr23, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.5, align 4, addrspace 5) ; GCN: bb.1: ; GCN: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 8 from %stack.5, align 4, addrspace 5) - ; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5) - ; GCN: $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) + ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 8 from %stack.5, align 4, addrspace 5) + ; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5) + ; GCN: $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, 
$sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) ; GCN: renamable $sgpr2 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec ; GCN: renamable $sgpr4_sgpr5 = V_CMP_EQ_U32_e64 $sgpr2, killed $vgpr1, implicit $exec ; GCN: renamable $sgpr4_sgpr5 = S_AND_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def $scc, implicit $exec ; GCN: S_SET_GPR_IDX_ON killed renamable $sgpr2, 1, implicit-def $m0, implicit undef $m0 - ; GCN: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = SI_SPILL_V512_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 64 from %stack.2, align 4, addrspace 5) + ; GCN: $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17 = SI_SPILL_V512_RESTORE %stack.2, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 64 from %stack.2, align 4, addrspace 5) ; GCN: renamable $vgpr18 = V_MOV_B32_e32 undef $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17, implicit $m0 ; GCN: S_SET_GPR_IDX_OFF ; GCN: renamable $vgpr19 = COPY renamable $vgpr18 ; GCN: renamable $sgpr6_sgpr7 = COPY renamable $sgpr4_sgpr5 - ; GCN: SI_SPILL_S64_SAVE killed $sgpr6_sgpr7, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.5, align 4, addrspace 5) - ; GCN: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.6, align 4, addrspace 5) - ; GCN: SI_SPILL_V32_SAVE killed $vgpr19, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5) - ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.7, addrspace 5) - ; GCN: SI_SPILL_V32_SAVE killed $vgpr18, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.8, addrspace 5) + ; GCN: SI_SPILL_S64_SAVE killed $sgpr6_sgpr7, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.5, align 4, addrspace 5) + ; GCN: SI_SPILL_S64_SAVE killed $sgpr0_sgpr1, %stack.6, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store 8 into %stack.6, align 4, addrspace 5) + ; GCN: SI_SPILL_V32_SAVE killed $vgpr19, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5) + ; GCN: SI_SPILL_V32_SAVE killed $vgpr0, %stack.7, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.7, addrspace 5) + ; GCN: SI_SPILL_V32_SAVE killed $vgpr18, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (store 4 into %stack.8, addrspace 5) ; GCN: $exec = S_XOR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc ; GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec ; GCN: bb.3: ; GCN: successors: %bb.2(0x80000000) - ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 8 from %stack.3, align 4, addrspace 5) + ; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 8 from %stack.3, align 4, addrspace 5) ; GCN: $exec = S_MOV_B64 killed renamable $sgpr0_sgpr1 ; GCN: bb.2: - ; GCN: $vgpr0 = 
SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5) - ; GCN: $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 16 from %stack.1, align 4, addrspace 5) - ; GCN: BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1) + ; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr32, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5) + ; GCN: $sgpr0_sgpr1_sgpr2_sgpr3 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (load 16 from %stack.1, align 4, addrspace 5) + ; GCN: BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1) ; GCN: S_ENDPGM 0 entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 5481246279972..ca033e5289f3f 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -75,11 +75,12 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr() { ; GCN-NEXT: runtime_loader_kernel_symbol = 0 ; GCN-NEXT: .end_amd_kernel_code_t ; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_mov_b32 s33, s17 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GCN-NEXT: s_add_u32 s12, s12, s33 +; GCN-NEXT: s_add_u32 s12, s12, s17 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, gv.fptr0@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, gv.fptr0@rel32@hi+4 @@ -167,11 +168,12 @@ define amdgpu_kernel void @test_indirect_call_sgpr_ptr_arg() { ; GCN-NEXT: runtime_loader_kernel_symbol = 0 ; GCN-NEXT: .end_amd_kernel_code_t ; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_mov_b32 s33, s17 -; GCN-NEXT: s_mov_b32 s32, s33 +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 -; GCN-NEXT: s_add_u32 s12, s12, s33 +; GCN-NEXT: s_add_u32 s12, s12, s17 ; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_add_u32 s0, s0, s17 +; GCN-NEXT: s_addc_u32 s1, s1, 0 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, gv.fptr1@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, gv.fptr1@rel32@hi+4 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index d29bca5aee73b..070a36dd4a21d 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1620,9 +1620,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1) ; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x10 ; SI-NEXT: s_load_dword s4, s[4:5], 0x20 +; SI-NEXT: s_add_u32 s0, s0, s7 +; SI-NEXT: s_addc_u32 s1, s1, 0 ; SI-NEXT: v_mov_b32_e32 v16, 64 -; SI-NEXT: s_mov_b32 s11, 0x100f000 -; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: s_and_b32 s4, s4, 7 @@ -1642,18 +1642,20 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1) ; SI-NEXT: v_mov_b32_e32 v9, s21 ; SI-NEXT: v_mov_b32_e32 v10, s22 ; SI-NEXT: v_mov_b32_e32 v11, s23 -; SI-NEXT: 
buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112 -; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96 -; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80 -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64 +; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112 +; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; SI-NEXT: v_or_b32_e32 v16, s4, v16 ; SI-NEXT: v_mov_b32_e32 v0, 0 ; SI-NEXT: v_mov_b32_e32 v1, 0x40200000 -; SI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80 -; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96 -; SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112 +; SI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], 0 offen +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; SI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:96 +; SI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:112 +; SI-NEXT: s_mov_b32 s11, 0x100f000 +; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48 ; SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 @@ -1666,9 +1668,9 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1) ; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx16 s[12:27], s[4:5], 0x40 ; VI-NEXT: s_load_dword s4, s[4:5], 0x80 +; VI-NEXT: s_add_u32 s0, s0, s7 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v16, 64 -; VI-NEXT: s_mov_b32 s11, 0x1100f000 -; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s12 ; VI-NEXT: s_and_b32 s4, s4, 7 @@ -1688,18 +1690,20 @@ define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1) ; VI-NEXT: v_mov_b32_e32 v9, s21 ; VI-NEXT: v_mov_b32_e32 v10, s22 ; VI-NEXT: v_mov_b32_e32 v11, s23 -; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], s7 offset:112 -; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], s7 offset:96 -; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], s7 offset:80 -; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], s7 offset:64 +; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:112 +; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96 +; VI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:64 ; VI-NEXT: v_or_b32_e32 v16, s4, v16 ; VI-NEXT: v_mov_b32_e32 v0, 0 ; VI-NEXT: v_mov_b32_e32 v1, 0x40200000 -; VI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], s7 offen -; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], s7 offset:64 -; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], s7 offset:80 -; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], s7 offset:96 -; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], s7 offset:112 +; VI-NEXT: buffer_store_dwordx2 v[0:1], v16, s[0:3], 0 offen +; VI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:64 +; VI-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:80 +; VI-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:96 +; VI-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:112 +; VI-NEXT: 
s_mov_b32 s11, 0x1100f000 +; VI-NEXT: s_mov_b32 s10, -1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: buffer_store_dwordx4 v[12:15], off, s[8:11], 0 offset:48 ; VI-NEXT: buffer_store_dwordx4 v[8:11], off, s[8:11], 0 offset:32 diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll index 1b7f4df214c90..0a93986cd232b 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -30,7 +30,7 @@ define hidden void @func() #1 { ; GCN-NOT: writelane ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 -; GCN: ; NumSgprs: 38 +; GCN: ; NumSgprs: 37 ; GCN: ; NumVgprs: 9 define amdgpu_kernel void @kernel_call() #0 { %vgpr = load volatile i32, i32 addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll index 8cb822938fd77..40d622c25e184 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-compute.ll @@ -48,8 +48,8 @@ ; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), [[FLAT_SCR_LO]] ; GFX10HSA-DAG: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), [[FLAT_SCR_HI]] -; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen -; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], s9 offen +; GCNHSA: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen +; GCNHSA: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[0:3], 0 offen ; Scratch size = alloca size + emergency stack slot, align {{.*}}, addrspace(5) ; ALL: ; ScratchSize: 32772 diff --git a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll index e69ddbe83e1af..4436b60be2a9d 100644 --- a/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll +++ b/llvm/test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -3,15 +3,19 @@ ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s ; ALL-LABEL: {{^}}large_alloca_pixel_shader: -; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN-DAG: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN-DAG: s_mov_b32 s10, -1 -; CI-DAG: s_mov_b32 s11, 0xe8f000 -; VI-DAG: s_mov_b32 s11, 0xe80000 -; GFX9-DAG: s_mov_b32 s11, 0xe00000 +; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GCN-DAG: s_mov_b32 s6, -1 -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen +; CI-DAG: s_mov_b32 s7, 0xe8f000 +; VI-DAG: s_mov_b32 s7, 0xe80000 +; GFX9-DAG: s_mov_b32 s7, 0xe00000 + +; GCN: s_add_u32 s4, s4, s0 +; GCN: s_addc_u32 s5, s5, 0 + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen ; ALL: ; ScratchSize: 32772 define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 { @@ -25,15 +29,19 @@ define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 { } ; ALL-LABEL: {{^}}large_alloca_pixel_shader_inreg: -; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; GCN-DAG: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 -; GCN-DAG: s_mov_b32 s10, -1 -; CI-DAG: s_mov_b32 s11, 0xe8f000 -; VI-DAG: s_mov_b32 s11, 0xe80000 -; GFX9-DAG: s_mov_b32 s11, 0xe00000 - -; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen +; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GCN-DAG: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GCN-DAG: s_mov_b32 s6, -1 + +; CI-DAG: s_mov_b32 s7, 
0xe8f000 +; VI-DAG: s_mov_b32 s7, 0xe80000 +; GFX9-DAG: s_mov_b32 s7, 0xe00000 + +; GCN: s_add_u32 s4, s4, s2 +; GCN: s_addc_u32 s5, s5, 0 + +; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[4:7], 0 offen ; ALL: ; ScratchSize: 32772 define amdgpu_ps void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll index 3a69ef673b857..6b22d92e367d9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.implicit.buffer.ptr.ll @@ -3,7 +3,7 @@ ; FIXME: Requires stack object to not assert ; GCN-LABEL: {{^}}test_ps: ; GCN: s_load_dwordx2 s[4:5], s[0:1], 0x0 -; GCN: buffer_store_dword v0, off, s[4:7], s2 offset:4 +; GCN: buffer_store_dword v0, off, s[4:7], 0 offset:4 ; GCN: s_load_dword s{{[0-9]+}}, s[0:1], 0x0 ; GCN-NEXT: s_waitcnt ; GCN-NEXT: ; return @@ -18,7 +18,7 @@ define amdgpu_ps i32 @test_ps() #1 { ; GCN-LABEL: {{^}}test_cs: ; GCN: s_mov_b64 s[4:5], s[0:1] -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:4 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:4 ; GCN: s_load_dword s0, s[0:1], 0x0 define amdgpu_cs i32 @test_cs() #1 { %alloca = alloca i32, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll index 49f691b8ae309..37ca76e9489e6 100644 --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -531,13 +531,13 @@ entry: ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s33 offset:4094{{$}} +; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], 0 offset:4094{{$}} ; GFX900: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}} define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 { entry: %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*) @@ -549,13 +549,13 @@ entry: ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s33 offset:4094{{$}} +; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}} +; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], 0 offset:4094{{$}} define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 { entry: %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*) @@ -649,13 +649,13 @@ entry: ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}} +; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}} 
+; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}} define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 { entry: %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -668,13 +668,13 @@ entry: ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}} +; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094{{$}} +; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094{{$}} define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 { entry: %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) @@ -687,13 +687,13 @@ entry: ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}} +; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], 0 offset:4094{{$}} ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1 ; GFX900-NEXT: s_waitcnt ; GFX900-NEXT: s_setpc_b64 -; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}} +; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094{{$}} define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 { entry: %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*) diff --git a/llvm/test/CodeGen/AMDGPU/load-lo16.ll b/llvm/test/CodeGen/AMDGPU/load-lo16.ll index b7df7a58e82ca..d83fda5d7861d 100644 --- a/llvm/test/CodeGen/AMDGPU/load-lo16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-lo16.ll @@ -1303,7 +1303,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 % ; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094 +; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1312,7 +1312,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 % ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094 +; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 @@ -1323,7 +1323,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 % ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094 +; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1342,7 +1342,7 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 % ; GFX900-LABEL: 
load_private_lo_v2i16_reghi_vreg_nooff: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094 +; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1351,7 +1351,7 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 % ; GFX906-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094 +; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 @@ -1362,7 +1362,7 @@ define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 % ; GFX803-LABEL: load_private_lo_v2i16_reghi_vreg_nooff: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094 +; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1381,7 +1381,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 ; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094 +; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], 0 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1390,7 +1390,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094 +; GFX906-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 ; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1402,7 +1402,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 ; GFX803-LABEL: load_private_lo_v2f16_reglo_vreg_nooff: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], s33 offset:4094 +; GFX803-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:4094 ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_e32 v0, v0, v1 @@ -1504,7 +1504,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, ; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094 +; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1513,7 +1513,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_zexti8: ; GFX906: ; %bb.0: ; 
%entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094 +; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 @@ -1525,7 +1525,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094 +; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 @@ -1545,7 +1545,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, ; GFX900-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s33 offset:4094 +; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], 0 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1554,7 +1554,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, ; GFX906-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094 +; GFX906-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 ; GFX906-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_bfi_b32 v0, v2, v0, v1 @@ -1565,7 +1565,7 @@ define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, ; GFX803-LABEL: load_private_lo_v2i16_reglo_vreg_nooff_sexti8: ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX803-NEXT: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094 +; GFX803-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 offset:4094 ; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1585,7 +1585,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, ; GFX900-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094 +; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], 0 offset:4094 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: global_store_dword v[0:1], v1, off ; GFX900-NEXT: s_waitcnt vmcnt(0) @@ -1594,7 +1594,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, ; GFX906-LABEL: load_private_lo_v2f16_reglo_vreg_nooff_zexti8: ; GFX906: ; %bb.0: ; %entry ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094 +; GFX906-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:4094 ; GFX906-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -1607,7 +1607,7 @@ define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, ; GFX803: ; %bb.0: ; %entry ; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX803-NEXT: v_lshrrev_b32_e32 v0, 16, v1 -; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], s33 offset:4094 +; GFX803-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:4094 ; GFX803-NEXT: s_mov_b32 s4, 0x5040c00 ; GFX803-NEXT: s_waitcnt vmcnt(0) ; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4 diff --git a/llvm/test/CodeGen/AMDGPU/mad-combine.ll b/llvm/test/CodeGen/AMDGPU/mad-combine.ll index 0b360f6ecefb5..09bc371876fc2 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-combine.ll @@ -4,6 +4,8 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s +; FIXME: Remove enable-unsafe-fp-math in RUN line and add flags to IR instrs + ; Make sure we don't form mad with denormals ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s @@ -566,9 +568,10 @@ define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace( %u = load volatile float, float addrspace(1)* %gep.3 %v = load volatile float, float addrspace(1)* %gep.4 - %tmp0 = fmul float %u, %v - %tmp1 = call float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0 - %tmp2 = fsub float %x, %tmp1 + ; nsz flag is needed since this combine may change sign of zero + %tmp0 = fmul nsz float %u, %v + %tmp1 = call nsz float @llvm.fmuladd.f32(float %y, float %z, float %tmp0) #0 + %tmp2 = fsub nsz float %x, %tmp1 store float %tmp2, float addrspace(1)* %gep.out ret void diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll index 77b9fcb3915ed..6aeb69335646e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll @@ -447,8 +447,8 @@ entry: } ; GCN-LABEL: {{^}}nontemporal_private_0: -; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} -; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}} +; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}} +; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}} ; GFX10: .amdhsa_kernel nontemporal_private_0 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 ; GFX10CU: .amdhsa_workgroup_processor_mode 0 @@ -462,8 +462,8 @@ entry: } ; GCN-LABEL: {{^}}nontemporal_private_1: -; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} -; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}} +; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}} +; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}} ; GFX10: .amdhsa_kernel 
nontemporal_private_1 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 ; GFX10CU: .amdhsa_workgroup_processor_mode 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll index 2294b7d200d1b..dd81f00a40cb6 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll @@ -314,8 +314,8 @@ entry: } ; GCN-LABEL: {{^}}nontemporal_private_0: -; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} -; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}} +; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}} +; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}} ; GFX10: .amdhsa_kernel nontemporal_private_0 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 ; GFX10CU: .amdhsa_workgroup_processor_mode 0 @@ -329,8 +329,8 @@ entry: } ; GCN-LABEL: {{^}}nontemporal_private_1: -; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}} -; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}} +; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen glc slc{{$}} +; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen slc{{$}} ; GFX10: .amdhsa_kernel nontemporal_private_1 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 ; GFX10CU: .amdhsa_workgroup_processor_mode 0 diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll index 51716847743cb..15a8e9283e388 100644 --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -115,61 +115,50 @@ define void @mubuf_clause(<4 x i32> addrspace(5)* noalias nocapture readonly %ar ; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GCN-NEXT: v_add_u32_e32 v0, v0, v2 -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_nop 0 -; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], s33 offen -; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], s33 offen offset:4 -; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], s33 offen offset:8 -; GCN-NEXT: buffer_load_dword v6, v0, s[0:3], s33 offen offset:12 -; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], s33 offen offset:16 -; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], s33 offen offset:20 -; GCN-NEXT: buffer_load_dword v9, v0, s[0:3], s33 offen offset:24 -; GCN-NEXT: buffer_load_dword v10, v0, s[0:3], s33 offen offset:28 -; GCN-NEXT: buffer_load_dword v11, v0, s[0:3], s33 offen offset:32 -; GCN-NEXT: buffer_load_dword v12, v0, s[0:3], s33 offen offset:36 -; GCN-NEXT: buffer_load_dword v13, v0, s[0:3], s33 offen offset:40 -; GCN-NEXT: buffer_load_dword v14, v0, s[0:3], s33 offen offset:44 -; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], s33 offen offset:48 -; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], s33 offen offset:52 -; GCN-NEXT: buffer_load_dword v17, v0, s[0:3], s33 offen offset:56 ; GCN-NEXT: v_add_u32_e32 v1, v1, v2 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_nop 0 -; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], s33 offen offset:60 -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: s_nop 0 -; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], s33 offen -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], s33 offen offset:4 -; GCN-NEXT: 
s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], s33 offen offset:8 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], s33 offen offset:12 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], s33 offen offset:16 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], s33 offen offset:20 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], s33 offen offset:24 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], s33 offen offset:28 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], s33 offen offset:32 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], s33 offen offset:36 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], s33 offen offset:40 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v14, v1, s[0:3], s33 offen offset:44 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], s33 offen offset:48 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], s33 offen offset:52 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v17, v1, s[0:3], s33 offen offset:56 -; GCN-NEXT: s_waitcnt vmcnt(15) -; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], s33 offen offset:60 +; GCN-NEXT: buffer_load_dword v6, v0, s[0:3], 0 offen offset:20 +; GCN-NEXT: buffer_load_dword v7, v0, s[0:3], 0 offen offset:24 +; GCN-NEXT: buffer_load_dword v8, v0, s[0:3], 0 offen offset:28 +; GCN-NEXT: buffer_load_dword v9, v0, s[0:3], 0 offen offset:32 +; GCN-NEXT: buffer_load_dword v10, v0, s[0:3], 0 offen offset:36 +; GCN-NEXT: buffer_load_dword v11, v0, s[0:3], 0 offen offset:40 +; GCN-NEXT: buffer_load_dword v12, v0, s[0:3], 0 offen offset:44 +; GCN-NEXT: buffer_load_dword v13, v0, s[0:3], 0 offen offset:48 +; GCN-NEXT: buffer_load_dword v14, v0, s[0:3], 0 offen offset:52 +; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], 0 offen offset:56 +; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen offset:60 +; GCN-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen +; GCN-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen offset:4 +; GCN-NEXT: buffer_load_dword v4, v0, s[0:3], 0 offen offset:8 +; GCN-NEXT: buffer_load_dword v5, v0, s[0:3], 0 offen offset:12 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:16 +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen offset:4 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: buffer_store_dword v4, v1, s[0:3], 0 offen offset:8 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:12 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 +; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:20 +; GCN-NEXT: buffer_store_dword v7, v1, s[0:3], 0 offen offset:24 +; GCN-NEXT: buffer_store_dword v8, v1, s[0:3], 0 offen offset:28 +; GCN-NEXT: buffer_store_dword v9, v1, s[0:3], 0 offen offset:32 +; GCN-NEXT: buffer_store_dword v10, v1, s[0:3], 0 offen offset:36 +; GCN-NEXT: buffer_store_dword v11, v1, s[0:3], 0 offen offset:40 +; GCN-NEXT: buffer_store_dword v12, v1, s[0:3], 0 offen offset:44 +; GCN-NEXT: buffer_store_dword v13, v1, s[0:3], 0 offen offset:48 +; GCN-NEXT: 
buffer_store_dword v14, v1, s[0:3], 0 offen offset:52 +; GCN-NEXT: buffer_store_dword v15, v1, s[0:3], 0 offen offset:56 +; GCN-NEXT: buffer_store_dword v16, v1, s[0:3], 0 offen offset:60 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/mesa3d.ll b/llvm/test/CodeGen/AMDGPU/mesa3d.ll index 4f09b3f748045..41011afd5c84f 100644 --- a/llvm/test/CodeGen/AMDGPU/mesa3d.ll +++ b/llvm/test/CodeGen/AMDGPU/mesa3d.ll @@ -5,7 +5,7 @@ ; GCN-DAG: s_mov_b32 s6, -1{{$}} ; GCN-DAG: s_mov_b32 s7, 0xe8f000 ; GCN-DAG: v_mov_b32_e32 [[V:v[0-9]+]], 2 -; GCN: buffer_store_dword [[V]], off, s[4:7], s2 offset:4 +; GCN: buffer_store_dword [[V]], off, s[4:7], 0 offset:4 define amdgpu_ps void @scratch_ps(i32 addrspace(1)* %out, i32 %in) { entry: %alloca = alloca i32, addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/mir-print-dead-csr-fi.mir b/llvm/test/CodeGen/AMDGPU/mir-print-dead-csr-fi.mir index cccf2c113ebe5..71c7e32ff23b3 100644 --- a/llvm/test/CodeGen/AMDGPU/mir-print-dead-csr-fi.mir +++ b/llvm/test/CodeGen/AMDGPU/mir-print-dead-csr-fi.mir @@ -15,7 +15,6 @@ frameInfo: maxAlignment: 4 machineFunctionInfo: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - scratchWaveOffsetReg: '$sgpr4' frameOffsetReg: '$sgpr5' stackPtrOffsetReg: '$sgpr32' body: | diff --git a/llvm/test/CodeGen/AMDGPU/misched-killflags.mir b/llvm/test/CodeGen/AMDGPU/misched-killflags.mir index 75d297e111a5a..9d0c32214100f 100644 --- a/llvm/test/CodeGen/AMDGPU/misched-killflags.mir +++ b/llvm/test/CodeGen/AMDGPU/misched-killflags.mir @@ -6,7 +6,6 @@ tracksRegLiveness: true machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - scratchWaveOffsetReg: '$sgpr7' frameOffsetReg: '$sgpr7' body: | bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll index 5a0d87f5186d6..f984bb49c7b7c 100644 --- a/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-offset-private.ll @@ -5,49 +5,49 @@ ; Test addressing modes when the scratch base is not a frame index. 
; GCN-LABEL: {{^}}store_private_offset_i8: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:8 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_i8() #0 { store volatile i8 5, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i16: -; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], s2 offset:8 +; GCN: buffer_store_short v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_i16() #0 { store volatile i16 5, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i32: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_i32() #0 { store volatile i32 5, i32 addrspace(5)* inttoptr (i32 8 to i32 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_v2i32: -; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 +; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_v2i32() #0 { store volatile <2 x i32> , <2 x i32> addrspace(5)* inttoptr (i32 8 to <2 x i32> addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_v4i32: -; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 +; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @store_private_offset_v4i32() #0 { store volatile <4 x i32> , <4 x i32> addrspace(5)* inttoptr (i32 8 to <4 x i32> addrspace(5)*) ret void } ; GCN-LABEL: {{^}}load_private_offset_i8: -; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s2 offset:8 +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_i8() #0 { %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}sextload_private_offset_i8: -; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_sbyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @sextload_private_offset_i8(i32 addrspace(1)* %out) #0 { %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*) %sextload = sext i8 %load to i32 @@ -56,7 +56,7 @@ define amdgpu_kernel void @sextload_private_offset_i8(i32 addrspace(1)* %out) #0 } ; GCN-LABEL: {{^}}zextload_private_offset_i8: -; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_ubyte v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @zextload_private_offset_i8(i32 addrspace(1)* %out) #0 { %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 8 to i8 addrspace(5)*) %zextload = zext i8 %load to i32 @@ -65,14 +65,14 @@ define amdgpu_kernel void @zextload_private_offset_i8(i32 addrspace(1)* %out) #0 } ; GCN-LABEL: {{^}}load_private_offset_i16: -; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s2 offset:8 +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_i16() #0 { %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}sextload_private_offset_i16: -; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_sshort v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @sextload_private_offset_i16(i32 addrspace(1)* %out) #0 { %load = load 
volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*) %sextload = sext i16 %load to i32 @@ -81,7 +81,7 @@ define amdgpu_kernel void @sextload_private_offset_i16(i32 addrspace(1)* %out) # } ; GCN-LABEL: {{^}}zextload_private_offset_i16: -; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], s8 offset:8 +; GCN: buffer_load_ushort v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @zextload_private_offset_i16(i32 addrspace(1)* %out) #0 { %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 8 to i16 addrspace(5)*) %zextload = zext i16 %load to i32 @@ -90,28 +90,28 @@ define amdgpu_kernel void @zextload_private_offset_i16(i32 addrspace(1)* %out) # } ; GCN-LABEL: {{^}}load_private_offset_i32: -; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], s2 offset:8 +; GCN: buffer_load_dword v{{[0-9]+}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_i32() #0 { %load = load volatile i32, i32 addrspace(5)* inttoptr (i32 8 to i32 addrspace(5)*) ret void } ; GCN-LABEL: {{^}}load_private_offset_v2i32: -; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 +; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_v2i32() #0 { %load = load volatile <2 x i32>, <2 x i32> addrspace(5)* inttoptr (i32 8 to <2 x i32> addrspace(5)*) ret void } ; GCN-LABEL: {{^}}load_private_offset_v4i32: -; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], s2 offset:8 +; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[4:7], 0 offset:8 define amdgpu_kernel void @load_private_offset_v4i32() #0 { %load = load volatile <4 x i32>, <4 x i32> addrspace(5)* inttoptr (i32 8 to <4 x i32> addrspace(5)*) ret void } ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset: -; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], s2 offset:4095 +; GCN: buffer_store_byte v{{[0-9]+}}, off, s[4:7], 0 offset:4095 define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4095 to i8 addrspace(5)*) ret void @@ -119,7 +119,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset() #0 { ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus1: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen{{$}} +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], 0 offen{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4096 to i8 addrspace(5)*) ret void @@ -127,7 +127,7 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus1() #0 { ; GCN-LABEL: {{^}}store_private_offset_i8_max_offset_plus2: ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x1000 -; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], s2 offen offset:1{{$}} +; GCN: buffer_store_byte v{{[0-9]+}}, [[OFFSET]], s[4:7], 0 offen offset:1{{$}} define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { store volatile i8 5, i8 addrspace(5)* inttoptr (i32 4097 to i8 addrspace(5)*) ret void @@ -139,10 +139,10 @@ define amdgpu_kernel void @store_private_offset_i8_max_offset_plus2() #0 { ; GCN-LABEL: {{^}}store_private_unknown_bits_vaddr: ; SICIVI: v_add_{{i|u}}32_e32 [[ADDR0:v[0-9]+]], vcc, 4 ; SICIVI: v_add_{{i|u}}32_e32 [[ADDR1:v[0-9]+]], vcc, 32, [[ADDR0]] -; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}} +; SICIVI: buffer_store_dword v{{[0-9]+}}, [[ADDR1]], 
s{{\[[0-9]+:[0-9]+\]}}, 0 offen{{$}} ; GFX9: v_add_u32_e32 [[ADDR:v[0-9]+]], 4, -; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen offset:32 +; GFX9: buffer_store_dword v{{[0-9]+}}, [[ADDR]], s{{\[[0-9]+:[0-9]+\]}}, 0 offen offset:32 define amdgpu_kernel void @store_private_unknown_bits_vaddr() #0 { %alloca = alloca [16 x i32], align 4, addrspace(5) %vaddr = load volatile i32, i32 addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 5f0b5aaecac9b..3eb478896f55f 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -189,18 +189,18 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v35, s34, 4 -; GFX9-NEXT: s_mov_b32 s34, s32 +; GFX9-NEXT: v_writelane_b32 v35, s33, 4 +; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_add_u32 s32, s32, 0x800 -; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s34 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s34 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v35, s36, 0 +; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v35, s34, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+4 -; GFX9-NEXT: v_writelane_b32 v35, s37, 1 -; GFX9-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 +; GFX9-NEXT: v_writelane_b32 v35, s35, 1 +; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v32, v1 ; GFX9-NEXT: v_mov_b32_e32 v33, v0 ; GFX9-NEXT: v_writelane_b32 v35, s30, 2 @@ -208,21 +208,21 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9-NEXT: v_writelane_b32 v35, s31, 3 ; GFX9-NEXT: v_and_b32_e32 v34, 0xffffff, v32 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_mad_u32_u24 v32, v33, v32, v34 ; GFX9-NEXT: v_mov_b32_e32 v0, v32 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_add_u32_e32 v0, v32, v34 -; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] +; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s4, v35, 2 ; GFX9-NEXT: v_readlane_b32 s5, v35, 3 -; GFX9-NEXT: v_readlane_b32 s37, v35, 1 -; GFX9-NEXT: v_readlane_b32 s36, v35, 0 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s34 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s34 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s35, v35, 1 +; GFX9-NEXT: v_readlane_b32 s34, v35, 0 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_sub_u32 s32, s32, 0x800 -; GFX9-NEXT: v_readlane_b32 s34, v35, 
4 +; GFX9-NEXT: v_readlane_b32 s33, v35, 4 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll index c09d18e104f79..d473146d1cdda 100644 --- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll +++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll @@ -729,15 +729,18 @@ bb5: ; preds = %bb3 ; IR: [[FLOW]]: ; IR-NEXT: phi -; IR-NEXT: br i1 [[CND2:%.*]], label %[[PREHEADER:.*]], label %[[FLOW2:.*]] +; IR-NEXT: br i1 [[CND2:%.*]], label %[[LOOP:.*]], label %UnifiedReturnBlock -; IR: [[FLOW2]]: -; IR-NEXT: br label %UnifiedReturnBlock +; IR: [[LOOP]]: +; IR-NEXT: br i1 false, label %[[FLOW1:.*]], label %[[LOOP]] ; IR: [[EXP]]: ; IR-NEXT: call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 immarg false, i1 immarg true) ; IR-NEXT: br label %[[FLOW]] +; IR: [[FLOW1]]: +; IR-NEXT: br label %UnifiedReturnBlock + ; IR: UnifiedReturnBlock: ; IR-NEXT: call void @llvm.amdgcn.exp.f32(i32 9, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 true) ; IR-NEXT: ret void @@ -745,13 +748,10 @@ bb5: ; preds = %bb3 define amdgpu_ps void @uniformly_reached_export(float inreg %tmp25) { .entry: %tmp26 = fcmp olt float %tmp25, 0.000000e+00 - br i1 %tmp26, label %.preheader.1, label %bb27 - -.preheader.1: ; preds = %.entry - br label %bb + br i1 %tmp26, label %loop, label %bb27 -bb: ; preds = %bb, %.preheader.1 - br label %bb +loop: ; preds = %loop, %.entry + br label %loop bb27: ; preds = %.entry call void @llvm.amdgcn.exp.compr.v2f16(i32 immarg 0, i32 immarg 15, <2 x half> , <2 x half> , i1 immarg true, i1 immarg true) diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll index fdbe3a25e64e8..3e94a8e2f9523 100644 --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -14,8 +14,8 @@ declare void @external_void_func_i32(i32) #0 ; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: v_writelane_b32 v32, s34, 2 -; GCN-DAG: s_mov_b32 s34, s32 +; GCN-DAG: v_writelane_b32 v32, s33, 2 +; GCN-DAG: s_mov_b32 s33, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400 ; GCN-DAG: v_writelane_b32 v32, s30, 0 ; GCN-DAG: v_writelane_b32 v32, s31, 1 @@ -26,7 +26,7 @@ declare void @external_void_func_i32(i32) #0 ; GCN: v_readlane_b32 s5, v32, 1 ; GCN-NEXT: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, v32, 2 +; GCN-NEXT: v_readlane_b32 s33, v32, 2 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] @@ -39,9 +39,9 @@ define void @test_func_call_external_void_func_i32_imm() #0 { ; GCN-LABEL: {{^}}test_func_call_external_void_func_i32_imm_stack_use: ; GCN: s_waitcnt -; GCN: s_mov_b32 s34, s32 +; GCN: s_mov_b32 s33, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x1400{{$}} -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s34 offset: +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset: ; GCN: s_swappc_b64 ; GCN: s_sub_u32 s32, s32, 0x1400{{$}} ; GCN: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir 
b/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir index 0ea085afc4051..fee6b52d1a117 100644 --- a/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir +++ b/llvm/test/CodeGen/AMDGPU/optimize-exec-masking-pre-ra.mir @@ -9,7 +9,6 @@ tracksRegLiveness: true machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' - scratchWaveOffsetReg: '$sgpr101' frameOffsetReg: '$sgpr101' body: | ; GCN-LABEL: name: exec_src1_is_not_copy diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll index a38bacd97a67e..ef656f66fcc89 100644 --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -13,7 +13,6 @@ ; GCN: def s[4:11] ; GCN: def s[12:19] ; GCN: def s[20:27] -; GCN: def s[28:35] ; GCN: def s[36:43] ; GCN: def s[44:51] ; GCN: def s[52:59] @@ -37,8 +36,8 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 10 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 11 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 12 -; GCN-NEXT: v_writelane_b32 v0, s9, 13 -; GCN-NEXT: v_writelane_b32 v0, s10, 14 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 13 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 14 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 15 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} @@ -47,8 +46,8 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 18 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 19 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 20 -; GCN-NEXT: v_writelane_b32 v0, s9, 21 -; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 21 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 22 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 23 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} @@ -57,8 +56,8 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 26 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 27 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 28 -; GCN-NEXT: v_writelane_b32 v0, s9, 29 -; GCN-NEXT: v_writelane_b32 v0, s10, 30 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 29 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 30 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 31 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} @@ -67,8 +66,8 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 34 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 35 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 36 -; GCN-NEXT: v_writelane_b32 v0, s9, 37 -; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 37 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 38 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 39 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} @@ -77,36 +76,38 @@ ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 42 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 43 ; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 44 -; GCN-NEXT: v_writelane_b32 v0, s9, 45 -; GCN-NEXT: v_writelane_b32 v0, s10, 46 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 45 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 46 ; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 47 ; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} -; GCN: v_writelane_b32 v0, s12, 48 -; GCN-NEXT: v_writelane_b32 v0, s13, 49 -; GCN-NEXT: v_writelane_b32 v0, s14, 50 -; GCN-NEXT: v_writelane_b32 v0, s15, 51 -; GCN-NEXT: v_writelane_b32 v0, s16, 52 -; GCN-NEXT: v_writelane_b32 v0, s17, 53 -; GCN-NEXT: v_writelane_b32 v0, s18, 54 -; GCN-NEXT: v_writelane_b32 v0, s19, 55 - -; GCN-NEXT: v_writelane_b32 v0, s20, 56 -; GCN-NEXT: v_writelane_b32 v0, s21, 57 -; GCN-NEXT: v_writelane_b32 v0, s22, 58 -; 
GCN-NEXT: v_writelane_b32 v0, s23, 59 -; GCN-NEXT: v_writelane_b32 v0, s24, 60 -; GCN-NEXT: v_writelane_b32 v0, s25, 61 -; GCN-NEXT: v_writelane_b32 v0, s26, 62 -; GCN-NEXT: v_writelane_b32 v0, s27, 63 -; GCN-NEXT: v_writelane_b32 v1, s28, 0 -; GCN-NEXT: v_writelane_b32 v1, s29, 1 -; GCN-NEXT: v_writelane_b32 v1, s30, 2 -; GCN-NEXT: v_writelane_b32 v1, s31, 3 -; GCN-NEXT: v_writelane_b32 v1, s32, 4 -; GCN-NEXT: v_writelane_b32 v1, s33, 5 -; GCN-NEXT: v_writelane_b32 v1, s34, 6 -; GCN-NEXT: v_writelane_b32 v1, s35, 7 +; GCN: v_writelane_b32 v0, s[[TMP_LO]], 48 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 49 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 50 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 51 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 52 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 53 +; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 54 +; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 55 + +; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}} +; GCN: v_writelane_b32 v0, s12, 56 +; GCN-NEXT: v_writelane_b32 v0, s13, 57 +; GCN-NEXT: v_writelane_b32 v0, s14, 58 +; GCN-NEXT: v_writelane_b32 v0, s15, 59 +; GCN-NEXT: v_writelane_b32 v0, s16, 60 +; GCN-NEXT: v_writelane_b32 v0, s17, 61 +; GCN-NEXT: v_writelane_b32 v0, s18, 62 +; GCN-NEXT: v_writelane_b32 v0, s19, 63 + +; GCN-NEXT: v_writelane_b32 v1, s20, 0 +; GCN-NEXT: v_writelane_b32 v1, s21, 1 +; GCN-NEXT: v_writelane_b32 v1, s22, 2 +; GCN-NEXT: v_writelane_b32 v1, s23, 3 +; GCN-NEXT: v_writelane_b32 v1, s24, 4 +; GCN-NEXT: v_writelane_b32 v1, s25, 5 +; GCN-NEXT: v_writelane_b32 v1, s26, 6 +; GCN-NEXT: v_writelane_b32 v1, s27, 7 ; GCN-NEXT: v_writelane_b32 v1, s36, 8 ; GCN-NEXT: v_writelane_b32 v1, s37, 9 ; GCN-NEXT: v_writelane_b32 v1, s38, 10 @@ -184,16 +185,6 @@ ; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v0, 7 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} -; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v0, 48 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 49 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 50 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 51 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 52 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 53 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 54 -; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v0, 55 -; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} - ; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v0, 56 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 57 ; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 58 @@ -393,7 +384,7 @@ ret: ; GCN-LABEL: {{^}}split_sgpr_spill_2_vgprs: ; GCN: def s[4:19] -; GCN: def s[20:35] +; GCN: def s[36:51] ; GCN: v_writelane_b32 v0, s4, 48 ; GCN-NEXT: v_writelane_b32 v0, s5, 49 @@ -412,22 +403,22 @@ ret: ; GCN-NEXT: v_writelane_b32 v0, s18, 62 ; GCN-NEXT: v_writelane_b32 v0, s19, 63 -; GCN: v_readlane_b32 s4, v0, 48 -; GCN-NEXT: v_readlane_b32 s5, v0, 49 -; GCN-NEXT: v_readlane_b32 s6, v0, 50 -; GCN-NEXT: v_readlane_b32 s7, v0, 51 -; GCN-NEXT: v_readlane_b32 s8, v0, 52 -; GCN-NEXT: v_readlane_b32 s9, v0, 53 -; GCN-NEXT: v_readlane_b32 s10, v0, 54 -; GCN-NEXT: v_readlane_b32 s11, v0, 55 -; GCN-NEXT: v_readlane_b32 s12, v0, 56 -; GCN-NEXT: v_readlane_b32 s13, v0, 57 -; GCN-NEXT: v_readlane_b32 s14, v0, 58 -; GCN-NEXT: v_readlane_b32 s15, v0, 59 -; GCN-NEXT: v_readlane_b32 s16, v0, 60 -; GCN-NEXT: v_readlane_b32 s17, v0, 61 -; GCN-NEXT: v_readlane_b32 s18, v0, 62 -; GCN-NEXT: v_readlane_b32 s19, v0, 63 +; GCN: v_readlane_b32 s0, v0, 48 +; GCN-NEXT: v_readlane_b32 s1, v0, 49 +; GCN-NEXT: v_readlane_b32 s2, v0, 50 +; GCN-NEXT: v_readlane_b32 s3, v0, 51 +; 
GCN-NEXT: v_readlane_b32 s4, v0, 52 +; GCN-NEXT: v_readlane_b32 s5, v0, 53 +; GCN-NEXT: v_readlane_b32 s6, v0, 54 +; GCN-NEXT: v_readlane_b32 s7, v0, 55 +; GCN-NEXT: v_readlane_b32 s8, v0, 56 +; GCN-NEXT: v_readlane_b32 s9, v0, 57 +; GCN-NEXT: v_readlane_b32 s10, v0, 58 +; GCN-NEXT: v_readlane_b32 s11, v0, 59 +; GCN-NEXT: v_readlane_b32 s12, v0, 60 +; GCN-NEXT: v_readlane_b32 s13, v0, 61 +; GCN-NEXT: v_readlane_b32 s14, v0, 62 +; GCN-NEXT: v_readlane_b32 s15, v0, 63 define amdgpu_kernel void @split_sgpr_spill_2_vgprs(i32 addrspace(1)* %out, i32 %in) #1 { %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -457,133 +448,133 @@ ret: ; GCN-LABEL: {{^}}no_vgprs_last_sgpr_spill: -; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 0 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 1 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 2 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 3 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 4 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 5 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 6 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 7 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 8 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 9 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 10 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 11 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 12 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 13 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 14 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 15 - -; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 16 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 17 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 18 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 19 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 20 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 21 -; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 22 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 23 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 24 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 25 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 26 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 27 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 28 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 29 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 30 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 31 +; GCN: v_writelane_b32 v31, s{{[0-9]+}}, 0 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 1 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 2 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 3 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 4 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 5 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 6 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 7 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 8 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 9 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 10 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 11 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 12 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 13 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 14 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 15 + +; GCN: v_writelane_b32 v31, s{{[0-9]+}}, 16 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 17 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 18 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 19 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 20 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 21 +; GCN-NEXT: v_writelane_b32 v31, s{{[0-9]+}}, 22 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 23 +; 
GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 24 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 25 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 26 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 27 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 28 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 29 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 30 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 31 ; GCN: def s[0:1] -; GCN: v_writelane_b32 v23, s20, 32 -; GCN-NEXT: v_writelane_b32 v23, s21, 33 - -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 34 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 35 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 36 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 37 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 38 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 39 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 40 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 41 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 42 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 43 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 44 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 45 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 46 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 47 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 48 -; GCN-NEXT: v_writelane_b32 v23, s{{[[0-9]+}}, 49 - -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} +; GCN: v_writelane_b32 v31, s{{[[0-9]+}}, 32 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 33 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 34 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 35 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 36 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 37 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 38 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 39 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 40 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 41 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 42 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 43 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 44 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 45 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 46 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 47 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 48 +; GCN-NEXT: v_writelane_b32 v31, s{{[[0-9]+}}, 49 + +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 ; GCN: s_cbranch_scc1 -; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 0 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 1 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 2 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 3 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 4 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 5 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 6 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 7 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 8 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 9 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 10 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 11 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 12 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 13 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 14 -; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 15 +; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v31, 0 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 1 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 2 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 3 +; 
GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 4 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 5 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 6 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 7 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 8 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 9 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 10 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 11 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 12 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 13 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 14 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v31, 15 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} -; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 32 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 33 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 34 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 35 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 36 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 37 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 38 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 39 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 40 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 41 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 42 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 43 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 44 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 45 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 46 -; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 47 +; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v31, 32 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 33 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 34 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 35 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 36 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 37 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 38 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 39 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 40 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 41 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 42 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 43 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 44 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 45 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 46 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v31, 47 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} -; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 16 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 17 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 18 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 19 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 20 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 21 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 22 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 23 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 24 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 25 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 26 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 27 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 28 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 29 -; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 30 -; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 31 +; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v31, 16 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 17 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 18 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 19 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 20 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 21 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 22 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 23 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 24 +; 
GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 25 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 26 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 27 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 28 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 29 +; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v31, 30 +; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v31, 31 ; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} -; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} -; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} - -; GCN: v_readfirstlane_b32 s1, v0 +; GCN: buffer_load_dword v[[RESTORE_TMP:[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 +; GCN: v_readfirstlane_b32 s[[USE_TMP_LO:[0-9]+]], v[[RESTORE_TMP]] +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 +; GCN: v_readfirstlane_b32 s[[USE_TMP_HI:[0-9]+]], v[[RESTORE_TMP]] ; GCN: ;;#ASMSTART -; GCN: ; use s[0:1] +; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}} define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 - call void asm sideeffect "", "~{v[16:19]}"() #0 - call void asm sideeffect "", "~{v[20:21]}"() #0 - call void asm sideeffect "", "~{v22}"() #0 + call void asm sideeffect "", "~{v[16:23]}" () #0 + call void asm sideeffect "", "~{v[24:27]}"() #0 + call void asm sideeffect "", "~{v[28:29]}"() #0 + call void asm sideeffect "", "~{v30}"() #0 %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -606,4 +597,4 @@ ret: } attributes #0 = { nounwind } -attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" } +attributes #1 = { nounwind "amdgpu-waves-per-eu"="8,8" } diff --git a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir index 39915f2755ce0..ba62ca822733f 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir @@ -17,21 +17,23 @@ stack: machineFunctionInfo: isEntryFunction: true scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr33 - frameOffsetReg: $sgpr5 stackPtrOffsetReg: $sgpr32 + argumentInfo: + privateSegmentWaveByteOffset: { reg: '$sgpr4' } body: | ; CHECK-LABEL: name: scavenge_register_position ; CHECK: bb.0: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc - ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4 + ; CHECK: $sgpr0 = S_ADD_U32 $sgpr0, $sgpr4, implicit-def $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: $sgpr1 = S_ADDC_U32 $sgpr1, 0, implicit-def $scc, implicit $scc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: $sgpr5 = S_MOV_B32 524288 + ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr5, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_BRANCH %bb.1 ; CHECK: bb.1: ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK: $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc + ; CHECK: $sgpr4 = S_MOV_B32 524288 ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit 
$exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_ENDPGM 0, implicit $vgpr0 bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir index 1c7adc39fe290..f22acffb59097 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir @@ -19,7 +19,6 @@ stack: machineFunctionInfo: isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr34 frameOffsetReg: $sgpr33 stackPtrOffsetReg: $sgpr32 @@ -34,13 +33,11 @@ body: | ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc - ; CHECK: $sgpr33 = S_LSHR_B32 killed $sgpr33, 6, implicit-def $scc + ; CHECK: $sgpr33 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc ; CHECK: $sgpr33 = S_ADD_U32 killed $sgpr33, 8192, implicit-def $scc ; CHECK: $vgpr2 = COPY killed $sgpr33 ; CHECK: $sgpr33 = S_SUB_U32 killed $sgpr33, 8192, implicit-def $scc - ; CHECK: $sgpr33 = S_LSHL_B32 killed $sgpr33, 6, implicit-def $scc - ; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc + ; CHECK: $sgpr33 = S_LSHL_B32 $sgpr33, 6, implicit-def $scc ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 @@ -64,7 +61,6 @@ stack: machineFunctionInfo: isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr34 frameOffsetReg: $sgpr33 stackPtrOffsetReg: $sgpr32 @@ -79,8 +75,7 @@ body: | ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, 
implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $sgpr29 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc - ; CHECK: $sgpr29 = S_LSHR_B32 killed $sgpr29, 6, implicit-def $scc + ; CHECK: $sgpr29 = S_LSHR_B32 $sgpr33, 6, implicit-def $scc ; CHECK: $sgpr29 = S_ADD_U32 killed $sgpr29, 8192, implicit-def $scc ; CHECK: $vgpr2 = COPY killed $sgpr29 ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr31 @@ -106,7 +101,6 @@ stack: machineFunctionInfo: isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr34 frameOffsetReg: $sgpr33 stackPtrOffsetReg: $sgpr32 @@ -121,10 +115,9 @@ body: | ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $sgpr28 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc - ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, killed $sgpr28, implicit $exec + ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK: $sgpr28 = S_MOV_B32 8192 - ; CHECK: $vgpr2, dead $sgpr28_sgpr29 = V_ADD_I32_e64 killed $sgpr28, killed $vgpr3, 0, implicit $exec + ; CHECK: $vgpr2, dead $sgpr28_sgpr29 = V_ADD_I32_e64 killed $sgpr28, killed $vgpr2, 0, implicit $exec ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 @@ -147,7 +140,6 @@ stack: machineFunctionInfo: isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr34 frameOffsetReg: $sgpr33 stackPtrOffsetReg: $sgpr32 @@ -162,10 +154,9 @@ body: | ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, 
implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr30, implicit-def $sgpr31 - ; CHECK: $vcc_hi = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc - ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, killed $vcc_hi, implicit $exec + ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec ; CHECK: $vcc_lo = S_MOV_B32 8192 - ; CHECK: $vgpr2, dead $vcc = V_ADD_I32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec + ; CHECK: $vgpr2, dead $vcc = V_ADD_I32_e64 killed $vcc_lo, killed $vgpr2, 0, implicit $exec ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir index 90afb185ccb7d..ec3c0b7042fe5 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir @@ -14,7 +14,6 @@ stack: machineFunctionInfo: isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr34 frameOffsetReg: $sgpr33 stackPtrOffsetReg: $sgpr32 @@ -29,10 +28,8 @@ body: | ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc - ; CHECK: $vgpr3 = V_LSHRREV_B32_e64 6, killed $sgpr33, implicit $exec - ; CHECK: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec - ; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc + ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; CHECK: $vgpr2 = V_ADD_U32_e32 8192, killed $vgpr2, implicit $exec ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit 
$sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir index 853e2346031e7..2fbc51f036f03 100644 --- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir @@ -14,7 +14,6 @@ stack: machineFunctionInfo: isEntryFunction: false scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr34 frameOffsetReg: $sgpr33 stackPtrOffsetReg: $sgpr32 @@ -29,9 +28,7 @@ body: | ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294705152, implicit-def $scc ; CHECK: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 524288, implicit-def $scc ; CHECK: S_NOP 0, implicit-def $sgpr4, implicit-def $sgpr5, implicit-def $sgpr6, implicit-def $sgpr7, implicit-def $sgpr8, implicit-def $sgpr9, implicit-def $sgpr10, implicit-def $sgpr11, implicit-def $sgpr12, implicit-def $sgpr13, implicit-def $sgpr14, implicit-def $sgpr15, implicit-def $sgpr16, implicit-def $sgpr17, implicit-def $sgpr18, implicit-def $sgpr19, implicit-def $sgpr20, implicit-def $sgpr21, implicit-def $sgpr22, implicit-def $sgpr23, implicit-def $sgpr24, implicit-def $sgpr25, implicit-def $sgpr26, implicit-def $sgpr17, implicit-def $sgpr28, implicit-def $sgpr29, implicit-def $sgpr30, implicit-def $sgpr31, implicit-def $vcc - ; CHECK: $sgpr33 = S_SUB_U32 $sgpr33, $sgpr34, implicit-def $scc ; CHECK: $vgpr2 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec - ; CHECK: $sgpr33 = S_ADD_U32 $sgpr33, $sgpr34, implicit-def $scc ; CHECK: $vgpr0 = V_OR_B32_e32 killed $vgpr2, $vgpr1, implicit $exec, implicit $sgpr4, implicit $sgpr5, implicit $sgpr6, implicit $sgpr7, implicit $sgpr8, implicit $sgpr9, implicit $sgpr10, implicit $sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $sgpr16, implicit $sgpr17, implicit $sgpr18, implicit $sgpr19, implicit $sgpr20, implicit $sgpr21, implicit $sgpr22, implicit $sgpr23, implicit $sgpr24, implicit $sgpr25, implicit $sgpr26, implicit $sgpr17, implicit $sgpr28, implicit $sgpr29, implicit $sgpr30, implicit $sgpr31 ; CHECK: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 524288, implicit-def $scc ; CHECK: $sgpr33 = frame-setup COPY $sgpr27 diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir new file mode 100644 index 0000000000000..579ba6dfc3f93 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-vgpr-spill.mir @@ -0,0 +1,60 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GFX8 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefix=GFX9 %s + +# Test case where spilling a VGPR to an emergency slot is needed during frame index elimination. 
+ +--- +name: pei_scavenge_vgpr_spill +tracksRegLiveness: true + +stack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 8192 } + - { id: 1, type: default, offset: 0, size: 4, alignment: 8192 } + +machineFunctionInfo: + isEntryFunction: false + scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 + frameOffsetReg: $sgpr33 + stackPtrOffsetReg: $sgpr32 + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255 + + ; GFX8-LABEL: name: pei_scavenge_vgpr_spill + ; GFX8: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, 
$vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, $vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 + ; GFX8: $vgpr2 = V_WRITELANE_B32_vi $sgpr33, 0, undef $vgpr2 + ; GFX8: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc + ; GFX8: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; GFX8: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc + ; GFX8: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc + ; GFX8: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) + ; GFX8: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; GFX8: $vcc_lo = S_MOV_B32 8192 + ; GFX8: $vgpr3, dead $vcc = V_ADD_I32_e64 killed $vcc_lo, killed $vgpr3, 0, implicit $exec + ; GFX8: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec + ; GFX8: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc + ; GFX8: $sgpr33 = V_READLANE_B32_vi $vgpr2, 0 + ; GFX8: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc + ; GFX8: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) + ; GFX8: S_ENDPGM 0, csr_amdgpu_allvgprs + ; GFX9-LABEL: name: pei_scavenge_vgpr_spill + ; GFX9: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, $vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63, $vgpr64_vgpr65_vgpr66_vgpr67_vgpr68_vgpr69_vgpr70_vgpr71_vgpr72_vgpr73_vgpr74_vgpr75_vgpr76_vgpr77_vgpr78_vgpr79, $vgpr80_vgpr81_vgpr82_vgpr83_vgpr84_vgpr85_vgpr86_vgpr87_vgpr88_vgpr89_vgpr90_vgpr91_vgpr92_vgpr93_vgpr94_vgpr95, $vgpr96_vgpr97_vgpr98_vgpr99_vgpr100_vgpr101_vgpr102_vgpr103_vgpr104_vgpr105_vgpr106_vgpr107_vgpr108_vgpr109_vgpr110_vgpr111, $vgpr112_vgpr113_vgpr114_vgpr115_vgpr116_vgpr117_vgpr118_vgpr119_vgpr120_vgpr121_vgpr122_vgpr123_vgpr124_vgpr125_vgpr126_vgpr127, $vgpr128_vgpr129_vgpr130_vgpr131_vgpr132_vgpr133_vgpr134_vgpr135_vgpr136_vgpr137_vgpr138_vgpr139_vgpr140_vgpr141_vgpr142_vgpr143, $vgpr144_vgpr145_vgpr146_vgpr147_vgpr148_vgpr149_vgpr150_vgpr151_vgpr152_vgpr153_vgpr154_vgpr155_vgpr156_vgpr157_vgpr158_vgpr159, $vgpr160_vgpr161_vgpr162_vgpr163_vgpr164_vgpr165_vgpr166_vgpr167_vgpr168_vgpr169_vgpr170_vgpr171_vgpr172_vgpr173_vgpr174_vgpr175, 
$vgpr176_vgpr177_vgpr178_vgpr179_vgpr180_vgpr181_vgpr182_vgpr183_vgpr184_vgpr185_vgpr186_vgpr187_vgpr188_vgpr189_vgpr190_vgpr191, $vgpr192_vgpr193_vgpr194_vgpr195_vgpr196_vgpr197_vgpr198_vgpr199_vgpr200_vgpr201_vgpr202_vgpr203_vgpr204_vgpr205_vgpr206_vgpr207, $vgpr208_vgpr209_vgpr210_vgpr211_vgpr212_vgpr213_vgpr214_vgpr215_vgpr216_vgpr217_vgpr218_vgpr219_vgpr220_vgpr221_vgpr222_vgpr223, $vgpr224_vgpr225_vgpr226_vgpr227_vgpr228_vgpr229_vgpr230_vgpr231_vgpr232_vgpr233_vgpr234_vgpr235_vgpr236_vgpr237_vgpr238_vgpr239, $vgpr240_vgpr241_vgpr242_vgpr243_vgpr244_vgpr245_vgpr246_vgpr247, $vgpr248_vgpr249_vgpr250_vgpr251, $vgpr252_vgpr253_vgpr254_vgpr255, $vgpr2 + ; GFX9: $vgpr2 = V_WRITELANE_B32_vi $sgpr33, 0, undef $vgpr2 + ; GFX9: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc + ; GFX9: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc + ; GFX9: $sgpr32 = frame-setup S_ADD_U32 $sgpr32, 1572864, implicit-def $scc + ; GFX9: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc + ; GFX9: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %stack.3, addrspace 5) + ; GFX9: $vgpr3 = V_LSHRREV_B32_e64 6, $sgpr33, implicit $exec + ; GFX9: $vgpr3 = V_ADD_U32_e32 8192, killed $vgpr3, implicit $exec + ; GFX9: $vgpr0 = V_OR_B32_e32 killed $vgpr3, $vgpr1, implicit $exec + ; GFX9: $sgpr32 = frame-destroy S_SUB_U32 $sgpr32, 1572864, implicit-def $scc + ; GFX9: $sgpr33 = V_READLANE_B32_vi $vgpr2, 0 + ; GFX9: $sgpr4 = S_ADD_U32 $sgpr33, 524544, implicit-def $scc + ; GFX9: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.3, addrspace 5) + ; GFX9: S_ENDPGM 0, csr_amdgpu_allvgprs + $vgpr0 = V_OR_B32_e32 %stack.1, $vgpr1, implicit $exec + S_ENDPGM 0, csr_amdgpu_allvgprs +... diff --git a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll index 8e96fbb178ca1..1c48689f07a4f 100644 --- a/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll +++ b/llvm/test/CodeGen/AMDGPU/private-access-no-objects.ll @@ -10,14 +10,13 @@ ; GCN-LABEL: {{^}}store_to_undef: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} -; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} +; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offen{{$}} ; -O0 should assume spilling, so the input scratch resource descriptor ; -should be used directly without any copies. 
; OPTNONE-NOT: s_mov_b32 -; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s5 offen{{$}} +; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} define amdgpu_kernel void @store_to_undef() #0 { store volatile i32 0, i32 addrspace(5)* undef ret void @@ -26,8 +25,7 @@ define amdgpu_kernel void @store_to_undef() #0 { ; GCN-LABEL: {{^}}store_to_inttoptr: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} -; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} +; OPT: buffer_store_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offset:124{{$}} define amdgpu_kernel void @store_to_inttoptr() #0 { store volatile i32 0, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*) ret void @@ -36,8 +34,7 @@ define amdgpu_kernel void @store_to_inttoptr() #0 { ; GCN-LABEL: {{^}}load_from_undef: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} -; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}} +; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offen{{$}} define amdgpu_kernel void @load_from_undef() #0 { %ld = load volatile i32, i32 addrspace(5)* undef ret void @@ -46,8 +43,7 @@ define amdgpu_kernel void @load_from_undef() #0 { ; GCN-LABEL: {{^}}load_from_inttoptr: ; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1] ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3] -; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s5{{$}} -; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offset:124{{$}} +; OPT: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, 0 offset:124{{$}} define amdgpu_kernel void @load_from_inttoptr() #0 { %ld = load volatile i32, i32 addrspace(5)* inttoptr (i32 124 to i32 addrspace(5)*) ret void diff --git a/llvm/test/CodeGen/AMDGPU/private-element-size.ll b/llvm/test/CodeGen/AMDGPU/private-element-size.ll index 920eaa20a97ba..d5e5ba5202fc3 100644 --- a/llvm/test/CodeGen/AMDGPU/private-element-size.ll +++ b/llvm/test/CodeGen/AMDGPU/private-element-size.ll @@ -10,32 +10,32 @@ ; HSA-ELT4: private_element_size = 1 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 -; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} - -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40 - -; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen - - -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 
offset:24{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}} - -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32 +; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} + +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:24{{$}} +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:40 + +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen + + +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:20{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:32{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:36{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:40{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:44{{$}} + +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:8{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:12{{$}} define amdgpu_kernel void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -59,53 +59,53 @@ entry: ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:64 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:80 - -; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} - - -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, 
s[0:3], s9 offset:40 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:56 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:88 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:80 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:72 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:64 - -; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen - - -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:48{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:52{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:56{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:60{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:64{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:68{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:72{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:76{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:80{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:84{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:88{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:92{{$}} - -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:16{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}} -; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}} +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:48 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:64 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:80 + +; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} + + +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:40 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:48 
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:56 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:88 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:80 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:72 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:64 + +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen + + +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:32{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:36{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:40{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:44{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:48{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:52{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:56{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:60{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:64{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:68{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:72{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:76{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:80{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:84{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:88{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:92{{$}} + +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:8{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:12{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:16{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:20{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:24{{$}} +; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:28{{$}} define amdgpu_kernel void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -130,19 +130,19 @@ entry: ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:1 -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:2 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], 0 offset:1 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], 0 offset:2 -; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen +; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} -; HSA-ELT4-DAG: 
buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:20{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -166,19 +166,19 @@ entry: ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 -; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16 +; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:24 -; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen +; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:20{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -202,32 +202,32 @@ entry: ; HSA-ELT8: private_element_size = 2 ; HSA-ELT4: private_element_size = 1 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16 -; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 -; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16 +; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32 +; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16{{$}} -; 
HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40 -; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:16{{$}} +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:24 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:40 +; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], 0 offset:32 -; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen -; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen +; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], 0 offen -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}} -; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:16{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:20{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:24{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:28{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:32{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:36{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:40{{$}} +; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], 0 offset:44{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}} -; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:4{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:8{{$}} +; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen offset:12{{$}} define amdgpu_kernel void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir index 331cccd853c24..d7892a0c97592 100644 --- a/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir +++ b/llvm/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir @@ -13,7 +13,6 @@ selected: false tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - 
scratchWaveOffsetReg: '$sgpr4' frameOffsetReg: '$sgpr4' registers: @@ -99,7 +98,6 @@ selected: false tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - scratchWaveOffsetReg: '$sgpr4' frameOffsetReg: '$sgpr4' registers: - { id: 0, class: vgpr_32, preferred-register: '' } diff --git a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir index 0d2f90793fc39..e12cff942bc6e 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -10,7 +10,6 @@ tracksRegLiveness: true machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr24_sgpr25_sgpr26_sgpr27' - scratchWaveOffsetReg: '$sgpr32' frameOffsetReg: '$sgpr32' stackPtrOffsetReg: '$sgpr32' argumentInfo: diff --git a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir index e9f0678f77823..bd0423c5457c5 100644 --- a/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir +++ b/llvm/test/CodeGen/AMDGPU/sched-handleMoveUp-subreg-def-across-subreg-def.mir @@ -11,7 +11,6 @@ frameInfo: machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' - scratchWaveOffsetReg: '$sgpr101' frameOffsetReg: '$sgpr101' stackPtrOffsetReg: '$sgpr101' argumentInfo: diff --git a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll index 3631d673fa259..d1903f457b306 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-buffer.ll @@ -9,9 +9,9 @@ ; should be able to reuse the same register for each scratch buffer access. ; GCN-LABEL: {{^}}legal_offset_fi: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offset:4{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:4{{$}} ; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8004 -; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} define amdgpu_kernel void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) { entry: @@ -47,11 +47,11 @@ done: } ; GCN-LABEL: {{^}}legal_offset_fi_offset: -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} ; This constant isn't folded, because it has multiple uses. 
; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8004 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]] -; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], 0 offen{{$}} define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) { entry: @@ -88,7 +88,7 @@ done: ; GCN-LABEL: {{^}}neg_vaddr_offset_inbounds: ; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 16, v{{[0-9]+}} -; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], 0 offen{{$}} define amdgpu_kernel void @neg_vaddr_offset_inbounds(i32 %offset) { entry: %array = alloca [8192 x i32], addrspace(5) @@ -100,7 +100,7 @@ entry: ; GCN-LABEL: {{^}}neg_vaddr_offset: ; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 16, v{{[0-9]+}} -; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], 0 offen{{$}} define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) { entry: %array = alloca [8192 x i32], addrspace(5) @@ -111,7 +111,7 @@ entry: } ; GCN-LABEL: {{^}}pos_vaddr_offset: -; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:20 +; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:20 define amdgpu_kernel void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) { entry: %array = alloca [8192 x i32], addrspace(5) diff --git a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll index a7718496ee9ab..af933dc94d6e2 100644 --- a/llvm/test/CodeGen/AMDGPU/scratch-simple.ll +++ b/llvm/test/CodeGen/AMDGPU/scratch-simple.ll @@ -29,8 +29,8 @@ ; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]] ; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]] -; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen define amdgpu_ps float @ps_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -41,8 +41,8 @@ define amdgpu_ps float @ps_main(i32 %idx) { ; GCN-LABEL: {{^}}vs_main: ; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 ; GCN-NOT: s_mov_b32 s0 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen define amdgpu_vs float @vs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -52,8 +52,8 @@ define amdgpu_vs float @vs_main(i32 %idx) { ; GCN-LABEL: {{^}}cs_main: ; GCN-DAG: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen -; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen 
+; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen define amdgpu_cs float @cs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -64,13 +64,13 @@ define amdgpu_cs float @cs_main(i32 %idx) { ; GCN-LABEL: {{^}}hs_main: ; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 ; SIVI-NOT: s_mov_b32 s0 -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GFX9_10: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 ; GFX9_10-NOT: s_mov_b32 s5 -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen define amdgpu_hs float @hs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -80,12 +80,12 @@ define amdgpu_hs float @hs_main(i32 %idx) { ; GCN-LABEL: {{^}}gs_main: ; SIVI: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s0 offen +; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GFX9_10: s_mov_b32 s0, SCRATCH_RSRC_DWORD0 -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen define amdgpu_gs float @gs_main(i32 %idx) { %v1 = extractelement <81 x float> , i32 %idx %v2 = extractelement <81 x float> , i32 %idx @@ -93,16 +93,21 @@ define amdgpu_gs float @gs_main(i32 %idx) { ret float %r } +; Mesa GS and HS shaders have the preloaded scratch wave offset SGPR fixed at +; SGPR5, and the inreg implementation is used to reference it in the IR. The +; following tests confirm the shader and anything inserted after the return +; (i.e. SI_RETURN_TO_EPILOG) can access the scratch wave offset. 
+ ; GCN-LABEL: {{^}}hs_ir_uses_scratch_offset: ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 ; SIVI-NOT: s_mov_b32 s6 -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen +; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GFX9_10-NOT: s_mov_b32 s5 -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GCN-DAG: s_mov_b32 s2, s5 define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { @@ -117,11 +122,11 @@ define amdgpu_hs <{i32, i32, i32, float}> @hs_ir_uses_scratch_offset(i32 inreg, ; GCN-LABEL: {{^}}gs_ir_uses_scratch_offset: ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen -; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s6 offen +; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; SIVI: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen -; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, s5 offen +; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen +; GFX9_10-DAG: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; GCN-DAG: s_mov_b32 s2, s5 define amdgpu_gs <{i32, i32, i32, float}> @gs_ir_uses_scratch_offset(i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg, i32 inreg %swo, i32 %idx) { diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index f37dfbd92e25e..b066cebe54863 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -293,7 +293,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v13, v6, v10 ; GCN-NEXT: v_mul_hi_u32 v10, v6, v10 ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v15, v11, vcc -; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 +; GCN-NEXT: v_add_i32_e32 v12, vcc, v12, v13 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v11, v10, vcc ; GCN-NEXT: v_addc_u32_e32 v11, vcc, v16, v14, vcc ; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 @@ -315,7 +315,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v11, v9, v8 ; GCN-NEXT: v_addc_u32_e32 v16, vcc, v15, v17, vcc ; GCN-NEXT: v_mul_lo_u32 v8, v9, v8 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v13 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v13, v7 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v16, v12, vcc ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v11, v14, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v8 @@ -338,7 +338,7 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v5, v1, v5 ; GCN-NEXT: v_mul_hi_u32 v11, v1, v6 ; GCN-NEXT: v_mul_lo_u32 v6, v1, v6 -; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v9, v5, vcc ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v11, v14, vcc ; GCN-NEXT: v_add_i32_e32 v5, 
vcc, v5, v6 @@ -1388,7 +1388,7 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v11, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v8, v4, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v9, v8, vcc ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v14, v12, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -1410,7 +1410,7 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_hi_u32 v9, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v14, vcc, v13, v15, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v14, v10, vcc ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v12, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 @@ -1600,7 +1600,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v11, v4, v8 ; GCN-NEXT: v_mul_hi_u32 v8, v4, v8 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v13, v9, vcc -; GCN-NEXT: v_add_i32_e32 v10, vcc, v11, v10 +; GCN-NEXT: v_add_i32_e32 v10, vcc, v10, v11 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v9, v8, vcc ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v14, v12, vcc ; GCN-NEXT: v_add_i32_e32 v7, vcc, v8, v7 @@ -1622,7 +1622,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_hi_u32 v9, v7, v6 ; GCN-NEXT: v_addc_u32_e32 v14, vcc, v13, v15, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v7, v6 -; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v11 +; GCN-NEXT: v_add_i32_e32 v5, vcc, v11, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v14, v10, vcc ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v9, v12, vcc ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir index afb0a8aa7e402..5021923f46755 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir @@ -35,13 +35,13 @@ # SHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) # SHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) -# SHARE: SI_SPILL_S64_SAVE killed renamable $sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) -# SHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) -# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0 +# SHARE: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) +# SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) +# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0 # SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) # SHARE: $vgpr0 = 
SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# SHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) -# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0 +# SHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) +# SHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0 # SHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) # NOSHARE: stack: @@ -60,14 +60,14 @@ # NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.2, addrspace 5) # NOSHARE: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) -# NOSHARE: SI_SPILL_S64_SAVE killed renamable $sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) -# NOSHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) -# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0 +# NOSHARE: SI_SPILL_S64_SAVE killed renamable $sgpr4_sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 8 into %stack.1, align 4, addrspace 5) +# NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) +# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0 # NOSHARE: $sgpr32 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.2, addrspace 5) # NOSHARE: SI_SPILL_S32_SAVE $sgpr32, %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.3, addrspace 5) # NOSHARE: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# NOSHARE: renamable $sgpr6_sgpr7 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) -# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0 +# NOSHARE: renamable $sgpr4_sgpr5 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 8 from %stack.1, align 4, addrspace 5) +# NOSHARE: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr4_sgpr5, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0 # NOSHARE: $sgpr32 = 
SI_SPILL_S32_RESTORE %stack.3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.3, addrspace 5) ... @@ -78,7 +78,6 @@ frameInfo: hasCalls: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 frameOffsetReg: $sgpr32 stackPtrOffsetReg: $sgpr32 body: | @@ -88,13 +87,13 @@ body: | %2:vgpr_32 = FLAT_LOAD_DWORD %1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr %3:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @func + 4, target-flags(amdgpu-rel32-hi) @func + 4, implicit-def dead $scc ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 - dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit undef $vgpr0 + dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit undef $vgpr0 $sgpr32 = COPY %0 %4:sreg_32_xm0 = COPY $sgpr32 ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 $vgpr0 = COPY %2 - dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit killed $vgpr0 + dead $sgpr30_sgpr31 = SI_CALL %3, @func, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit killed $vgpr0 $sgpr32 = COPY %4 ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll index ed57ec6cca350..8b9942c587cc0 100644 --- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll +++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll @@ -345,10 +345,10 @@ define void @shl_add_ptr_combine_2use_both_max_lds_offset(i32 %idx) #0 { ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_private: ; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 2, v0 -; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen offset:16 +; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:16 ; GCN: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 3, v0 -; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s33 offen offset:32 +; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], 0 offen offset:32 define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 { %idx = zext i16 %idx.arg to i32 %idx.add = add nuw i32 %idx, 4 @@ -364,9 +364,9 @@ define void @shl_add_ptr_combine_2use_private(i16 zeroext %idx.arg) #0 { ; GCN-LABEL: {{^}}shl_add_ptr_combine_2use_max_private_offset: ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0 -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen offset:4088 +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen offset:4088 ; GCN-DAG: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 0x1ff0, [[SCALE1]] -; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[0:3], s33 offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[0:3], 0 offen{{$}} define void @shl_add_ptr_combine_2use_max_private_offset(i16 zeroext %idx.arg) #0 { %idx = zext i16 %idx.arg to i32 %idx.add = add nuw i32 %idx, 511 @@ -382,8 +382,8 @@ define void @shl_add_ptr_combine_2use_max_private_offset(i16 zeroext %idx.arg) # ; GCN: v_add_{{[iu]}}32_e32 [[ADD:v[0-9]+]], vcc, 0x100, v0 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 4, [[ADD]] ; GCN-DAG: 
v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 5, [[ADD]] -; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], s33 offen{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], s33 offen{{$}} +; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[SCALE0]], s[0:3], 0 offen{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, [[SCALE1]], s[0:3], 0 offen{{$}} define void @shl_add_ptr_combine_2use_both_max_private_offset(i16 zeroext %idx.arg) #0 { %idx = zext i16 %idx.arg to i32 %idx.add = add nuw i32 %idx, 256 diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll index b93658665be05..72ba9152a7e30 100644 --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -3,11 +3,10 @@ ; Make sure this doesn't crash. ; ALL-LABEL: {{^}}test: ; ALL: s_mov_b32 s[[LO:[0-9]+]], SCRATCH_RSRC_DWORD0 -; ALL: s_mov_b32 s[[OFF:[0-9]+]], s3 ; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 ; Make sure we are handling hazards correctly. -; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:16 +; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16 ; SGPR-NEXT: s_waitcnt vmcnt(0) ; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]] ; SGPR-NEXT: s_nop 4 diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index f887a959cbd28..0b9eec73e191f 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -117,7 +117,7 @@ entry: ; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_byval_i32: ; GCN-NOT: v0 ; GCN-NOT: s32 -; GCN: buffer_load_dword v1, off, s[0:3], s33 offset:16 +; GCN: buffer_load_dword v1, off, s[0:3], 0 offset:16 ; GCN: buffer_store_dword v1, off, s[0:3], s32{{$}} ; GCN-NEXT: s_setpc_b64 define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %large) #1 { @@ -205,13 +205,13 @@ entry: ; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 ; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: s_mov_b32 s34, s32 +; GCN: s_mov_b32 s33, s32 ; GCN-DAG: s_add_u32 s32, s32, 0x400 -; GCN-DAG: buffer_store_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v33, off, s[0:3], s34 ; 4-byte Folded Spill -; GCN-DAG: v_writelane_b32 v34, s36, 0 -; GCN-DAG: v_writelane_b32 v34, s37, 1 +; GCN-DAG: buffer_store_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v33, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-DAG: v_writelane_b32 v34, s34, 0 +; GCN-DAG: v_writelane_b32 v34, s35, 1 ; GCN-DAG: s_getpc_b64 s[4:5] ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 @@ -220,18 +220,18 @@ entry: ; GCN: s_swappc_b64 -; GCN-DAG: v_readlane_b32 s36, v34, 0 -; GCN-DAG: v_readlane_b32 s37, v34, 1 +; GCN-DAG: v_readlane_b32 s34, v34, 0 +; GCN-DAG: v_readlane_b32 s35, v34, 1 -; GCN: buffer_load_dword v33, off, s[0:3], s34 ; 4-byte Folded Reload -; GCN: buffer_load_dword v32, off, s[0:3], s34 offset:4 ; 4-byte Folded Reload +; GCN: buffer_load_dword v33, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN: buffer_load_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+4 ; GCN: s_sub_u32 s32, s32, 0x400 -; GCN-NEXT: v_readlane_b32 s34, +; GCN-NEXT: 
v_readlane_b32 s33, ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll b/llvm/test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll deleted file mode 100644 index e1f6eb715a312..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/sp-too-many-input-sgprs.ll +++ /dev/null @@ -1,102 +0,0 @@ -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -verify-machineinstrs < %s | FileCheck -check-prefixes=MESA3D,ALL %s -; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefixes=UNKNOWN,ALL %s - -; Make sure shaders pick a workable SP with > 32 input SGPRs. -; FIXME: Doesn't seem to be getting initial value from right register? - -; ALL-LABEL: {{^}}too_many_input_sgprs_32: -; MESA3D-NOT: s34 -; MESA3D: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s34 offset:4 - -; Happens to end up in s32 anyway -; UNKNOWN-NOT: s32 -; UNKNOWN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s32 offset:4 -define amdgpu_ps i32 @too_many_input_sgprs_32(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7, - i32 inreg %arg8, i32 inreg %arg9, i32 inreg %arg10, i32 inreg %arg11, i32 inreg %arg12, i32 inreg %arg13, i32 inreg %arg14, i32 inreg %arg15, - i32 inreg %arg16, i32 inreg %arg17, i32 inreg %arg18, i32 inreg %arg19, i32 inreg %arg20, i32 inreg %arg21, i32 inreg %arg22, i32 inreg %arg23, - i32 inreg %arg24, i32 inreg %arg25, i32 inreg %arg26, i32 inreg %arg27, i32 inreg %arg28, i32 inreg %arg29, i32 inreg %arg30, i32 inreg %arg31) { -bb: - %alloca = alloca i32, align 4, addrspace(5) - store volatile i32 0, i32 addrspace(5)* %alloca - %tmp = add i32 %arg, %arg1 - %tmp32 = add i32 %tmp, %arg2 - %tmp33 = add i32 %tmp32, %arg3 - %tmp34 = add i32 %tmp33, %arg4 - %tmp35 = add i32 %tmp34, %arg5 - %tmp36 = add i32 %tmp35, %arg6 - %tmp37 = add i32 %tmp36, %arg7 - %tmp38 = add i32 %tmp37, %arg8 - %tmp39 = add i32 %tmp38, %arg9 - %tmp40 = add i32 %tmp39, %arg10 - %tmp41 = add i32 %tmp40, %arg11 - %tmp42 = add i32 %tmp41, %arg12 - %tmp43 = add i32 %tmp42, %arg13 - %tmp44 = add i32 %tmp43, %arg14 - %tmp45 = add i32 %tmp44, %arg15 - %tmp46 = add i32 %tmp45, %arg16 - %tmp47 = add i32 %tmp46, %arg17 - %tmp48 = add i32 %tmp47, %arg18 - %tmp49 = add i32 %tmp48, %arg19 - %tmp50 = add i32 %tmp49, %arg20 - %tmp51 = add i32 %tmp50, %arg21 - %tmp52 = add i32 %tmp51, %arg22 - %tmp53 = add i32 %tmp52, %arg23 - %tmp54 = add i32 %tmp53, %arg24 - %tmp55 = add i32 %tmp54, %arg25 - %tmp56 = add i32 %tmp55, %arg26 - %tmp57 = add i32 %tmp56, %arg27 - %tmp58 = add i32 %tmp57, %arg28 - %tmp59 = add i32 %tmp58, %arg29 - %tmp60 = add i32 %tmp59, %arg30 - %tmp61 = add i32 %tmp60, %arg31 - ret i32 %tmp61 -} - -; ALL-LABEL: {{^}}too_many_input_sgprs_33: -; MESA3D-NOT: s35 -; MESA3D: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s35 offset:4 - -; UNKNOWN-NOT: s33 -; UNKNOWN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s33 offset:4 -define amdgpu_ps i32 @too_many_input_sgprs_33(i32 inreg %arg, i32 inreg %arg1, i32 inreg %arg2, i32 inreg %arg3, i32 inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 inreg %arg7, - i32 inreg %arg8, i32 inreg %arg9, i32 inreg %arg10, i32 inreg %arg11, i32 inreg %arg12, i32 inreg %arg13, i32 inreg %arg14, i32 inreg %arg15, - i32 inreg %arg16, i32 inreg %arg17, i32 inreg %arg18, i32 inreg %arg19, i32 inreg %arg20, i32 inreg 
%arg21, i32 inreg %arg22, i32 inreg %arg23, - i32 inreg %arg24, i32 inreg %arg25, i32 inreg %arg26, i32 inreg %arg27, i32 inreg %arg28, i32 inreg %arg29, i32 inreg %arg30, i32 inreg %arg31, - i32 inreg %arg32) { -bb: - %alloca = alloca i32, align 4, addrspace(5) - store volatile i32 0, i32 addrspace(5)* %alloca - %tmp = add i32 %arg, %arg1 - %tmp32 = add i32 %tmp, %arg2 - %tmp33 = add i32 %tmp32, %arg3 - %tmp34 = add i32 %tmp33, %arg4 - %tmp35 = add i32 %tmp34, %arg5 - %tmp36 = add i32 %tmp35, %arg6 - %tmp37 = add i32 %tmp36, %arg7 - %tmp38 = add i32 %tmp37, %arg8 - %tmp39 = add i32 %tmp38, %arg9 - %tmp40 = add i32 %tmp39, %arg10 - %tmp41 = add i32 %tmp40, %arg11 - %tmp42 = add i32 %tmp41, %arg12 - %tmp43 = add i32 %tmp42, %arg13 - %tmp44 = add i32 %tmp43, %arg14 - %tmp45 = add i32 %tmp44, %arg15 - %tmp46 = add i32 %tmp45, %arg16 - %tmp47 = add i32 %tmp46, %arg17 - %tmp48 = add i32 %tmp47, %arg18 - %tmp49 = add i32 %tmp48, %arg19 - %tmp50 = add i32 %tmp49, %arg20 - %tmp51 = add i32 %tmp50, %arg21 - %tmp52 = add i32 %tmp51, %arg22 - %tmp53 = add i32 %tmp52, %arg23 - %tmp54 = add i32 %tmp53, %arg24 - %tmp55 = add i32 %tmp54, %arg25 - %tmp56 = add i32 %tmp55, %arg26 - %tmp57 = add i32 %tmp56, %arg27 - %tmp58 = add i32 %tmp57, %arg28 - %tmp59 = add i32 %tmp58, %arg29 - %tmp60 = add i32 %tmp59, %arg30 - %tmp61 = add i32 %tmp60, %arg31 - %tmp62 = add i32 %tmp61, %arg32 - ret i32 %tmp62 -} diff --git a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll index 9c7279a78e75e..8fedd62ac6102 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-agpr.ll @@ -6,8 +6,8 @@ ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 -; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload +; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; A2V: ScratchSize: 0 define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 { @@ -35,8 +35,8 @@ bb: ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a4 -; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload +; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload ; A2V: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; A2V: ScratchSize: 0 define amdgpu_kernel void @max_12regs_13a_used(<4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 { @@ -64,8 +64,8 @@ st: ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 -; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} 
offset:[[FI]] ; 4-byte Folded Reload +; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; A2V: ScratchSize: 0 define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #1 { @@ -80,8 +80,8 @@ define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #1 { ; A2M-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1 ; A2V-NOT: SCRATCH_RSRC ; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0 -; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill -; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload +; A2M: buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], 0 offset:[[FI:[0-9]+]] ; 4-byte Folded Spill +; A2M: buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], 0 offset:[[FI]] ; 4-byte Folded Reload ; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]] ; A2V: ScratchSize: 0 define amdgpu_kernel void @max_32regs_mfma32(float addrspace(1)* %arg) #3 { diff --git a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir index c56387918719e..86f2de31e7f96 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-before-exec.mir @@ -10,7 +10,6 @@ name: foo tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 registers: - { id: 0, class: sreg_64 } @@ -36,6 +35,10 @@ registers: - { id: 1119, class: sgpr_128 } - { id: 1120, class: sgpr_128 } - { id: 1121, class: sgpr_128 } + - { id: 1122, class: sgpr_128 } + - { id: 1123, class: sgpr_128 } + - { id: 1124, class: sgpr_128 } + - { id: 1125, class: sgpr_128 } body: | bb.0: successors: %bb.1 @@ -63,6 +66,10 @@ body: | %1119 = COPY %1100 %1120 = COPY %1100 %1121 = COPY %1100 + %1122 = COPY %1100 + %1123 = COPY %1100 + %1124 = COPY %1100 + %1125 = COPY %1100 S_BRANCH %bb.1 bb.1: @@ -97,6 +104,8 @@ body: | S_CMP_EQ_U64 %1116.sub0_sub1, %1117.sub2_sub3, implicit-def $scc S_CMP_EQ_U64 %1118.sub0_sub1, %1119.sub2_sub3, implicit-def $scc S_CMP_EQ_U64 %1120.sub0_sub1, %1121.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %1122.sub0_sub1, %1123.sub2_sub3, implicit-def $scc + S_CMP_EQ_U64 %1124.sub0_sub1, %1125.sub2_sub3, implicit-def $scc $vgpr0 = V_MOV_B32_e32 0, implicit $exec S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %0, implicit $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll index 69a4d7eac9ea6..c98e344b50095 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -4,13 +4,13 @@ ; GCN: s_or_saveexec_b64 ; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v32, s34, 2 +; GCN: v_writelane_b32 v32, s33, 2 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 -; GCN: buffer_store_dword [[K]], off, s[0:3], s34{{$}} +; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: v_readlane_b32 s34, v32, 2 +; GCN: v_readlane_b32 s33, v32, 2 ; GCN: s_or_saveexec_b64 ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec diff --git 
a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir index 0cf19cea78110..fd0debda403c2 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-empty-live-interval.mir @@ -21,7 +21,6 @@ name: expecting_non_empty_interval tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: @@ -55,7 +54,6 @@ name: rematerialize_empty_interval_has_reference tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/spill-m0.ll b/llvm/test/CodeGen/AMDGPU/spill-m0.ll index 2d7f7f9e33f57..1d65471dced14 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-m0.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-m0.ll @@ -14,7 +14,7 @@ ; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0 ; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]] -; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12 ; 4-byte Folded Spill +; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; 4-byte Folded Spill ; GCN: s_cbranch_scc1 [[ENDIF:BB[0-9]+_[0-9]+]] @@ -22,7 +22,7 @@ ; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], 2 ; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]] -; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12 ; 4-byte Folded Reload +; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:12 ; 4-byte Folded Reload ; TOVMEM: s_waitcnt vmcnt(0) ; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]] ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]] diff --git a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll index 5ff1a1cab18ee..969edbf12647d 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-offset-calculation.ll @@ -13,7 +13,7 @@ entry: %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr ; Force %a to spill. @@ -35,7 +35,7 @@ entry: %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 ; 0x40000 / 64 = 4096 (for wave64) - ; CHECK: s_add_u32 s6, s7, 0x40000 + ; CHECK: s_mov_b32 s6, 0x40000 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill %a = load volatile i32, i32 addrspace(5)* %aptr @@ -48,39 +48,8 @@ entry: ret void } -; CHECK-LABEL: test_sgpr_offset_kernel_scavenge_fail -define amdgpu_kernel void @test_sgpr_offset_kernel_scavenge_fail() #1 { -entry: - ; Occupy 4096 bytes of scratch, so the offset of the spill of %a does not - ; fit in the instruction, and has to live in the SGPR offset. 
- %alloca = alloca i8, i32 4092, align 4, addrspace(5) - %buf = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* - - %aptr = getelementptr i32, i32 addrspace(5)* %buf, i32 1 - - ; 0x40000 / 64 = 4096 (for wave64) - %a = load volatile i32, i32 addrspace(5)* %aptr - - %asm = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "", "=s,=s,=s,=s,=s,=s,=s,=s"() - %asm0 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 0 - %asm1 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 1 - %asm2 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 2 - %asm3 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 3 - %asm4 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 4 - %asm5 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 5 - %asm6 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 6 - %asm7 = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm, 7 - - call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0 - - ; CHECK: s_add_u32 s7, s7, 0x40000 - ; CHECK: buffer_load_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s7 ; 4-byte Folded Reload - ; CHECK: s_sub_u32 s7, s7, 0x40000 - - ; Force %a to spill with no free SGPRs - call void asm sideeffect "", "s,s,s,s,s,s,s,s,v"(i32 %asm0, i32 %asm1, i32 %asm2, i32 %asm3, i32 %asm4, i32 %asm5, i32 %asm6, i32 %asm7, i32 %a) - ret void -} +; FIXME: If we fail to scavenge an SGPR in a kernel we don't have a stack +; pointer to temporarily update, so we just crash. ; CHECK-LABEL: test_sgpr_offset_function_scavenge_fail define void @test_sgpr_offset_function_scavenge_fail() #2 { @@ -141,8 +110,8 @@ entry: %bufv1 = bitcast i8 addrspace(5)* %alloca to i32 addrspace(5)* %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4088 ; 4-byte Folded Spill - ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4092 ; 4-byte Folded Spill + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4088 ; 4-byte Folded Spill + ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4092 ; 4-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 %a = load volatile <2 x i32>, <2 x i32> addrspace(5)* %aptr @@ -170,7 +139,7 @@ entry: %bufv2 = bitcast i8 addrspace(5)* %alloca to <2 x i32> addrspace(5)* ; 0x3ff00 / 64 = 4092 (for wave64) - ; CHECK: s_add_u32 s6, s7, 0x3ff00 + ; CHECK: s_mov_b32 s6, 0x3ff00 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 ; 4-byte Folded Spill ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s6 offset:4 ; 4-byte Folded Spill %aptr = getelementptr <2 x i32>, <2 x i32> addrspace(5)* %bufv2, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll index 78e2885c523a7..9fe431ea486fb 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -246,7 +246,7 @@ define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* % ; GFX908-DAG v_accvgpr_read_b32 ; GCN: NumVgprs: 256 -; GFX900: ScratchSize: 644 +; GFX900: ScratchSize: 708 ; GFX908-FIXME: ScratchSize: 0 ; GCN: VGPRBlocks: 63 ; GCN: NumVGPRsForWavesPerEU: 256 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 96145858a303d..97a6c3757b0b4 100644 
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -271,7 +271,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v12, v5, v9 ; GCN-NEXT: v_mul_hi_u32 v9, v5, v9 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v14, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v10, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v15, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 @@ -293,7 +293,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v10, v8, v7 ; GCN-NEXT: v_addc_u32_e32 v15, vcc, v14, v16, vcc ; GCN-NEXT: v_mul_lo_u32 v7, v8, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v12, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v15, v11, vcc ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v13, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 @@ -316,7 +316,7 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v10, v1, v5 ; GCN-NEXT: v_mul_lo_u32 v5, v1, v5 -; GCN-NEXT: v_add_i32_e32 v7, vcc, v9, v7 +; GCN-NEXT: v_add_i32_e32 v7, vcc, v7, v9 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v8, v4, vcc ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v10, v13, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 @@ -1572,7 +1572,7 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v10, v3, v7 ; GCN-NEXT: v_mul_hi_u32 v7, v3, v7 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v8, v7, vcc ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 @@ -1594,7 +1594,7 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_hi_u32 v8, v6, v5 ; GCN-NEXT: v_addc_u32_e32 v13, vcc, v12, v14, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v6, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v13, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v11, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 @@ -1782,7 +1782,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v10, v3, v7 ; GCN-NEXT: v_mul_hi_u32 v7, v3, v7 ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v12, v8, vcc -; GCN-NEXT: v_add_i32_e32 v9, vcc, v10, v9 +; GCN-NEXT: v_add_i32_e32 v9, vcc, v9, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v8, v7, vcc ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 @@ -1804,7 +1804,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_hi_u32 v8, v6, v5 ; GCN-NEXT: v_addc_u32_e32 v13, vcc, v12, v14, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v6, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v13, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v11, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll index be60a34b42089..497d35ea3d710 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -1,6 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck -check-prefix=GCN %s +; FIXME: The MUBUF 
loads in this test output are incorrect, their SOffset +; should use the frame offset register, not the ABI stack pointer register. We +; rely on the frame index argument of MUBUF stack accesses to survive until PEI +; so we can fix up the SOffset to use the correct frame register in +; eliminateFrameIndex. Some things like LocalStackSlotAllocation can lift the +; frame index up into something (e.g. `v_add_nc_u32`) that we cannot fold back +; into the MUBUF instruction, and so we end up emitting an incorrect offset. +; Fixing this may involve adding stack access pseudos so that we don't have to +; speculatively refer to the ABI stack pointer register at all. + ; An assert was hit when frame offset register was used to address FrameIndex. define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <4 x i32> addrspace(1)* %input, <4 x float> addrspace(1)* %output, i32 %i) { ; GCN-LABEL: kernel_background_evaluate: @@ -10,14 +20,15 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, < ; GCN-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s38, -1 ; GCN-NEXT: s_mov_b32 s39, 0x31c16000 -; GCN-NEXT: s_mov_b32 s33, s3 -; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] +; GCN-NEXT: s_add_u32 s36, s36, s3 +; GCN-NEXT: s_addc_u32 s37, s37, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0x2000 ; GCN-NEXT: v_mov_b32_e32 v2, 0x4000 ; GCN-NEXT: v_mov_b32_e32 v3, 0 -; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] ; GCN-NEXT: v_mov_b32_e32 v4, 0x400000 -; GCN-NEXT: s_add_u32 s32, s33, 0xc0000 +; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] +; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] +; GCN-NEXT: s_mov_b32 s32, 0xc0000 ; GCN-NEXT: v_add_nc_u32_e64 v32, 4, 0x4000 ; GCN-NEXT: ; implicit-def: $vcc_hi ; GCN-NEXT: s_getpc_b64 s[4:5] @@ -36,7 +47,7 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, < ; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GCN-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0 ; GCN-NEXT: v_add_nc_u32_e32 v0, 0x3039, v0 -; GCN-NEXT: buffer_store_dword v0, v0, s[36:39], s33 offen +; GCN-NEXT: buffer_store_dword v0, v0, s[36:39], 0 offen ; GCN-NEXT: BB0_2: ; %shader_eval_surface.exit ; GCN-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll index deb94f521f00e..93b6eacf34292 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign-kernel.ll @@ -7,10 +7,12 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; VI-LABEL: max_alignment_128: ; VI: ; %bb.0: ; VI-NEXT: s_add_u32 s4, s4, s7 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; VI-NEXT: s_add_u32 s0, s0, s7 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:128 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 ; VI-NEXT: s_endpgm ; VI-NEXT: .section .rodata,#alloc ; VI-NEXT: .p2align 6 @@ -52,9 +54,11 @@ define amdgpu_kernel void @max_alignment_128() #0 { ; GFX9-LABEL: max_alignment_128: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:128 +; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:128 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section 
.rodata,#alloc ; GFX9-NEXT: .p2align 6 @@ -102,10 +106,12 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; VI-LABEL: stackrealign_attr: ; VI: ; %bb.0: ; VI-NEXT: s_add_u32 s4, s4, s7 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; VI-NEXT: s_add_u32 s0, s0, s7 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; VI-NEXT: s_endpgm ; VI-NEXT: .section .rodata,#alloc ; VI-NEXT: .p2align 6 @@ -147,9 +153,11 @@ define amdgpu_kernel void @stackrealign_attr() #1 { ; GFX9-LABEL: stackrealign_attr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4 +; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section .rodata,#alloc ; GFX9-NEXT: .p2align 6 @@ -197,10 +205,12 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; VI-LABEL: alignstack_attr: ; VI: ; %bb.0: ; VI-NEXT: s_add_u32 s4, s4, s7 +; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 +; VI-NEXT: s_add_u32 s0, s0, s7 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, 9 ; VI-NEXT: s_mov_b32 flat_scratch_lo, s5 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8 -; VI-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4 +; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; VI-NEXT: s_endpgm ; VI-NEXT: .section .rodata,#alloc ; VI-NEXT: .p2align 6 @@ -242,9 +252,11 @@ define amdgpu_kernel void @alignstack_attr() #2 { ; GFX9-LABEL: alignstack_attr: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, 9 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s7 offset:4 +; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX9-NEXT: s_endpgm ; GFX9-NEXT: .section .rodata,#alloc ; GFX9-NEXT: .p2align 6 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll index 5674f6e15e103..2a3cfe7a09928 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -9,18 +9,17 @@ ; = 144 bytes with padding between them ; GCN-LABEL: {{^}}needs_align16_default_stack_align: -; GCN: s_sub_u32 [[SUB:s[0-9]+]], s32, s33 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, v0 -; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, [[SUB]] +; GCN-DAG: v_lshrrev_b32_e64 [[FRAMEDIFF:v[0-9]+]], 6, s32 ; GCN: v_add_u32_e32 [[FI:v[0-9]+]], vcc, [[FRAMEDIFF]], [[SCALED_IDX]] ; GCN-NOT: s32 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, 
s[0:3], 0 offen ; GCN-NOT: s32 @@ -34,14 +33,14 @@ define void @needs_align16_default_stack_align(i32 %idx) #0 { ; GCN-LABEL: {{^}}needs_align16_stack_align4: ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x3c0{{$}} -; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xfffffc00 -; GCN: s_add_u32 s32, s32, 0x2800{{$}} +; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xfffffc00 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: s_add_u32 s32, s32, 0x2800{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: s_sub_u32 s32, s32, 0x2800 @@ -55,14 +54,14 @@ define void @needs_align16_stack_align4(i32 %idx) #2 { ; GCN-LABEL: {{^}}needs_align32: ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0x7c0{{$}} -; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xfffff800 -; GCN: s_add_u32 s32, s32, 0x3000{{$}} +; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xfffff800 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: v_or_b32_e32 v{{[0-9]+}}, 12 -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: s_add_u32 s32, s32, 0x3000{{$}} +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: s_sub_u32 s32, s32, 0x3000 @@ -76,10 +75,10 @@ define void @needs_align32(i32 %idx) #0 { ; GCN-LABEL: {{^}}force_realign4: ; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xc0{{$}} -; GCN: s_and_b32 s34, [[SCRATCH_REG]], 0xffffff00 +; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffffff00 ; GCN: s_add_u32 s32, s32, 0xd00{{$}} -; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s33 offen +; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], 0 offen ; GCN: s_sub_u32 s32, s32, 0xd00 ; GCN: ; ScratchSize: 52 @@ -91,8 +90,7 @@ define void @force_realign4(i32 %idx) #1 { } ; GCN-LABEL: {{^}}kernel_call_align16_from_8: -; GCN: s_mov_b32 s33, s7{{$}} -; GCN-NEXT: s_add_u32 s32, s33, 0x400{{$}} +; GCN: s_movk_i32 s32, 0x400{{$}} ; GCN-NOT: s32 ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align16_from_8() #0 { @@ -104,8 +102,7 @@ define amdgpu_kernel void @kernel_call_align16_from_8() #0 { ; The call sequence should keep the stack on call aligned to 4 ; GCN-LABEL: {{^}}kernel_call_align16_from_5: -; GCN: s_mov_b32 s33, s7{{$}} -; GCN-NEXT: s_add_u32 s32, s33, 0x400 +; GCN: s_movk_i32 s32, 0x400 ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align16_from_5() { %alloca0 = alloca i8, align 1, addrspace(5) @@ -116,8 +113,7 @@ define amdgpu_kernel void @kernel_call_align16_from_5() { } ; GCN-LABEL: {{^}}kernel_call_align4_from_5: -; GCN: s_mov_b32 s33, s7{{$}} -; GCN: s_add_u32 s32, s33, 0x400 +; GCN: s_movk_i32 s32, 0x400 ; GCN: s_swappc_b64 define amdgpu_kernel void @kernel_call_align4_from_5() { %alloca0 = alloca i8, align 1, 
addrspace(5) @@ -129,13 +125,13 @@ define amdgpu_kernel void @kernel_call_align4_from_5() { ; GCN-LABEL: {{^}}default_realign_align128: ; GCN: s_add_u32 [[TMP:s[0-9]+]], s32, 0x1fc0 -; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s34 -; GCN-NEXT: s_and_b32 s34, [[TMP]], 0xffffe000 +; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33 +; GCN-NEXT: s_and_b32 s33, [[TMP]], 0xffffe000 ; GCN-NEXT: s_add_u32 s32, s32, 0x4000 -; GCN-NOT: s34 -; GCN: buffer_store_dword v0, off, s[0:3], s34{{$}} +; GCN-NOT: s33 +; GCN: buffer_store_dword v0, off, s[0:3], s33{{$}} ; GCN: s_sub_u32 s32, s32, 0x4000 -; GCN: s_mov_b32 s34, [[FP_COPY]] +; GCN: s_mov_b32 s33, [[FP_COPY]] define void @default_realign_align128(i32 %idx) #0 { %alloca.align = alloca i32, align 128, addrspace(5) store volatile i32 9, i32 addrspace(5)* %alloca.align, align 128 diff --git a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir index 77010834cc3ee..e446320f55a2b 100644 --- a/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir +++ b/llvm/test/CodeGen/AMDGPU/stack-slot-color-sgpr-vgpr-spills.mir @@ -12,15 +12,14 @@ # CHECK: SI_SPILL_V32_SAVE killed $vgpr0, %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (store 4 into %stack.0, addrspace 5) # CHECK: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) -# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr6, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.1, addrspace 5) -# CHECK: $sgpr6 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.1, addrspace 5) +# CHECK: SI_SPILL_S32_SAVE killed renamable $sgpr5, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (store 4 into %stack.1, addrspace 5) +# CHECK: $sgpr5 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 :: (load 4 from %stack.1, addrspace 5) name: no_merge_sgpr_vgpr_spill_slot tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 - frameOffsetReg: $sgpr5 + frameOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 body: | bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/store-hi16.ll b/llvm/test/CodeGen/AMDGPU/store-hi16.ll index e8d6b24efd3eb..51ed07aa2964b 100644 --- a/llvm/test/CodeGen/AMDGPU/store-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/store-hi16.ll @@ -389,10 +389,10 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2i16: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s33 offen{{$}} +; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s33 offen{{$}} +; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -408,10 +408,10 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2f16: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s33 offen{{$}} +; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI: buffer_store_short v1, v0, s[0:3], s33 offen{{$}} +; NO-D16-HI: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -427,10 +427,10 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_i32_shift: ; GCN: 
s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s33 offen{{$}} +; GFX900-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], 0 offen{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], s33 offen{{$}} +; NO-D16-HI-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -445,10 +445,10 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s33 offen{{$}} +; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s33 offen{{$}} +; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -464,10 +464,10 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_i8_shift: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], s33 offen{{$}} +; GFX900-NEXT: buffer_store_byte_d16_hi v1, v0, s[0:3], 0 offen{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], s33 offen{{$}} +; NO-D16-HI-NEXT: buffer_store_byte v1, v0, s[0:3], 0 offen{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -502,10 +502,10 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2i16_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], s33{{$}} +; GFX900-NEXT: buffer_store_short_d16_hi v0, off, s[0:3], 0{{$}} ; NO-D16-HI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], s33{{$}} +; NO-D16-HI-NEXT: buffer_store_short v0, off, s[0:3], 0{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 @@ -522,10 +522,10 @@ entry: ; GCN-LABEL: {{^}}store_private_hi_v2i16_i8_nooff: ; GCN: s_waitcnt -; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], s33{{$}} +; GFX900-NEXT: buffer_store_byte_d16_hi v0, off, s[0:3], 0{{$}} ; NO-D16-HI: v_lshrrev_b32_e32 v0, 16, v0 -; NO-D16-HI: buffer_store_byte v0, off, s[0:3], s33{{$}} +; NO-D16-HI: buffer_store_byte v0, off, s[0:3], 0{{$}} ; GCN-NEXT: s_waitcnt ; GCN-NEXT: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir index 05ddadad86bbb..778c12b3dae06 100644 --- a/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir +++ b/llvm/test/CodeGen/AMDGPU/subreg-split-live-in-error.mir @@ -41,7 +41,6 @@ name: _amdgpu_ps_main tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 stackPtrOffsetReg: $sgpr32 liveins: - { reg: '$vgpr2', virtual-reg: '%0' } diff --git a/llvm/test/CodeGen/AMDGPU/subvector-test.mir b/llvm/test/CodeGen/AMDGPU/subvector-test.mir index 508731a75e1b3..8fd27d1c62d36 100644 --- a/llvm/test/CodeGen/AMDGPU/subvector-test.mir +++ b/llvm/test/CodeGen/AMDGPU/subvector-test.mir @@ -7,7 +7,6 @@ name: subvector-basic-bb tracksRegLiveness: true machineFunctionInfo: scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3 - scratchWaveOffsetReg: $sgpr4 frameOffsetReg: $sgpr5 stackPtrOffsetReg: $sgpr32 body: | diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 86b4a39057c37..7a4065eeac465 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -256,7 +256,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v12, v5, v9 ; GCN-NEXT: v_mul_hi_u32 v9, v5, v9 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, 
v14, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v10, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v15, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 @@ -278,7 +278,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v10, v8, v7 ; GCN-NEXT: v_addc_u32_e32 v15, vcc, v14, v16, vcc ; GCN-NEXT: v_mul_lo_u32 v7, v8, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v12, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v15, v11, vcc ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v13, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 @@ -296,7 +296,7 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v14, v8, vcc ; GCN-NEXT: v_mul_lo_u32 v8, v1, v4 ; GCN-NEXT: v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 @@ -1174,7 +1174,7 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v10, v3, v9 ; GCN-NEXT: v_mul_hi_u32 v9, v3, v9 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 @@ -1196,7 +1196,7 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_hi_u32 v8, v6, v5 ; GCN-NEXT: v_addc_u32_e32 v13, vcc, v12, v14, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v6, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v13, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v11, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 @@ -1682,7 +1682,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-NEXT: v_addc_u32_e32 v5, vcc, v10, v6, vcc ; GCN-NEXT: v_mul_lo_u32 v6, v1, v2 ; GCN-NEXT: v_mul_hi_u32 v2, v1, v2 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v6, v4 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v6 ; GCN-NEXT: v_addc_u32_e32 v2, vcc, v5, v2, vcc ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v9, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 9df153381d83e..9a6f7002ca87b 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -266,7 +266,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_lo_u32 v12, v5, v9 ; GCN-NEXT: v_mul_hi_u32 v9, v5, v9 ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v14, v10, vcc -; GCN-NEXT: v_add_i32_e32 v11, vcc, v12, v11 +; GCN-NEXT: v_add_i32_e32 v11, vcc, v11, v12 ; GCN-NEXT: v_addc_u32_e32 v9, vcc, v10, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v10, vcc, v15, v13, vcc ; GCN-NEXT: v_add_i32_e32 v8, vcc, v9, v8 @@ -288,7 +288,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_mul_hi_u32 v10, v8, v7 ; GCN-NEXT: v_addc_u32_e32 v15, vcc, v14, v16, vcc ; GCN-NEXT: v_mul_lo_u32 v7, v8, v7 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v12 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v12, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v15, v11, vcc ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v10, v13, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v7 @@ -306,7 +306,7 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v14, v8, vcc ; GCN-NEXT: v_mul_lo_u32 v8, v1, v4 ; GCN-NEXT: 
v_mul_hi_u32 v4, v1, v4 -; GCN-NEXT: v_add_i32_e32 v6, vcc, v8, v6 +; GCN-NEXT: v_add_i32_e32 v6, vcc, v6, v8 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v7, v4, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v9, v13, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 @@ -1191,7 +1191,7 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_lo_u32 v10, v3, v9 ; GCN-NEXT: v_mul_hi_u32 v9, v3, v9 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v12, v7, vcc -; GCN-NEXT: v_add_i32_e32 v8, vcc, v10, v8 +; GCN-NEXT: v_add_i32_e32 v8, vcc, v8, v10 ; GCN-NEXT: v_addc_u32_e32 v7, vcc, v7, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v8, vcc, v13, v11, vcc ; GCN-NEXT: v_add_i32_e32 v6, vcc, v7, v6 @@ -1213,7 +1213,7 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_mul_hi_u32 v8, v6, v5 ; GCN-NEXT: v_addc_u32_e32 v13, vcc, v12, v14, vcc ; GCN-NEXT: v_mul_lo_u32 v5, v6, v5 -; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v10 +; GCN-NEXT: v_add_i32_e32 v4, vcc, v10, v4 ; GCN-NEXT: v_addc_u32_e32 v4, vcc, v13, v9, vcc ; GCN-NEXT: v_addc_u32_e32 v6, vcc, v8, v11, vcc ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll index b1ad28a2e40f1..07a29adc63931 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -22,9 +22,9 @@ ; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000 ; OFFREG is offset system SGPR -; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s12 offset:{{[0-9]+}} ; 4-byte Folded Spill -; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s12 offset:{{[0-9]+}} ; 4-byte Folded Reload -; GCN: NumVgprs: 256 +; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Spill +; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], 0 offset:{{[0-9]+}} ; 4-byte Folded Reload +; GCN: NumVgprs: 255 ; GCN: ScratchSize: 1536 define amdgpu_vs void @main([9 x <4 x i32>] addrspace(4)* inreg %arg, [17 x <4 x i32>] addrspace(4)* inreg %arg1, [17 x <4 x i32>] addrspace(4)* inreg %arg2, [34 x <8 x i32>] addrspace(4)* inreg %arg3, [16 x <4 x i32>] addrspace(4)* inreg %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { diff --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir index a5d5e7c82d70b..98849ba3cbc43 100644 --- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir @@ -24,7 +24,6 @@ frameInfo: machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - scratchWaveOffsetReg: '$sgpr95' frameOffsetReg: '$sgpr95' stackPtrOffsetReg: '$sgpr32' body: | diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index e3149be899c06..f9d4e3a5abadd 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1063,8 +1063,8 @@ declare void @external_void_func_void() #1 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]] -; GCN-NEXT: v_writelane_b32 v32, s34, 2 -; GCN: s_mov_b32 s34, s32 +; GCN-NEXT: v_writelane_b32 v32, s33, 2 +; GCN: s_mov_b32 s33, s32 ; GFX1064: s_add_u32 s32, s32, 0x400 ; GFX1032: s_add_u32 s32, s32, 0x200 @@ -1078,7 +1078,7 @@ declare void @external_void_func_void() #1 ; 
GFX1064: s_sub_u32 s32, s32, 0x400 ; GFX1032: s_sub_u32 s32, s32, 0x200 -; GCN: v_readlane_b32 s34, v32, 2 +; GCN: v_readlane_b32 s33, v32, 2 ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}} ; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 1026d63d70b7a..e3183989e7d24 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -693,11 +693,11 @@ break: ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 ; CHECK: s_wqm_b64 exec, exec -; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4{{$}} +; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4{{$}} ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen ; CHECK: s_wqm_b64 exec, exec -; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen +; CHECK: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen ; CHECK: s_and_b64 exec, exec, [[LIVE]] ; CHECK: image_sample diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll index 1b1118e7869d0..c69a9f58965e7 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -44,7 +44,7 @@ entry: ; GFX9: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]] ; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]] -; GFX9-O0: buffer_store_dword v[[FIRST]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[FIRST_SGPR_OFFSET:[0-9]+]] offset:[[FIRST_IMM_OFFSET:[0-9]+]] +; GFX9-O0: buffer_store_dword v[[FIRST]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET:[0-9]+]] %tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false) %tmp121 = add i32 %tmp105, %tmp120 %tmp122 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp121) @@ -58,7 +58,7 @@ if: ; GFX9: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf ; GFX9: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]] ; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]] -; GFX9-O0: buffer_store_dword v[[SECOND]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[SECOND_SGPR_OFFSET:[0-9]+]] offset:[[SECOND_IMM_OFFSET:[0-9]+]] +; GFX9-O0: buffer_store_dword v[[SECOND]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET:[0-9]+]] %tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false) %tmp136 = add i32 %tmp107, %tmp135 %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) @@ -67,8 +67,8 @@ if: merge: %merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ] ; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]] -; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[SECOND_SGPR_OFFSET]] offset:[[SECOND_IMM_OFFSET]] -; GFX9-O0: buffer_load_dword v[[FIRST:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, s[[FIRST_SGPR_OFFSET]] offset:[[FIRST_IMM_OFFSET]] +; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET]] +; GFX9-O0: 
buffer_load_dword v[[FIRST:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET]] ; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]] %tmp138 = icmp eq i32 %tmp122, %merge_value %tmp139 = sext i1 %tmp138 to i32 diff --git a/llvm/test/CodeGen/ARM/indvar-cost.ll b/llvm/test/CodeGen/ARM/indvar-cost.ll new file mode 100644 index 0000000000000..df4c71777b964 --- /dev/null +++ b/llvm/test/CodeGen/ARM/indvar-cost.ll @@ -0,0 +1,514 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -indvars -mtriple=thumbv8m.base -S %s -o - | FileCheck %s --check-prefix=CHECK-T1 +; RUN: opt -indvars -mtriple=thumbv8m.main -S %s -o - | FileCheck %s --check-prefix=CHECK-T2 + +define dso_local arm_aapcscc void @arm_conv_fast_q15(i16* %pSrcA, i32 %srcALen, i16* %pSrcB, i32 %srcBLen, i16* %pDst, i16** %store.px, i16** %store.py, i32* %store.res) local_unnamed_addr { +; CHECK-T1-LABEL: @arm_conv_fast_q15( +; CHECK-T1-NEXT: entry: +; CHECK-T1-NEXT: [[CMP:%.*]] = icmp ult i32 [[SRCALEN:%.*]], [[SRCBLEN:%.*]] +; CHECK-T1-NEXT: [[SRCALEN_SRCBLEN:%.*]] = select i1 [[CMP]], i32 [[SRCALEN]], i32 [[SRCBLEN]] +; CHECK-T1-NEXT: [[PSRCB_PSRCA:%.*]] = select i1 [[CMP]], i16* [[PSRCB:%.*]], i16* [[PSRCA:%.*]] +; CHECK-T1-NEXT: [[PSRCA_PSRCB:%.*]] = select i1 [[CMP]], i16* [[PSRCA]], i16* [[PSRCB]] +; CHECK-T1-NEXT: [[SUB:%.*]] = add i32 [[SRCALEN_SRCBLEN]], -1 +; CHECK-T1-NEXT: [[CMP41080:%.*]] = icmp eq i32 [[SUB]], 0 +; CHECK-T1-NEXT: br i1 [[CMP41080]], label [[WHILE_END13:%.*]], label [[WHILE_COND5_PREHEADER_PREHEADER:%.*]] +; CHECK-T1: while.cond5.preheader.preheader: +; CHECK-T1-NEXT: [[TMP0:%.*]] = add i32 [[SRCALEN_SRCBLEN]], -2 +; CHECK-T1-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 2 +; CHECK-T1-NEXT: [[UMIN:%.*]] = select i1 [[TMP1]], i32 [[TMP0]], i32 2 +; CHECK-T1-NEXT: br label [[WHILE_COND5_PREHEADER:%.*]] +; CHECK-T1: while.cond5.preheader: +; CHECK-T1-NEXT: [[COUNT_01084:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_END:%.*]] ], [ 1, [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T1-NEXT: [[BLOCKSIZE1_01083:%.*]] = phi i32 [ [[DEC12:%.*]], [[WHILE_END]] ], [ [[SUB]], [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T1-NEXT: [[PY_01082:%.*]] = phi i16* [ [[ADD_PTR:%.*]], [[WHILE_END]] ], [ [[PSRCA_PSRCB]], [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T1-NEXT: [[POUT_01081:%.*]] = phi i16* [ [[INCDEC_PTR11:%.*]], [[WHILE_END]] ], [ [[PDST:%.*]], [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T1-NEXT: br label [[WHILE_BODY7:%.*]] +; CHECK-T1: while.body7: +; CHECK-T1-NEXT: [[K_01078:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY7]] ], [ [[COUNT_01084]], [[WHILE_COND5_PREHEADER]] ] +; CHECK-T1-NEXT: [[SUM_01077:%.*]] = phi i32 [ [[ADD6_I:%.*]], [[WHILE_BODY7]] ], [ 0, [[WHILE_COND5_PREHEADER]] ] +; CHECK-T1-NEXT: [[PY_11076:%.*]] = phi i16* [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY7]] ], [ [[PY_01082]], [[WHILE_COND5_PREHEADER]] ] +; CHECK-T1-NEXT: [[PX_11075:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY7]] ], [ [[PSRCB_PSRCA]], [[WHILE_COND5_PREHEADER]] ] +; CHECK-T1-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PX_11075]], i32 1 +; CHECK-T1-NEXT: [[TMP2:%.*]] = load i16, i16* [[PX_11075]], align 2 +; CHECK-T1-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-T1-NEXT: [[INCDEC_PTR8]] = getelementptr inbounds i16, i16* [[PY_11076]], i32 -1 +; CHECK-T1-NEXT: [[TMP3:%.*]] = load i16, i16* [[PY_11076]], align 2 +; CHECK-T1-NEXT: [[CONV9:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-T1-NEXT: [[MUL_I:%.*]] = mul nsw i32 
[[CONV9]], [[CONV]] +; CHECK-T1-NEXT: [[SHR3_I:%.*]] = ashr i32 [[CONV]], 16 +; CHECK-T1-NEXT: [[SHR4_I:%.*]] = ashr i32 [[CONV9]], 16 +; CHECK-T1-NEXT: [[MUL5_I:%.*]] = mul nsw i32 [[SHR4_I]], [[SHR3_I]] +; CHECK-T1-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[SUM_01077]] +; CHECK-T1-NEXT: [[ADD6_I]] = add i32 [[ADD_I]], [[MUL5_I]] +; CHECK-T1-NEXT: [[DEC]] = add nsw i32 [[K_01078]], -1 +; CHECK-T1-NEXT: [[CMP6:%.*]] = icmp eq i32 [[DEC]], 0 +; CHECK-T1-NEXT: br i1 [[CMP6]], label [[WHILE_END]], label [[WHILE_BODY7]] +; CHECK-T1: while.end: +; CHECK-T1-NEXT: [[ADD6_I_LCSSA:%.*]] = phi i32 [ [[ADD6_I]], [[WHILE_BODY7]] ] +; CHECK-T1-NEXT: [[TMP4:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15 +; CHECK-T1-NEXT: [[CONV10:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-T1-NEXT: [[INCDEC_PTR11]] = getelementptr inbounds i16, i16* [[POUT_01081]], i32 1 +; CHECK-T1-NEXT: store i16 [[CONV10]], i16* [[POUT_01081]], align 2 +; CHECK-T1-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, i16* [[PSRCA_PSRCB]], i32 [[COUNT_01084]] +; CHECK-T1-NEXT: [[INC]] = add nuw nsw i32 [[COUNT_01084]], 1 +; CHECK-T1-NEXT: [[DEC12]] = add i32 [[BLOCKSIZE1_01083]], -1 +; CHECK-T1-NEXT: [[CMP3:%.*]] = icmp ult i32 [[COUNT_01084]], 3 +; CHECK-T1-NEXT: [[CMP4:%.*]] = icmp ne i32 [[DEC12]], 0 +; CHECK-T1-NEXT: [[TMP5:%.*]] = and i1 [[CMP4]], [[CMP3]] +; CHECK-T1-NEXT: br i1 [[TMP5]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]] +; CHECK-T1: while.end13.loopexit: +; CHECK-T1-NEXT: [[INCDEC_PTR11_LCSSA:%.*]] = phi i16* [ [[INCDEC_PTR11]], [[WHILE_END]] ] +; CHECK-T1-NEXT: [[ADD_PTR_LCSSA:%.*]] = phi i16* [ [[ADD_PTR]], [[WHILE_END]] ] +; CHECK-T1-NEXT: [[DEC12_LCSSA:%.*]] = phi i32 [ [[DEC12]], [[WHILE_END]] ] +; CHECK-T1-NEXT: [[TMP6:%.*]] = add nuw nsw i32 [[UMIN]], 2 +; CHECK-T1-NEXT: br label [[WHILE_END13]] +; CHECK-T1: while.end13: +; CHECK-T1-NEXT: [[POUT_0_LCSSA:%.*]] = phi i16* [ [[PDST]], [[ENTRY:%.*]] ], [ [[INCDEC_PTR11_LCSSA]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[PY_0_LCSSA:%.*]] = phi i16* [ [[PSRCA_PSRCB]], [[ENTRY]] ], [ [[ADD_PTR_LCSSA]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[BLOCKSIZE1_0_LCSSA:%.*]] = phi i32 [ [[SUB]], [[ENTRY]] ], [ [[DEC12_LCSSA]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[COUNT_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[TMP6]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[CMP161068:%.*]] = icmp eq i32 [[BLOCKSIZE1_0_LCSSA]], 0 +; CHECK-T1-NEXT: br i1 [[CMP161068]], label [[EXIT:%.*]], label [[WHILE_BODY18_PREHEADER:%.*]] +; CHECK-T1: while.body18.preheader: +; CHECK-T1-NEXT: [[ADD_PTR14:%.*]] = getelementptr inbounds i16, i16* [[PY_0_LCSSA]], i32 -1 +; CHECK-T1-NEXT: br label [[WHILE_BODY18:%.*]] +; CHECK-T1: while.body18: +; CHECK-T1-NEXT: [[COUNT_11072:%.*]] = phi i32 [ [[INC49:%.*]], [[WHILE_END43:%.*]] ], [ [[COUNT_0_LCSSA]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T1-NEXT: [[BLOCKSIZE1_11071:%.*]] = phi i32 [ [[DEC50:%.*]], [[WHILE_END43]] ], [ [[BLOCKSIZE1_0_LCSSA]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T1-NEXT: [[PY_21070:%.*]] = phi i16* [ [[ADD_PTR48:%.*]], [[WHILE_END43]] ], [ [[ADD_PTR14]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T1-NEXT: [[POUT_11069:%.*]] = phi i16* [ [[INCDEC_PTR46:%.*]], [[WHILE_END43]] ], [ [[POUT_0_LCSSA]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T1-NEXT: [[SHR19:%.*]] = lshr i32 [[COUNT_11072]], 2 +; CHECK-T1-NEXT: [[CMP211054:%.*]] = icmp eq i32 [[SHR19]], 0 +; CHECK-T1-NEXT: br i1 [[CMP211054]], label [[WHILE_END31:%.*]], label [[WHILE_BODY23_PREHEADER:%.*]] +; CHECK-T1: while.body23.preheader: +; CHECK-T1-NEXT: br 
label [[WHILE_BODY23:%.*]] +; CHECK-T1: while.body23: +; CHECK-T1-NEXT: [[K_11058:%.*]] = phi i32 [ [[DEC30:%.*]], [[WHILE_BODY23]] ], [ [[SHR19]], [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T1-NEXT: [[SUM_11057:%.*]] = phi i32 [ [[ADD6_I878:%.*]], [[WHILE_BODY23]] ], [ 0, [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T1-NEXT: [[PY_31056:%.*]] = phi i16* [ [[ADD_PTR_I884:%.*]], [[WHILE_BODY23]] ], [ [[PY_21070]], [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T1-NEXT: [[PX_31055:%.*]] = phi i16* [ [[ADD_PTR_I890:%.*]], [[WHILE_BODY23]] ], [ [[PSRCB_PSRCA]], [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T1-NEXT: [[ARRAYIDX_I907:%.*]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 1 +; CHECK-T1-NEXT: [[TMP7:%.*]] = load i16, i16* [[ARRAYIDX_I907]], align 2 +; CHECK-T1-NEXT: [[TMP8:%.*]] = load i16, i16* [[PX_31055]], align 2 +; CHECK-T1-NEXT: [[ADD_PTR_I912:%.*]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 2 +; CHECK-T1-NEXT: [[ARRAYIDX_I901:%.*]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 1 +; CHECK-T1-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX_I901]], align 2 +; CHECK-T1-NEXT: [[TMP10:%.*]] = load i16, i16* [[PY_31056]], align 2 +; CHECK-T1-NEXT: [[ADD_PTR_I906:%.*]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 -2 +; CHECK-T1-NEXT: [[SHR_I892:%.*]] = sext i16 [[TMP8]] to i32 +; CHECK-T1-NEXT: [[SHR1_I893:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-T1-NEXT: [[MUL_I894:%.*]] = mul nsw i32 [[SHR1_I893]], [[SHR_I892]] +; CHECK-T1-NEXT: [[SHR2_I895:%.*]] = sext i16 [[TMP7]] to i32 +; CHECK-T1-NEXT: [[SHR4_I897:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-T1-NEXT: [[MUL5_I898:%.*]] = mul nsw i32 [[SHR4_I897]], [[SHR2_I895]] +; CHECK-T1-NEXT: [[ADD_I899:%.*]] = add i32 [[MUL_I894]], [[SUM_11057]] +; CHECK-T1-NEXT: [[ADD6_I900:%.*]] = add i32 [[ADD_I899]], [[MUL5_I898]] +; CHECK-T1-NEXT: [[ARRAYIDX_I885:%.*]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 3 +; CHECK-T1-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX_I885]], align 2 +; CHECK-T1-NEXT: [[TMP12:%.*]] = load i16, i16* [[ADD_PTR_I912]], align 2 +; CHECK-T1-NEXT: [[ADD_PTR_I890]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 4 +; CHECK-T1-NEXT: [[ARRAYIDX_I879:%.*]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 -1 +; CHECK-T1-NEXT: [[TMP13:%.*]] = load i16, i16* [[ARRAYIDX_I879]], align 2 +; CHECK-T1-NEXT: [[TMP14:%.*]] = load i16, i16* [[ADD_PTR_I906]], align 2 +; CHECK-T1-NEXT: [[ADD_PTR_I884]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 -4 +; CHECK-T1-NEXT: [[SHR_I870:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-T1-NEXT: [[SHR1_I871:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-T1-NEXT: [[MUL_I872:%.*]] = mul nsw i32 [[SHR1_I871]], [[SHR_I870]] +; CHECK-T1-NEXT: [[SHR2_I873:%.*]] = sext i16 [[TMP11]] to i32 +; CHECK-T1-NEXT: [[SHR4_I875:%.*]] = sext i16 [[TMP14]] to i32 +; CHECK-T1-NEXT: [[MUL5_I876:%.*]] = mul nsw i32 [[SHR4_I875]], [[SHR2_I873]] +; CHECK-T1-NEXT: [[ADD_I877:%.*]] = add i32 [[ADD6_I900]], [[MUL_I872]] +; CHECK-T1-NEXT: [[ADD6_I878]] = add i32 [[ADD_I877]], [[MUL5_I876]] +; CHECK-T1-NEXT: [[DEC30]] = add nsw i32 [[K_11058]], -1 +; CHECK-T1-NEXT: [[CMP21:%.*]] = icmp eq i32 [[DEC30]], 0 +; CHECK-T1-NEXT: br i1 [[CMP21]], label [[WHILE_END31_LOOPEXIT:%.*]], label [[WHILE_BODY23]] +; CHECK-T1: while.end31.loopexit: +; CHECK-T1-NEXT: [[ADD_PTR_I890_LCSSA:%.*]] = phi i16* [ [[ADD_PTR_I890]], [[WHILE_BODY23]] ] +; CHECK-T1-NEXT: [[ADD_PTR_I884_LCSSA:%.*]] = phi i16* [ [[ADD_PTR_I884]], [[WHILE_BODY23]] ] +; CHECK-T1-NEXT: [[ADD6_I878_LCSSA:%.*]] = phi i32 [ [[ADD6_I878]], [[WHILE_BODY23]] ] +; 
CHECK-T1-NEXT: br label [[WHILE_END31]] +; CHECK-T1: while.end31: +; CHECK-T1-NEXT: [[PX_3_LCSSA:%.*]] = phi i16* [ [[PSRCB_PSRCA]], [[WHILE_BODY18]] ], [ [[ADD_PTR_I890_LCSSA]], [[WHILE_END31_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[PY_3_LCSSA:%.*]] = phi i16* [ [[PY_21070]], [[WHILE_BODY18]] ], [ [[ADD_PTR_I884_LCSSA]], [[WHILE_END31_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ 0, [[WHILE_BODY18]] ], [ [[ADD6_I878_LCSSA]], [[WHILE_END31_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[REM:%.*]] = and i32 [[COUNT_11072]], 3 +; CHECK-T1-NEXT: [[CMP341062:%.*]] = icmp eq i32 [[REM]], 0 +; CHECK-T1-NEXT: br i1 [[CMP341062]], label [[WHILE_END43]], label [[WHILE_BODY36_PREHEADER:%.*]] +; CHECK-T1: while.body36.preheader: +; CHECK-T1-NEXT: [[ADD_PTR32:%.*]] = getelementptr inbounds i16, i16* [[PY_3_LCSSA]], i32 1 +; CHECK-T1-NEXT: br label [[WHILE_BODY36:%.*]] +; CHECK-T1: while.body36: +; CHECK-T1-NEXT: [[K_21066:%.*]] = phi i32 [ [[DEC42:%.*]], [[WHILE_BODY36]] ], [ [[REM]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T1-NEXT: [[SUM_21065:%.*]] = phi i32 [ [[ADD6_I868:%.*]], [[WHILE_BODY36]] ], [ [[SUM_1_LCSSA]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T1-NEXT: [[PY_41064:%.*]] = phi i16* [ [[INCDEC_PTR39:%.*]], [[WHILE_BODY36]] ], [ [[ADD_PTR32]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T1-NEXT: [[PX_41063:%.*]] = phi i16* [ [[INCDEC_PTR37:%.*]], [[WHILE_BODY36]] ], [ [[PX_3_LCSSA]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T1-NEXT: [[INCDEC_PTR37]] = getelementptr inbounds i16, i16* [[PX_41063]], i32 1 +; CHECK-T1-NEXT: [[TMP15:%.*]] = load i16, i16* [[PX_41063]], align 2 +; CHECK-T1-NEXT: [[CONV38:%.*]] = sext i16 [[TMP15]] to i32 +; CHECK-T1-NEXT: [[INCDEC_PTR39]] = getelementptr inbounds i16, i16* [[PY_41064]], i32 -1 +; CHECK-T1-NEXT: [[TMP16:%.*]] = load i16, i16* [[PY_41064]], align 2 +; CHECK-T1-NEXT: [[CONV40:%.*]] = sext i16 [[TMP16]] to i32 +; CHECK-T1-NEXT: [[MUL_I863:%.*]] = mul nsw i32 [[CONV40]], [[CONV38]] +; CHECK-T1-NEXT: [[SHR3_I864:%.*]] = ashr i32 [[CONV38]], 16 +; CHECK-T1-NEXT: [[SHR4_I865:%.*]] = ashr i32 [[CONV40]], 16 +; CHECK-T1-NEXT: [[MUL5_I866:%.*]] = mul nsw i32 [[SHR4_I865]], [[SHR3_I864]] +; CHECK-T1-NEXT: [[ADD_I867:%.*]] = add i32 [[MUL_I863]], [[SUM_21065]] +; CHECK-T1-NEXT: [[ADD6_I868]] = add i32 [[ADD_I867]], [[MUL5_I866]] +; CHECK-T1-NEXT: [[DEC42]] = add nsw i32 [[K_21066]], -1 +; CHECK-T1-NEXT: [[CMP34:%.*]] = icmp eq i32 [[DEC42]], 0 +; CHECK-T1-NEXT: br i1 [[CMP34]], label [[WHILE_END43_LOOPEXIT:%.*]], label [[WHILE_BODY36]] +; CHECK-T1: while.end43.loopexit: +; CHECK-T1-NEXT: [[ADD6_I868_LCSSA:%.*]] = phi i32 [ [[ADD6_I868]], [[WHILE_BODY36]] ] +; CHECK-T1-NEXT: br label [[WHILE_END43]] +; CHECK-T1: while.end43: +; CHECK-T1-NEXT: [[SUM_2_LCSSA:%.*]] = phi i32 [ [[SUM_1_LCSSA]], [[WHILE_END31]] ], [ [[ADD6_I868_LCSSA]], [[WHILE_END43_LOOPEXIT]] ] +; CHECK-T1-NEXT: [[TMP17:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15 +; CHECK-T1-NEXT: [[CONV45:%.*]] = trunc i32 [[TMP17]] to i16 +; CHECK-T1-NEXT: [[INCDEC_PTR46]] = getelementptr inbounds i16, i16* [[POUT_11069]], i32 1 +; CHECK-T1-NEXT: store i16 [[CONV45]], i16* [[POUT_11069]], align 2 +; CHECK-T1-NEXT: [[SUB47:%.*]] = add i32 [[COUNT_11072]], -1 +; CHECK-T1-NEXT: [[ADD_PTR48]] = getelementptr inbounds i16, i16* [[PSRCA_PSRCB]], i32 [[SUB47]] +; CHECK-T1-NEXT: [[INC49]] = add i32 [[COUNT_11072]], 1 +; CHECK-T1-NEXT: [[DEC50]] = add i32 [[BLOCKSIZE1_11071]], -1 +; CHECK-T1-NEXT: [[CMP16:%.*]] = icmp eq i32 [[DEC50]], 0 +; CHECK-T1-NEXT: br i1 [[CMP16]], label [[EXIT_LOOPEXIT:%.*]], label [[WHILE_BODY18]] +; CHECK-T1: 
exit.loopexit: +; CHECK-T1-NEXT: br label [[EXIT]] +; CHECK-T1: exit: +; CHECK-T1-NEXT: ret void +; +; CHECK-T2-LABEL: @arm_conv_fast_q15( +; CHECK-T2-NEXT: entry: +; CHECK-T2-NEXT: [[CMP:%.*]] = icmp ult i32 [[SRCALEN:%.*]], [[SRCBLEN:%.*]] +; CHECK-T2-NEXT: [[SRCALEN_SRCBLEN:%.*]] = select i1 [[CMP]], i32 [[SRCALEN]], i32 [[SRCBLEN]] +; CHECK-T2-NEXT: [[PSRCB_PSRCA:%.*]] = select i1 [[CMP]], i16* [[PSRCB:%.*]], i16* [[PSRCA:%.*]] +; CHECK-T2-NEXT: [[PSRCA_PSRCB:%.*]] = select i1 [[CMP]], i16* [[PSRCA]], i16* [[PSRCB]] +; CHECK-T2-NEXT: [[SUB:%.*]] = add i32 [[SRCALEN_SRCBLEN]], -1 +; CHECK-T2-NEXT: [[CMP41080:%.*]] = icmp eq i32 [[SUB]], 0 +; CHECK-T2-NEXT: br i1 [[CMP41080]], label [[WHILE_END13:%.*]], label [[WHILE_COND5_PREHEADER_PREHEADER:%.*]] +; CHECK-T2: while.cond5.preheader.preheader: +; CHECK-T2-NEXT: [[TMP0:%.*]] = add i32 [[SRCALEN_SRCBLEN]], -2 +; CHECK-T2-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 2 +; CHECK-T2-NEXT: [[UMIN:%.*]] = select i1 [[TMP1]], i32 [[TMP0]], i32 2 +; CHECK-T2-NEXT: br label [[WHILE_COND5_PREHEADER:%.*]] +; CHECK-T2: while.cond5.preheader: +; CHECK-T2-NEXT: [[COUNT_01084:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_END:%.*]] ], [ 1, [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T2-NEXT: [[BLOCKSIZE1_01083:%.*]] = phi i32 [ [[DEC12:%.*]], [[WHILE_END]] ], [ [[SUB]], [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T2-NEXT: [[PY_01082:%.*]] = phi i16* [ [[ADD_PTR:%.*]], [[WHILE_END]] ], [ [[PSRCA_PSRCB]], [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T2-NEXT: [[POUT_01081:%.*]] = phi i16* [ [[INCDEC_PTR11:%.*]], [[WHILE_END]] ], [ [[PDST:%.*]], [[WHILE_COND5_PREHEADER_PREHEADER]] ] +; CHECK-T2-NEXT: br label [[WHILE_BODY7:%.*]] +; CHECK-T2: while.body7: +; CHECK-T2-NEXT: [[K_01078:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY7]] ], [ [[COUNT_01084]], [[WHILE_COND5_PREHEADER]] ] +; CHECK-T2-NEXT: [[SUM_01077:%.*]] = phi i32 [ [[ADD6_I:%.*]], [[WHILE_BODY7]] ], [ 0, [[WHILE_COND5_PREHEADER]] ] +; CHECK-T2-NEXT: [[PY_11076:%.*]] = phi i16* [ [[INCDEC_PTR8:%.*]], [[WHILE_BODY7]] ], [ [[PY_01082]], [[WHILE_COND5_PREHEADER]] ] +; CHECK-T2-NEXT: [[PX_11075:%.*]] = phi i16* [ [[INCDEC_PTR:%.*]], [[WHILE_BODY7]] ], [ [[PSRCB_PSRCA]], [[WHILE_COND5_PREHEADER]] ] +; CHECK-T2-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, i16* [[PX_11075]], i32 1 +; CHECK-T2-NEXT: [[TMP2:%.*]] = load i16, i16* [[PX_11075]], align 2 +; CHECK-T2-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 +; CHECK-T2-NEXT: [[INCDEC_PTR8]] = getelementptr inbounds i16, i16* [[PY_11076]], i32 -1 +; CHECK-T2-NEXT: [[TMP3:%.*]] = load i16, i16* [[PY_11076]], align 2 +; CHECK-T2-NEXT: [[CONV9:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-T2-NEXT: [[MUL_I:%.*]] = mul nsw i32 [[CONV9]], [[CONV]] +; CHECK-T2-NEXT: [[SHR3_I:%.*]] = ashr i32 [[CONV]], 16 +; CHECK-T2-NEXT: [[SHR4_I:%.*]] = ashr i32 [[CONV9]], 16 +; CHECK-T2-NEXT: [[MUL5_I:%.*]] = mul nsw i32 [[SHR4_I]], [[SHR3_I]] +; CHECK-T2-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[SUM_01077]] +; CHECK-T2-NEXT: [[ADD6_I]] = add i32 [[ADD_I]], [[MUL5_I]] +; CHECK-T2-NEXT: [[DEC]] = add nsw i32 [[K_01078]], -1 +; CHECK-T2-NEXT: [[CMP6:%.*]] = icmp eq i32 [[DEC]], 0 +; CHECK-T2-NEXT: br i1 [[CMP6]], label [[WHILE_END]], label [[WHILE_BODY7]] +; CHECK-T2: while.end: +; CHECK-T2-NEXT: [[ADD6_I_LCSSA:%.*]] = phi i32 [ [[ADD6_I]], [[WHILE_BODY7]] ] +; CHECK-T2-NEXT: [[TMP4:%.*]] = lshr i32 [[ADD6_I_LCSSA]], 15 +; CHECK-T2-NEXT: [[CONV10:%.*]] = trunc i32 [[TMP4]] to i16 +; CHECK-T2-NEXT: [[INCDEC_PTR11]] = getelementptr inbounds i16, i16* [[POUT_01081]], i32 1 +; 
CHECK-T2-NEXT: store i16 [[CONV10]], i16* [[POUT_01081]], align 2 +; CHECK-T2-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, i16* [[PSRCA_PSRCB]], i32 [[COUNT_01084]] +; CHECK-T2-NEXT: [[INC]] = add nuw nsw i32 [[COUNT_01084]], 1 +; CHECK-T2-NEXT: [[DEC12]] = add i32 [[BLOCKSIZE1_01083]], -1 +; CHECK-T2-NEXT: [[CMP3:%.*]] = icmp ult i32 [[COUNT_01084]], 3 +; CHECK-T2-NEXT: [[CMP4:%.*]] = icmp ne i32 [[DEC12]], 0 +; CHECK-T2-NEXT: [[TMP5:%.*]] = and i1 [[CMP4]], [[CMP3]] +; CHECK-T2-NEXT: br i1 [[TMP5]], label [[WHILE_COND5_PREHEADER]], label [[WHILE_END13_LOOPEXIT:%.*]] +; CHECK-T2: while.end13.loopexit: +; CHECK-T2-NEXT: [[INCDEC_PTR11_LCSSA:%.*]] = phi i16* [ [[INCDEC_PTR11]], [[WHILE_END]] ] +; CHECK-T2-NEXT: [[ADD_PTR_LCSSA:%.*]] = phi i16* [ [[ADD_PTR]], [[WHILE_END]] ] +; CHECK-T2-NEXT: [[DEC12_LCSSA:%.*]] = phi i32 [ [[DEC12]], [[WHILE_END]] ] +; CHECK-T2-NEXT: [[TMP6:%.*]] = add nuw nsw i32 [[UMIN]], 2 +; CHECK-T2-NEXT: br label [[WHILE_END13]] +; CHECK-T2: while.end13: +; CHECK-T2-NEXT: [[POUT_0_LCSSA:%.*]] = phi i16* [ [[PDST]], [[ENTRY:%.*]] ], [ [[INCDEC_PTR11_LCSSA]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[PY_0_LCSSA:%.*]] = phi i16* [ [[PSRCA_PSRCB]], [[ENTRY]] ], [ [[ADD_PTR_LCSSA]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[BLOCKSIZE1_0_LCSSA:%.*]] = phi i32 [ [[SUB]], [[ENTRY]] ], [ [[DEC12_LCSSA]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[COUNT_0_LCSSA:%.*]] = phi i32 [ 1, [[ENTRY]] ], [ [[TMP6]], [[WHILE_END13_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[CMP161068:%.*]] = icmp eq i32 [[BLOCKSIZE1_0_LCSSA]], 0 +; CHECK-T2-NEXT: br i1 [[CMP161068]], label [[EXIT:%.*]], label [[WHILE_BODY18_PREHEADER:%.*]] +; CHECK-T2: while.body18.preheader: +; CHECK-T2-NEXT: [[ADD_PTR14:%.*]] = getelementptr inbounds i16, i16* [[PY_0_LCSSA]], i32 -1 +; CHECK-T2-NEXT: br label [[WHILE_BODY18:%.*]] +; CHECK-T2: while.body18: +; CHECK-T2-NEXT: [[COUNT_11072:%.*]] = phi i32 [ [[INC49:%.*]], [[WHILE_END43:%.*]] ], [ [[COUNT_0_LCSSA]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T2-NEXT: [[BLOCKSIZE1_11071:%.*]] = phi i32 [ [[DEC50:%.*]], [[WHILE_END43]] ], [ [[BLOCKSIZE1_0_LCSSA]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T2-NEXT: [[PY_21070:%.*]] = phi i16* [ [[ADD_PTR48:%.*]], [[WHILE_END43]] ], [ [[ADD_PTR14]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T2-NEXT: [[POUT_11069:%.*]] = phi i16* [ [[INCDEC_PTR46:%.*]], [[WHILE_END43]] ], [ [[POUT_0_LCSSA]], [[WHILE_BODY18_PREHEADER]] ] +; CHECK-T2-NEXT: [[SHR19:%.*]] = lshr i32 [[COUNT_11072]], 2 +; CHECK-T2-NEXT: [[CMP211054:%.*]] = icmp eq i32 [[SHR19]], 0 +; CHECK-T2-NEXT: br i1 [[CMP211054]], label [[WHILE_END31:%.*]], label [[WHILE_BODY23_PREHEADER:%.*]] +; CHECK-T2: while.body23.preheader: +; CHECK-T2-NEXT: br label [[WHILE_BODY23:%.*]] +; CHECK-T2: while.body23: +; CHECK-T2-NEXT: [[K_11058:%.*]] = phi i32 [ [[DEC30:%.*]], [[WHILE_BODY23]] ], [ [[SHR19]], [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T2-NEXT: [[SUM_11057:%.*]] = phi i32 [ [[ADD6_I878:%.*]], [[WHILE_BODY23]] ], [ 0, [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T2-NEXT: [[PY_31056:%.*]] = phi i16* [ [[ADD_PTR_I884:%.*]], [[WHILE_BODY23]] ], [ [[PY_21070]], [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T2-NEXT: [[PX_31055:%.*]] = phi i16* [ [[ADD_PTR_I890:%.*]], [[WHILE_BODY23]] ], [ [[PSRCB_PSRCA]], [[WHILE_BODY23_PREHEADER]] ] +; CHECK-T2-NEXT: [[ARRAYIDX_I907:%.*]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 1 +; CHECK-T2-NEXT: [[TMP7:%.*]] = load i16, i16* [[ARRAYIDX_I907]], align 2 +; CHECK-T2-NEXT: [[TMP8:%.*]] = load i16, i16* [[PX_31055]], align 2 +; CHECK-T2-NEXT: [[ADD_PTR_I912:%.*]] 
= getelementptr inbounds i16, i16* [[PX_31055]], i32 2 +; CHECK-T2-NEXT: [[ARRAYIDX_I901:%.*]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 1 +; CHECK-T2-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX_I901]], align 2 +; CHECK-T2-NEXT: [[TMP10:%.*]] = load i16, i16* [[PY_31056]], align 2 +; CHECK-T2-NEXT: [[ADD_PTR_I906:%.*]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 -2 +; CHECK-T2-NEXT: [[SHR_I892:%.*]] = sext i16 [[TMP8]] to i32 +; CHECK-T2-NEXT: [[SHR1_I893:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-T2-NEXT: [[MUL_I894:%.*]] = mul nsw i32 [[SHR1_I893]], [[SHR_I892]] +; CHECK-T2-NEXT: [[SHR2_I895:%.*]] = sext i16 [[TMP7]] to i32 +; CHECK-T2-NEXT: [[SHR4_I897:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-T2-NEXT: [[MUL5_I898:%.*]] = mul nsw i32 [[SHR4_I897]], [[SHR2_I895]] +; CHECK-T2-NEXT: [[ADD_I899:%.*]] = add i32 [[MUL_I894]], [[SUM_11057]] +; CHECK-T2-NEXT: [[ADD6_I900:%.*]] = add i32 [[ADD_I899]], [[MUL5_I898]] +; CHECK-T2-NEXT: [[ARRAYIDX_I885:%.*]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 3 +; CHECK-T2-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX_I885]], align 2 +; CHECK-T2-NEXT: [[TMP12:%.*]] = load i16, i16* [[ADD_PTR_I912]], align 2 +; CHECK-T2-NEXT: [[ADD_PTR_I890]] = getelementptr inbounds i16, i16* [[PX_31055]], i32 4 +; CHECK-T2-NEXT: [[ARRAYIDX_I879:%.*]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 -1 +; CHECK-T2-NEXT: [[TMP13:%.*]] = load i16, i16* [[ARRAYIDX_I879]], align 2 +; CHECK-T2-NEXT: [[TMP14:%.*]] = load i16, i16* [[ADD_PTR_I906]], align 2 +; CHECK-T2-NEXT: [[ADD_PTR_I884]] = getelementptr inbounds i16, i16* [[PY_31056]], i32 -4 +; CHECK-T2-NEXT: [[SHR_I870:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-T2-NEXT: [[SHR1_I871:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-T2-NEXT: [[MUL_I872:%.*]] = mul nsw i32 [[SHR1_I871]], [[SHR_I870]] +; CHECK-T2-NEXT: [[SHR2_I873:%.*]] = sext i16 [[TMP11]] to i32 +; CHECK-T2-NEXT: [[SHR4_I875:%.*]] = sext i16 [[TMP14]] to i32 +; CHECK-T2-NEXT: [[MUL5_I876:%.*]] = mul nsw i32 [[SHR4_I875]], [[SHR2_I873]] +; CHECK-T2-NEXT: [[ADD_I877:%.*]] = add i32 [[ADD6_I900]], [[MUL_I872]] +; CHECK-T2-NEXT: [[ADD6_I878]] = add i32 [[ADD_I877]], [[MUL5_I876]] +; CHECK-T2-NEXT: [[DEC30]] = add nsw i32 [[K_11058]], -1 +; CHECK-T2-NEXT: [[CMP21:%.*]] = icmp eq i32 [[DEC30]], 0 +; CHECK-T2-NEXT: br i1 [[CMP21]], label [[WHILE_END31_LOOPEXIT:%.*]], label [[WHILE_BODY23]] +; CHECK-T2: while.end31.loopexit: +; CHECK-T2-NEXT: [[ADD_PTR_I890_LCSSA:%.*]] = phi i16* [ [[ADD_PTR_I890]], [[WHILE_BODY23]] ] +; CHECK-T2-NEXT: [[ADD_PTR_I884_LCSSA:%.*]] = phi i16* [ [[ADD_PTR_I884]], [[WHILE_BODY23]] ] +; CHECK-T2-NEXT: [[ADD6_I878_LCSSA:%.*]] = phi i32 [ [[ADD6_I878]], [[WHILE_BODY23]] ] +; CHECK-T2-NEXT: br label [[WHILE_END31]] +; CHECK-T2: while.end31: +; CHECK-T2-NEXT: [[PX_3_LCSSA:%.*]] = phi i16* [ [[PSRCB_PSRCA]], [[WHILE_BODY18]] ], [ [[ADD_PTR_I890_LCSSA]], [[WHILE_END31_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[PY_3_LCSSA:%.*]] = phi i16* [ [[PY_21070]], [[WHILE_BODY18]] ], [ [[ADD_PTR_I884_LCSSA]], [[WHILE_END31_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ 0, [[WHILE_BODY18]] ], [ [[ADD6_I878_LCSSA]], [[WHILE_END31_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[REM:%.*]] = and i32 [[COUNT_11072]], 3 +; CHECK-T2-NEXT: [[CMP341062:%.*]] = icmp eq i32 [[REM]], 0 +; CHECK-T2-NEXT: br i1 [[CMP341062]], label [[WHILE_END43]], label [[WHILE_BODY36_PREHEADER:%.*]] +; CHECK-T2: while.body36.preheader: +; CHECK-T2-NEXT: [[ADD_PTR32:%.*]] = getelementptr inbounds i16, i16* [[PY_3_LCSSA]], i32 1 +; CHECK-T2-NEXT: br label 
[[WHILE_BODY36:%.*]] +; CHECK-T2: while.body36: +; CHECK-T2-NEXT: [[K_21066:%.*]] = phi i32 [ [[DEC42:%.*]], [[WHILE_BODY36]] ], [ [[REM]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T2-NEXT: [[SUM_21065:%.*]] = phi i32 [ [[ADD6_I868:%.*]], [[WHILE_BODY36]] ], [ [[SUM_1_LCSSA]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T2-NEXT: [[PY_41064:%.*]] = phi i16* [ [[INCDEC_PTR39:%.*]], [[WHILE_BODY36]] ], [ [[ADD_PTR32]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T2-NEXT: [[PX_41063:%.*]] = phi i16* [ [[INCDEC_PTR37:%.*]], [[WHILE_BODY36]] ], [ [[PX_3_LCSSA]], [[WHILE_BODY36_PREHEADER]] ] +; CHECK-T2-NEXT: [[INCDEC_PTR37]] = getelementptr inbounds i16, i16* [[PX_41063]], i32 1 +; CHECK-T2-NEXT: [[TMP15:%.*]] = load i16, i16* [[PX_41063]], align 2 +; CHECK-T2-NEXT: [[CONV38:%.*]] = sext i16 [[TMP15]] to i32 +; CHECK-T2-NEXT: [[INCDEC_PTR39]] = getelementptr inbounds i16, i16* [[PY_41064]], i32 -1 +; CHECK-T2-NEXT: [[TMP16:%.*]] = load i16, i16* [[PY_41064]], align 2 +; CHECK-T2-NEXT: [[CONV40:%.*]] = sext i16 [[TMP16]] to i32 +; CHECK-T2-NEXT: [[MUL_I863:%.*]] = mul nsw i32 [[CONV40]], [[CONV38]] +; CHECK-T2-NEXT: [[SHR3_I864:%.*]] = ashr i32 [[CONV38]], 16 +; CHECK-T2-NEXT: [[SHR4_I865:%.*]] = ashr i32 [[CONV40]], 16 +; CHECK-T2-NEXT: [[MUL5_I866:%.*]] = mul nsw i32 [[SHR4_I865]], [[SHR3_I864]] +; CHECK-T2-NEXT: [[ADD_I867:%.*]] = add i32 [[MUL_I863]], [[SUM_21065]] +; CHECK-T2-NEXT: [[ADD6_I868]] = add i32 [[ADD_I867]], [[MUL5_I866]] +; CHECK-T2-NEXT: [[DEC42]] = add nsw i32 [[K_21066]], -1 +; CHECK-T2-NEXT: [[CMP34:%.*]] = icmp eq i32 [[DEC42]], 0 +; CHECK-T2-NEXT: br i1 [[CMP34]], label [[WHILE_END43_LOOPEXIT:%.*]], label [[WHILE_BODY36]] +; CHECK-T2: while.end43.loopexit: +; CHECK-T2-NEXT: [[ADD6_I868_LCSSA:%.*]] = phi i32 [ [[ADD6_I868]], [[WHILE_BODY36]] ] +; CHECK-T2-NEXT: br label [[WHILE_END43]] +; CHECK-T2: while.end43: +; CHECK-T2-NEXT: [[SUM_2_LCSSA:%.*]] = phi i32 [ [[SUM_1_LCSSA]], [[WHILE_END31]] ], [ [[ADD6_I868_LCSSA]], [[WHILE_END43_LOOPEXIT]] ] +; CHECK-T2-NEXT: [[TMP17:%.*]] = lshr i32 [[SUM_2_LCSSA]], 15 +; CHECK-T2-NEXT: [[CONV45:%.*]] = trunc i32 [[TMP17]] to i16 +; CHECK-T2-NEXT: [[INCDEC_PTR46]] = getelementptr inbounds i16, i16* [[POUT_11069]], i32 1 +; CHECK-T2-NEXT: store i16 [[CONV45]], i16* [[POUT_11069]], align 2 +; CHECK-T2-NEXT: [[SUB47:%.*]] = add i32 [[COUNT_11072]], -1 +; CHECK-T2-NEXT: [[ADD_PTR48]] = getelementptr inbounds i16, i16* [[PSRCA_PSRCB]], i32 [[SUB47]] +; CHECK-T2-NEXT: [[INC49]] = add i32 [[COUNT_11072]], 1 +; CHECK-T2-NEXT: [[DEC50]] = add i32 [[BLOCKSIZE1_11071]], -1 +; CHECK-T2-NEXT: [[CMP16:%.*]] = icmp eq i32 [[DEC50]], 0 +; CHECK-T2-NEXT: br i1 [[CMP16]], label [[EXIT_LOOPEXIT:%.*]], label [[WHILE_BODY18]] +; CHECK-T2: exit.loopexit: +; CHECK-T2-NEXT: br label [[EXIT]] +; CHECK-T2: exit: +; CHECK-T2-NEXT: ret void +; +entry: + %cmp = icmp ult i32 %srcALen, %srcBLen + %srcALen.srcBLen = select i1 %cmp, i32 %srcALen, i32 %srcBLen + %pSrcB.pSrcA = select i1 %cmp, i16* %pSrcB, i16* %pSrcA + %pSrcA.pSrcB = select i1 %cmp, i16* %pSrcA, i16* %pSrcB + %sub = add i32 %srcALen.srcBLen, -1 + %cmp41080 = icmp eq i32 %sub, 0 + br i1 %cmp41080, label %while.end13, label %while.cond5.preheader + +while.cond5.preheader: ; preds = %while.end, %entry + %count.01084 = phi i32 [ %inc, %while.end ], [ 1, %entry ] + %blockSize1.01083 = phi i32 [ %dec12, %while.end ], [ %sub, %entry ] + %py.01082 = phi i16* [ %add.ptr, %while.end ], [ %pSrcA.pSrcB, %entry ] + %pOut.01081 = phi i16* [ %incdec.ptr11, %while.end ], [ %pDst, %entry ] + br label %while.body7 + +while.body7: ; 
preds = %while.body7, %while.cond5.preheader + %k.01078 = phi i32 [ %dec, %while.body7 ], [ %count.01084, %while.cond5.preheader ] + %sum.01077 = phi i32 [ %add6.i, %while.body7 ], [ 0, %while.cond5.preheader ] + %py.11076 = phi i16* [ %incdec.ptr8, %while.body7 ], [ %py.01082, %while.cond5.preheader ] + %px.11075 = phi i16* [ %incdec.ptr, %while.body7 ], [ %pSrcB.pSrcA, %while.cond5.preheader ] + %incdec.ptr = getelementptr inbounds i16, i16* %px.11075, i32 1 + %0 = load i16, i16* %px.11075, align 2 + %conv = sext i16 %0 to i32 + %incdec.ptr8 = getelementptr inbounds i16, i16* %py.11076, i32 -1 + %1 = load i16, i16* %py.11076, align 2 + %conv9 = sext i16 %1 to i32 + %mul.i = mul nsw i32 %conv9, %conv + %shr3.i = ashr i32 %conv, 16 + %shr4.i = ashr i32 %conv9, 16 + %mul5.i = mul nsw i32 %shr4.i, %shr3.i + %add.i = add i32 %mul.i, %sum.01077 + %add6.i = add i32 %add.i, %mul5.i + %dec = add nsw i32 %k.01078, -1 + %cmp6 = icmp eq i32 %dec, 0 + br i1 %cmp6, label %while.end, label %while.body7 + +while.end: ; preds = %while.body7 + %2 = lshr i32 %add6.i, 15 + %conv10 = trunc i32 %2 to i16 + %incdec.ptr11 = getelementptr inbounds i16, i16* %pOut.01081, i32 1 + store i16 %conv10, i16* %pOut.01081, align 2 + %add.ptr = getelementptr inbounds i16, i16* %pSrcA.pSrcB, i32 %count.01084 + %inc = add nuw nsw i32 %count.01084, 1 + %dec12 = add i32 %blockSize1.01083, -1 + %cmp3 = icmp ult i32 %count.01084, 3 + %cmp4 = icmp ne i32 %dec12, 0 + %3 = and i1 %cmp4, %cmp3 + br i1 %3, label %while.cond5.preheader, label %while.end13 + +while.end13: ; preds = %while.end, %entry + %pOut.0.lcssa = phi i16* [ %pDst, %entry ], [ %incdec.ptr11, %while.end ] + %py.0.lcssa = phi i16* [ %pSrcA.pSrcB, %entry ], [ %add.ptr, %while.end ] + %blockSize1.0.lcssa = phi i32 [ %sub, %entry ], [ %dec12, %while.end ] + %count.0.lcssa = phi i32 [ 1, %entry ], [ %inc, %while.end ] + %cmp161068 = icmp eq i32 %blockSize1.0.lcssa, 0 + br i1 %cmp161068, label %exit, label %while.body18.preheader + +while.body18.preheader: ; preds = %while.end13 + %add.ptr14 = getelementptr inbounds i16, i16* %py.0.lcssa, i32 -1 + br label %while.body18 + +while.body18: ; preds = %while.end43, %while.body18.preheader + %count.11072 = phi i32 [ %inc49, %while.end43 ], [ %count.0.lcssa, %while.body18.preheader ] + %blockSize1.11071 = phi i32 [ %dec50, %while.end43 ], [ %blockSize1.0.lcssa, %while.body18.preheader ] + %py.21070 = phi i16* [ %add.ptr48, %while.end43 ], [ %add.ptr14, %while.body18.preheader ] + %pOut.11069 = phi i16* [ %incdec.ptr46, %while.end43 ], [ %pOut.0.lcssa, %while.body18.preheader ] + %shr19 = lshr i32 %count.11072, 2 + %cmp211054 = icmp eq i32 %shr19, 0 + br i1 %cmp211054, label %while.end31, label %while.body23 + +while.body23: ; preds = %while.body23, %while.body18 + %k.11058 = phi i32 [ %dec30, %while.body23 ], [ %shr19, %while.body18 ] + %sum.11057 = phi i32 [ %add6.i878, %while.body23 ], [ 0, %while.body18 ] + %py.31056 = phi i16* [ %add.ptr.i884, %while.body23 ], [ %py.21070, %while.body18 ] + %px.31055 = phi i16* [ %add.ptr.i890, %while.body23 ], [ %pSrcB.pSrcA, %while.body18 ] + %arrayidx.i907 = getelementptr inbounds i16, i16* %px.31055, i32 1 + %4 = load i16, i16* %arrayidx.i907, align 2 + %5 = load i16, i16* %px.31055, align 2 + %add.ptr.i912 = getelementptr inbounds i16, i16* %px.31055, i32 2 + %arrayidx.i901 = getelementptr inbounds i16, i16* %py.31056, i32 1 + %6 = load i16, i16* %arrayidx.i901, align 2 + %7 = load i16, i16* %py.31056, align 2 + %add.ptr.i906 = getelementptr inbounds i16, i16* %py.31056, i32 -2 + 
%shr.i892 = sext i16 %5 to i32 + %shr1.i893 = sext i16 %6 to i32 + %mul.i894 = mul nsw i32 %shr1.i893, %shr.i892 + %shr2.i895 = sext i16 %4 to i32 + %shr4.i897 = sext i16 %7 to i32 + %mul5.i898 = mul nsw i32 %shr4.i897, %shr2.i895 + %add.i899 = add i32 %mul.i894, %sum.11057 + %add6.i900 = add i32 %add.i899, %mul5.i898 + %arrayidx.i885 = getelementptr inbounds i16, i16* %px.31055, i32 3 + %8 = load i16, i16* %arrayidx.i885, align 2 + %9 = load i16, i16* %add.ptr.i912, align 2 + %add.ptr.i890 = getelementptr inbounds i16, i16* %px.31055, i32 4 + %arrayidx.i879 = getelementptr inbounds i16, i16* %py.31056, i32 -1 + %10 = load i16, i16* %arrayidx.i879, align 2 + %11 = load i16, i16* %add.ptr.i906, align 2 + %add.ptr.i884 = getelementptr inbounds i16, i16* %py.31056, i32 -4 + %shr.i870 = sext i16 %9 to i32 + %shr1.i871 = sext i16 %10 to i32 + %mul.i872 = mul nsw i32 %shr1.i871, %shr.i870 + %shr2.i873 = sext i16 %8 to i32 + %shr4.i875 = sext i16 %11 to i32 + %mul5.i876 = mul nsw i32 %shr4.i875, %shr2.i873 + %add.i877 = add i32 %add6.i900, %mul.i872 + %add6.i878 = add i32 %add.i877, %mul5.i876 + %dec30 = add nsw i32 %k.11058, -1 + %cmp21 = icmp eq i32 %dec30, 0 + br i1 %cmp21, label %while.end31, label %while.body23 + +while.end31: ; preds = %while.body23, %while.body18 + %px.3.lcssa = phi i16* [ %pSrcB.pSrcA, %while.body18 ], [ %add.ptr.i890, %while.body23 ] + %py.3.lcssa = phi i16* [ %py.21070, %while.body18 ], [ %add.ptr.i884, %while.body23 ] + %sum.1.lcssa = phi i32 [ 0, %while.body18 ], [ %add6.i878, %while.body23 ] + %rem = and i32 %count.11072, 3 + %cmp341062 = icmp eq i32 %rem, 0 + br i1 %cmp341062, label %while.end43, label %while.body36.preheader + +while.body36.preheader: ; preds = %while.end31 + %add.ptr32 = getelementptr inbounds i16, i16* %py.3.lcssa, i32 1 + br label %while.body36 + +while.body36: ; preds = %while.body36, %while.body36.preheader + %k.21066 = phi i32 [ %dec42, %while.body36 ], [ %rem, %while.body36.preheader ] + %sum.21065 = phi i32 [ %add6.i868, %while.body36 ], [ %sum.1.lcssa, %while.body36.preheader ] + %py.41064 = phi i16* [ %incdec.ptr39, %while.body36 ], [ %add.ptr32, %while.body36.preheader ] + %px.41063 = phi i16* [ %incdec.ptr37, %while.body36 ], [ %px.3.lcssa, %while.body36.preheader ] + %incdec.ptr37 = getelementptr inbounds i16, i16* %px.41063, i32 1 + %12 = load i16, i16* %px.41063, align 2 + %conv38 = sext i16 %12 to i32 + %incdec.ptr39 = getelementptr inbounds i16, i16* %py.41064, i32 -1 + %13 = load i16, i16* %py.41064, align 2 + %conv40 = sext i16 %13 to i32 + %mul.i863 = mul nsw i32 %conv40, %conv38 + %shr3.i864 = ashr i32 %conv38, 16 + %shr4.i865 = ashr i32 %conv40, 16 + %mul5.i866 = mul nsw i32 %shr4.i865, %shr3.i864 + %add.i867 = add i32 %mul.i863, %sum.21065 + %add6.i868 = add i32 %add.i867, %mul5.i866 + %dec42 = add nsw i32 %k.21066, -1 + %cmp34 = icmp eq i32 %dec42, 0 + br i1 %cmp34, label %while.end43, label %while.body36 + +while.end43: ; preds = %while.body36, %while.end31 + %sum.2.lcssa = phi i32 [ %sum.1.lcssa, %while.end31 ], [ %add6.i868, %while.body36 ] + %14 = lshr i32 %sum.2.lcssa, 15 + %conv45 = trunc i32 %14 to i16 + %incdec.ptr46 = getelementptr inbounds i16, i16* %pOut.11069, i32 1 + store i16 %conv45, i16* %pOut.11069, align 2 + %sub47 = add i32 %count.11072, -1 + %add.ptr48 = getelementptr inbounds i16, i16* %pSrcA.pSrcB, i32 %sub47 + %inc49 = add i32 %count.11072, 1 + %dec50 = add i32 %blockSize1.11071, -1 + %cmp16 = icmp eq i32 %dec50, 0 + br i1 %cmp16, label %exit, label %while.body18 + +exit: ; preds = %while.end43, 
%while.end13 + ret void +} diff --git a/llvm/test/CodeGen/ARM/indvar-unroll-imm-cost.ll b/llvm/test/CodeGen/ARM/indvar-unroll-imm-cost.ll new file mode 100644 index 0000000000000..36749a03553ea --- /dev/null +++ b/llvm/test/CodeGen/ARM/indvar-unroll-imm-cost.ll @@ -0,0 +1,578 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -indvars -loop-unroll -mtriple=thumbv8m.main %s -S -o - | FileCheck %s + +define dso_local arm_aapcscc void @test(i32* nocapture %pDest, i16* nocapture readonly %pSrcA, i16* nocapture readonly %pSrcB, i32 %blkCnt) local_unnamed_addr #0 { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP88:%.*]] = icmp eq i32 [[BLKCNT:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP88]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_092:%.*]] = phi i32 [ [[INC42:%.*]], [[FOR_END40:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[PDEST_ADDR_091:%.*]] = phi i32* [ [[PDEST_ADDR_2_LCSSA:%.*]], [[FOR_END40]] ], [ [[PDEST:%.*]], [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[PSRCA_ADDR_090:%.*]] = phi i16* [ [[PSRCA_ADDR_2_LCSSA:%.*]], [[FOR_END40]] ], [ [[PSRCA:%.*]], [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[PSRCB_ADDR_089:%.*]] = phi i16* [ [[PSRCB_ADDR_2_LCSSA:%.*]], [[FOR_END40]] ], [ [[PSRCB:%.*]], [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[TMP0:%.*]] = lshr i32 [[I_092]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i32 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[I_092]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], 2147483644 +; CHECK-NEXT: [[CMP272:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[CMP272]], label [[FOR_END:%.*]], label [[FOR_BODY3_PREHEADER:%.*]] +; CHECK: for.body3.preheader: +; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[TMP3]], 3 +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult i32 [[TMP2]], 3 +; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY3_PREHEADER_NEW:%.*]] +; CHECK: for.body3.preheader.new: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[TMP3]], [[XTRAITER]] +; CHECK-NEXT: br label [[FOR_BODY3:%.*]] +; CHECK: for.body3: +; CHECK-NEXT: [[J_076:%.*]] = phi i32 [ 0, [[FOR_BODY3_PREHEADER_NEW]] ], [ [[ADD24_3:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[PDEST_ADDR_175:%.*]] = phi i32* [ [[PDEST_ADDR_091]], [[FOR_BODY3_PREHEADER_NEW]] ], [ [[INCDEC_PTR_3:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[PSRCA_ADDR_174:%.*]] = phi i16* [ [[PSRCA_ADDR_090]], [[FOR_BODY3_PREHEADER_NEW]] ], [ [[ADD_PTR_3:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[PSRCB_ADDR_173:%.*]] = phi i16* [ [[PSRCB_ADDR_089]], [[FOR_BODY3_PREHEADER_NEW]] ], [ [[ADD_PTR23_3:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ [[UNROLL_ITER]], [[FOR_BODY3_PREHEADER_NEW]] ], [ [[NITER_NSUB_3:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[TMP8:%.*]] = load i16, i16* [[PSRCA_ADDR_174]], align 2 +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = load i16, i16* [[PSRCB_ADDR_173]], align 2 +; CHECK-NEXT: [[CONV5:%.*]] = sext i16 [[TMP9]] to i32 +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV5]], [[CONV]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i16, i16* 
[[PSRCA_ADDR_174]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP10]] to i32 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX8]], align 2 +; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP11]] to i32 +; CHECK-NEXT: [[MUL10:%.*]] = mul nsw i32 [[CONV9]], [[CONV7]] +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX11]], align 2 +; CHECK-NEXT: [[CONV12:%.*]] = sext i16 [[TMP12]] to i32 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173]], i32 3 +; CHECK-NEXT: [[TMP13:%.*]] = load i16, i16* [[ARRAYIDX13]], align 2 +; CHECK-NEXT: [[CONV14:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[MUL15:%.*]] = mul nsw i32 [[CONV14]], [[CONV12]] +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174]], i32 3 +; CHECK-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX17]], align 2 +; CHECK-NEXT: [[CONV18:%.*]] = sext i16 [[TMP14]] to i32 +; CHECK-NEXT: [[ADD21:%.*]] = add i32 [[MUL10]], [[MUL]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[ADD21]], [[CONV14]] +; CHECK-NEXT: [[ADD16:%.*]] = add i32 [[ADD]], [[MUL15]] +; CHECK-NEXT: [[ADD22:%.*]] = add i32 [[ADD16]], [[CONV18]] +; CHECK-NEXT: store i32 [[ADD22]], i32* [[PDEST_ADDR_175]], align 4 +; CHECK-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174]], i32 4 +; CHECK-NEXT: [[ADD_PTR23:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[PDEST_ADDR_175]], i32 1 +; CHECK-NEXT: [[ADD24:%.*]] = add nuw nsw i32 [[J_076]], 4 +; CHECK-NEXT: [[NITER_NSUB:%.*]] = sub i32 [[NITER]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = load i16, i16* [[ADD_PTR]], align 2 +; CHECK-NEXT: [[CONV_1:%.*]] = sext i16 [[TMP15]] to i32 +; CHECK-NEXT: [[TMP16:%.*]] = load i16, i16* [[ADD_PTR23]], align 2 +; CHECK-NEXT: [[CONV5_1:%.*]] = sext i16 [[TMP16]] to i32 +; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[CONV5_1]], [[CONV_1]] +; CHECK-NEXT: [[ARRAYIDX6_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = load i16, i16* [[ARRAYIDX6_1]], align 2 +; CHECK-NEXT: [[CONV7_1:%.*]] = sext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX8_1]], align 2 +; CHECK-NEXT: [[CONV9_1:%.*]] = sext i16 [[TMP18]] to i32 +; CHECK-NEXT: [[MUL10_1:%.*]] = mul nsw i32 [[CONV9_1]], [[CONV7_1]] +; CHECK-NEXT: [[ARRAYIDX11_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = load i16, i16* [[ARRAYIDX11_1]], align 2 +; CHECK-NEXT: [[CONV12_1:%.*]] = sext i16 [[TMP19]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = load i16, i16* [[ARRAYIDX13_1]], align 2 +; CHECK-NEXT: [[CONV14_1:%.*]] = sext i16 [[TMP20]] to i32 +; CHECK-NEXT: [[MUL15_1:%.*]] = mul nsw i32 [[CONV14_1]], [[CONV12_1]] +; CHECK-NEXT: [[ARRAYIDX17_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR]], i32 3 +; CHECK-NEXT: [[TMP21:%.*]] = load i16, i16* [[ARRAYIDX17_1]], align 2 +; CHECK-NEXT: [[CONV18_1:%.*]] = sext i16 [[TMP21]] to i32 +; CHECK-NEXT: [[ADD21_1:%.*]] = add i32 [[MUL10_1]], [[MUL_1]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 
[[ADD21_1]], [[CONV14_1]] +; CHECK-NEXT: [[ADD16_1:%.*]] = add i32 [[ADD_1]], [[MUL15_1]] +; CHECK-NEXT: [[ADD22_1:%.*]] = add i32 [[ADD16_1]], [[CONV18_1]] +; CHECK-NEXT: store i32 [[ADD22_1]], i32* [[INCDEC_PTR]], align 4 +; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR]], i32 4 +; CHECK-NEXT: [[ADD_PTR23_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR_1:%.*]] = getelementptr inbounds i32, i32* [[INCDEC_PTR]], i32 1 +; CHECK-NEXT: [[ADD24_1:%.*]] = add nuw nsw i32 [[ADD24]], 4 +; CHECK-NEXT: [[NITER_NSUB_1:%.*]] = sub i32 [[NITER_NSUB]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = load i16, i16* [[ADD_PTR_1]], align 2 +; CHECK-NEXT: [[CONV_2:%.*]] = sext i16 [[TMP22]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = load i16, i16* [[ADD_PTR23_1]], align 2 +; CHECK-NEXT: [[CONV5_2:%.*]] = sext i16 [[TMP23]] to i32 +; CHECK-NEXT: [[MUL_2:%.*]] = mul nsw i32 [[CONV5_2]], [[CONV_2]] +; CHECK-NEXT: [[ARRAYIDX6_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_1]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = load i16, i16* [[ARRAYIDX6_2]], align 2 +; CHECK-NEXT: [[CONV7_2:%.*]] = sext i16 [[TMP24]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_1]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = load i16, i16* [[ARRAYIDX8_2]], align 2 +; CHECK-NEXT: [[CONV9_2:%.*]] = sext i16 [[TMP25]] to i32 +; CHECK-NEXT: [[MUL10_2:%.*]] = mul nsw i32 [[CONV9_2]], [[CONV7_2]] +; CHECK-NEXT: [[ARRAYIDX11_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_1]], i32 2 +; CHECK-NEXT: [[TMP26:%.*]] = load i16, i16* [[ARRAYIDX11_2]], align 2 +; CHECK-NEXT: [[CONV12_2:%.*]] = sext i16 [[TMP26]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_1]], i32 3 +; CHECK-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX13_2]], align 2 +; CHECK-NEXT: [[CONV14_2:%.*]] = sext i16 [[TMP27]] to i32 +; CHECK-NEXT: [[MUL15_2:%.*]] = mul nsw i32 [[CONV14_2]], [[CONV12_2]] +; CHECK-NEXT: [[ARRAYIDX17_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_1]], i32 3 +; CHECK-NEXT: [[TMP28:%.*]] = load i16, i16* [[ARRAYIDX17_2]], align 2 +; CHECK-NEXT: [[CONV18_2:%.*]] = sext i16 [[TMP28]] to i32 +; CHECK-NEXT: [[ADD21_2:%.*]] = add i32 [[MUL10_2]], [[MUL_2]] +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD21_2]], [[CONV14_2]] +; CHECK-NEXT: [[ADD16_2:%.*]] = add i32 [[ADD_2]], [[MUL15_2]] +; CHECK-NEXT: [[ADD22_2:%.*]] = add i32 [[ADD16_2]], [[CONV18_2]] +; CHECK-NEXT: store i32 [[ADD22_2]], i32* [[INCDEC_PTR_1]], align 4 +; CHECK-NEXT: [[ADD_PTR_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_1]], i32 4 +; CHECK-NEXT: [[ADD_PTR23_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_1]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR_2:%.*]] = getelementptr inbounds i32, i32* [[INCDEC_PTR_1]], i32 1 +; CHECK-NEXT: [[ADD24_2:%.*]] = add nuw nsw i32 [[ADD24_1]], 4 +; CHECK-NEXT: [[NITER_NSUB_2:%.*]] = sub i32 [[NITER_NSUB_1]], 1 +; CHECK-NEXT: [[TMP29:%.*]] = load i16, i16* [[ADD_PTR_2]], align 2 +; CHECK-NEXT: [[CONV_3:%.*]] = sext i16 [[TMP29]] to i32 +; CHECK-NEXT: [[TMP30:%.*]] = load i16, i16* [[ADD_PTR23_2]], align 2 +; CHECK-NEXT: [[CONV5_3:%.*]] = sext i16 [[TMP30]] to i32 +; CHECK-NEXT: [[MUL_3:%.*]] = mul nsw i32 [[CONV5_3]], [[CONV_3]] +; CHECK-NEXT: [[ARRAYIDX6_3:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_2]], i32 1 +; CHECK-NEXT: [[TMP31:%.*]] = load i16, i16* [[ARRAYIDX6_3]], align 2 +; CHECK-NEXT: [[CONV7_3:%.*]] = sext i16 [[TMP31]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr inbounds i16, 
i16* [[ADD_PTR23_2]], i32 1 +; CHECK-NEXT: [[TMP32:%.*]] = load i16, i16* [[ARRAYIDX8_3]], align 2 +; CHECK-NEXT: [[CONV9_3:%.*]] = sext i16 [[TMP32]] to i32 +; CHECK-NEXT: [[MUL10_3:%.*]] = mul nsw i32 [[CONV9_3]], [[CONV7_3]] +; CHECK-NEXT: [[ARRAYIDX11_3:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_2]], i32 2 +; CHECK-NEXT: [[TMP33:%.*]] = load i16, i16* [[ARRAYIDX11_3]], align 2 +; CHECK-NEXT: [[CONV12_3:%.*]] = sext i16 [[TMP33]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_3:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_2]], i32 3 +; CHECK-NEXT: [[TMP34:%.*]] = load i16, i16* [[ARRAYIDX13_3]], align 2 +; CHECK-NEXT: [[CONV14_3:%.*]] = sext i16 [[TMP34]] to i32 +; CHECK-NEXT: [[MUL15_3:%.*]] = mul nsw i32 [[CONV14_3]], [[CONV12_3]] +; CHECK-NEXT: [[ARRAYIDX17_3:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_2]], i32 3 +; CHECK-NEXT: [[TMP35:%.*]] = load i16, i16* [[ARRAYIDX17_3]], align 2 +; CHECK-NEXT: [[CONV18_3:%.*]] = sext i16 [[TMP35]] to i32 +; CHECK-NEXT: [[ADD21_3:%.*]] = add i32 [[MUL10_3]], [[MUL_3]] +; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[ADD21_3]], [[CONV14_3]] +; CHECK-NEXT: [[ADD16_3:%.*]] = add i32 [[ADD_3]], [[MUL15_3]] +; CHECK-NEXT: [[ADD22_3:%.*]] = add i32 [[ADD16_3]], [[CONV18_3]] +; CHECK-NEXT: store i32 [[ADD22_3]], i32* [[INCDEC_PTR_2]], align 4 +; CHECK-NEXT: [[ADD_PTR_3]] = getelementptr inbounds i16, i16* [[ADD_PTR_2]], i32 4 +; CHECK-NEXT: [[ADD_PTR23_3]] = getelementptr inbounds i16, i16* [[ADD_PTR23_2]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR_3]] = getelementptr inbounds i32, i32* [[INCDEC_PTR_2]], i32 1 +; CHECK-NEXT: [[ADD24_3]] = add nuw nsw i32 [[ADD24_2]], 4 +; CHECK-NEXT: [[NITER_NSUB_3]] = sub i32 [[NITER_NSUB_2]], 1 +; CHECK-NEXT: [[NITER_NCMP_3:%.*]] = icmp ne i32 [[NITER_NSUB_3]], 0 +; CHECK-NEXT: br i1 [[NITER_NCMP_3]], label [[FOR_BODY3]], label [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]] +; CHECK: for.end.loopexit.unr-lcssa.loopexit: +; CHECK-NEXT: [[ADD_PTR_LCSSA_PH_PH:%.*]] = phi i16* [ [[ADD_PTR_3]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[ADD_PTR23_LCSSA_PH_PH:%.*]] = phi i16* [ [[ADD_PTR23_3]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[INCDEC_PTR_LCSSA_PH_PH:%.*]] = phi i32* [ [[INCDEC_PTR_3]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[J_076_UNR_PH:%.*]] = phi i32 [ [[ADD24_3]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[PDEST_ADDR_175_UNR_PH:%.*]] = phi i32* [ [[INCDEC_PTR_3]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[PSRCA_ADDR_174_UNR_PH:%.*]] = phi i16* [ [[ADD_PTR_3]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[PSRCB_ADDR_173_UNR_PH:%.*]] = phi i16* [ [[ADD_PTR23_3]], [[FOR_BODY3]] ] +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_UNR_LCSSA]] +; CHECK: for.end.loopexit.unr-lcssa: +; CHECK-NEXT: [[ADD_PTR_LCSSA_PH:%.*]] = phi i16* [ undef, [[FOR_BODY3_PREHEADER]] ], [ [[ADD_PTR_LCSSA_PH_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[ADD_PTR23_LCSSA_PH:%.*]] = phi i16* [ undef, [[FOR_BODY3_PREHEADER]] ], [ [[ADD_PTR23_LCSSA_PH_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[INCDEC_PTR_LCSSA_PH:%.*]] = phi i32* [ undef, [[FOR_BODY3_PREHEADER]] ], [ [[INCDEC_PTR_LCSSA_PH_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[J_076_UNR:%.*]] = phi i32 [ 0, [[FOR_BODY3_PREHEADER]] ], [ [[J_076_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[PDEST_ADDR_175_UNR:%.*]] = phi i32* [ [[PDEST_ADDR_091]], [[FOR_BODY3_PREHEADER]] ], [ [[PDEST_ADDR_175_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[PSRCA_ADDR_174_UNR:%.*]] = phi i16* [ [[PSRCA_ADDR_090]], [[FOR_BODY3_PREHEADER]] ], [ [[PSRCA_ADDR_174_UNR_PH]], 
[[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[PSRCB_ADDR_173_UNR:%.*]] = phi i16* [ [[PSRCB_ADDR_089]], [[FOR_BODY3_PREHEADER]] ], [ [[PSRCB_ADDR_173_UNR_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ] +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_BODY3_EPIL_PREHEADER:%.*]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: for.body3.epil.preheader: +; CHECK-NEXT: br label [[FOR_BODY3_EPIL:%.*]] +; CHECK: for.body3.epil: +; CHECK-NEXT: [[TMP36:%.*]] = load i16, i16* [[PSRCA_ADDR_174_UNR]], align 2 +; CHECK-NEXT: [[CONV_EPIL:%.*]] = sext i16 [[TMP36]] to i32 +; CHECK-NEXT: [[TMP37:%.*]] = load i16, i16* [[PSRCB_ADDR_173_UNR]], align 2 +; CHECK-NEXT: [[CONV5_EPIL:%.*]] = sext i16 [[TMP37]] to i32 +; CHECK-NEXT: [[MUL_EPIL:%.*]] = mul nsw i32 [[CONV5_EPIL]], [[CONV_EPIL]] +; CHECK-NEXT: [[ARRAYIDX6_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174_UNR]], i32 1 +; CHECK-NEXT: [[TMP38:%.*]] = load i16, i16* [[ARRAYIDX6_EPIL]], align 2 +; CHECK-NEXT: [[CONV7_EPIL:%.*]] = sext i16 [[TMP38]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173_UNR]], i32 1 +; CHECK-NEXT: [[TMP39:%.*]] = load i16, i16* [[ARRAYIDX8_EPIL]], align 2 +; CHECK-NEXT: [[CONV9_EPIL:%.*]] = sext i16 [[TMP39]] to i32 +; CHECK-NEXT: [[MUL10_EPIL:%.*]] = mul nsw i32 [[CONV9_EPIL]], [[CONV7_EPIL]] +; CHECK-NEXT: [[ARRAYIDX11_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174_UNR]], i32 2 +; CHECK-NEXT: [[TMP40:%.*]] = load i16, i16* [[ARRAYIDX11_EPIL]], align 2 +; CHECK-NEXT: [[CONV12_EPIL:%.*]] = sext i16 [[TMP40]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173_UNR]], i32 3 +; CHECK-NEXT: [[TMP41:%.*]] = load i16, i16* [[ARRAYIDX13_EPIL]], align 2 +; CHECK-NEXT: [[CONV14_EPIL:%.*]] = sext i16 [[TMP41]] to i32 +; CHECK-NEXT: [[MUL15_EPIL:%.*]] = mul nsw i32 [[CONV14_EPIL]], [[CONV12_EPIL]] +; CHECK-NEXT: [[ARRAYIDX17_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174_UNR]], i32 3 +; CHECK-NEXT: [[TMP42:%.*]] = load i16, i16* [[ARRAYIDX17_EPIL]], align 2 +; CHECK-NEXT: [[CONV18_EPIL:%.*]] = sext i16 [[TMP42]] to i32 +; CHECK-NEXT: [[ADD21_EPIL:%.*]] = add i32 [[MUL10_EPIL]], [[MUL_EPIL]] +; CHECK-NEXT: [[ADD_EPIL:%.*]] = add i32 [[ADD21_EPIL]], [[CONV14_EPIL]] +; CHECK-NEXT: [[ADD16_EPIL:%.*]] = add i32 [[ADD_EPIL]], [[MUL15_EPIL]] +; CHECK-NEXT: [[ADD22_EPIL:%.*]] = add i32 [[ADD16_EPIL]], [[CONV18_EPIL]] +; CHECK-NEXT: store i32 [[ADD22_EPIL]], i32* [[PDEST_ADDR_175_UNR]], align 4 +; CHECK-NEXT: [[ADD_PTR_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_174_UNR]], i32 4 +; CHECK-NEXT: [[ADD_PTR23_EPIL:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_173_UNR]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR_EPIL:%.*]] = getelementptr inbounds i32, i32* [[PDEST_ADDR_175_UNR]], i32 1 +; CHECK-NEXT: [[ADD24_EPIL:%.*]] = add nuw nsw i32 [[J_076_UNR]], 4 +; CHECK-NEXT: [[EPIL_ITER_SUB:%.*]] = sub i32 [[XTRAITER]], 1 +; CHECK-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i32 [[EPIL_ITER_SUB]], 0 +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP]], label [[FOR_BODY3_EPIL_1:%.*]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA:%.*]] +; CHECK: for.end.loopexit.epilog-lcssa: +; CHECK-NEXT: [[ADD_PTR_LCSSA_PH1:%.*]] = phi i16* [ [[ADD_PTR_EPIL]], [[FOR_BODY3_EPIL]] ], [ [[ADD_PTR_EPIL_1:%.*]], [[FOR_BODY3_EPIL_1]] ], [ [[ADD_PTR_EPIL_2:%.*]], [[FOR_BODY3_EPIL_2:%.*]] ] +; CHECK-NEXT: [[ADD_PTR23_LCSSA_PH2:%.*]] = phi i16* [ [[ADD_PTR23_EPIL]], [[FOR_BODY3_EPIL]] ], [ 
[[ADD_PTR23_EPIL_1:%.*]], [[FOR_BODY3_EPIL_1]] ], [ [[ADD_PTR23_EPIL_2:%.*]], [[FOR_BODY3_EPIL_2]] ] +; CHECK-NEXT: [[INCDEC_PTR_LCSSA_PH3:%.*]] = phi i32* [ [[INCDEC_PTR_EPIL]], [[FOR_BODY3_EPIL]] ], [ [[INCDEC_PTR_EPIL_1:%.*]], [[FOR_BODY3_EPIL_1]] ], [ [[INCDEC_PTR_EPIL_2:%.*]], [[FOR_BODY3_EPIL_2]] ] +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT]] +; CHECK: for.end.loopexit: +; CHECK-NEXT: [[ADD_PTR_LCSSA:%.*]] = phi i16* [ [[ADD_PTR_LCSSA_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA]] ], [ [[ADD_PTR_LCSSA_PH1]], [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] ] +; CHECK-NEXT: [[ADD_PTR23_LCSSA:%.*]] = phi i16* [ [[ADD_PTR23_LCSSA_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA]] ], [ [[ADD_PTR23_LCSSA_PH2]], [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] ] +; CHECK-NEXT: [[INCDEC_PTR_LCSSA:%.*]] = phi i32* [ [[INCDEC_PTR_LCSSA_PH]], [[FOR_END_LOOPEXIT_UNR_LCSSA]] ], [ [[INCDEC_PTR_LCSSA_PH3]], [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] ] +; CHECK-NEXT: br label [[FOR_END]] +; CHECK: for.end: +; CHECK-NEXT: [[PSRCB_ADDR_1_LCSSA:%.*]] = phi i16* [ [[PSRCB_ADDR_089]], [[FOR_BODY]] ], [ [[ADD_PTR23_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[PSRCA_ADDR_1_LCSSA:%.*]] = phi i16* [ [[PSRCA_ADDR_090]], [[FOR_BODY]] ], [ [[ADD_PTR_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[PDEST_ADDR_1_LCSSA:%.*]] = phi i32* [ [[PDEST_ADDR_091]], [[FOR_BODY]] ], [ [[INCDEC_PTR_LCSSA]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[J_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[TMP6]], [[FOR_END_LOOPEXIT]] ] +; CHECK-NEXT: [[REM:%.*]] = and i32 [[TMP4]], 3 +; CHECK-NEXT: [[ADD25:%.*]] = or i32 [[J_0_LCSSA]], [[REM]] +; CHECK-NEXT: [[CMP2780:%.*]] = icmp ugt i32 [[ADD25]], [[J_0_LCSSA]] +; CHECK-NEXT: br i1 [[CMP2780]], label [[FOR_BODY29_PREHEADER:%.*]], label [[FOR_END40]] +; CHECK: for.body29.preheader: +; CHECK-NEXT: [[TMP43:%.*]] = sub nsw i32 [[ADD25]], [[J_0_LCSSA]] +; CHECK-NEXT: [[TMP44:%.*]] = sub i32 [[ADD25]], [[J_0_LCSSA]] +; CHECK-NEXT: [[TMP45:%.*]] = add i32 [[ADD25]], -1 +; CHECK-NEXT: [[TMP46:%.*]] = sub i32 [[TMP45]], [[J_0_LCSSA]] +; CHECK-NEXT: [[XTRAITER4:%.*]] = and i32 [[TMP44]], 3 +; CHECK-NEXT: [[LCMP_MOD5:%.*]] = icmp ne i32 [[XTRAITER4]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD5]], label [[FOR_BODY29_PROL_PREHEADER:%.*]], label [[FOR_BODY29_PROL_LOOPEXIT:%.*]] +; CHECK: for.body29.prol.preheader: +; CHECK-NEXT: br label [[FOR_BODY29_PROL:%.*]] +; CHECK: for.body29.prol: +; CHECK-NEXT: [[ARRAYIDX30_PROL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_1_LCSSA]], i32 [[J_0_LCSSA]] +; CHECK-NEXT: [[TMP47:%.*]] = load i16, i16* [[ARRAYIDX30_PROL]], align 2 +; CHECK-NEXT: [[CONV31_PROL:%.*]] = sext i16 [[TMP47]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_PROL:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_1_LCSSA]], i32 [[J_0_LCSSA]] +; CHECK-NEXT: [[TMP48:%.*]] = load i16, i16* [[ARRAYIDX32_PROL]], align 2 +; CHECK-NEXT: [[CONV33_PROL:%.*]] = sext i16 [[TMP48]] to i32 +; CHECK-NEXT: [[MUL34_PROL:%.*]] = mul nsw i32 [[CONV33_PROL]], [[CONV31_PROL]] +; CHECK-NEXT: [[TMP49:%.*]] = load i32, i32* [[PDEST_ADDR_1_LCSSA]], align 4 +; CHECK-NEXT: [[ADD35_PROL:%.*]] = add nsw i32 [[MUL34_PROL]], [[TMP49]] +; CHECK-NEXT: store i32 [[ADD35_PROL]], i32* [[PDEST_ADDR_1_LCSSA]], align 4 +; CHECK-NEXT: [[INCDEC_PTR36_PROL:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_1_LCSSA]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR37_PROL:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_1_LCSSA]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38_PROL:%.*]] = getelementptr inbounds i32, i32* [[PDEST_ADDR_1_LCSSA]], i32 1 +; CHECK-NEXT: [[INC_PROL:%.*]] = add nuw i32 
[[J_0_LCSSA]], 1 +; CHECK-NEXT: [[PROL_ITER_SUB:%.*]] = sub i32 [[XTRAITER4]], 1 +; CHECK-NEXT: [[PROL_ITER_CMP:%.*]] = icmp ne i32 [[PROL_ITER_SUB]], 0 +; CHECK-NEXT: br i1 [[PROL_ITER_CMP]], label [[FOR_BODY29_PROL_1:%.*]], label [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA:%.*]] +; CHECK: for.body29.prol.loopexit.unr-lcssa: +; CHECK-NEXT: [[J_184_UNR_PH:%.*]] = phi i32 [ [[INC_PROL]], [[FOR_BODY29_PROL]] ], [ [[INC_PROL_1:%.*]], [[FOR_BODY29_PROL_1]] ], [ [[INC_PROL_2:%.*]], [[FOR_BODY29_PROL_2:%.*]] ] +; CHECK-NEXT: [[PDEST_ADDR_283_UNR_PH:%.*]] = phi i32* [ [[INCDEC_PTR38_PROL]], [[FOR_BODY29_PROL]] ], [ [[INCDEC_PTR38_PROL_1:%.*]], [[FOR_BODY29_PROL_1]] ], [ [[INCDEC_PTR38_PROL_2:%.*]], [[FOR_BODY29_PROL_2]] ] +; CHECK-NEXT: [[PSRCA_ADDR_282_UNR_PH:%.*]] = phi i16* [ [[INCDEC_PTR36_PROL]], [[FOR_BODY29_PROL]] ], [ [[INCDEC_PTR36_PROL_1:%.*]], [[FOR_BODY29_PROL_1]] ], [ [[INCDEC_PTR36_PROL_2:%.*]], [[FOR_BODY29_PROL_2]] ] +; CHECK-NEXT: [[PSRCB_ADDR_281_UNR_PH:%.*]] = phi i16* [ [[INCDEC_PTR37_PROL]], [[FOR_BODY29_PROL]] ], [ [[INCDEC_PTR37_PROL_1:%.*]], [[FOR_BODY29_PROL_1]] ], [ [[INCDEC_PTR37_PROL_2:%.*]], [[FOR_BODY29_PROL_2]] ] +; CHECK-NEXT: br label [[FOR_BODY29_PROL_LOOPEXIT]] +; CHECK: for.body29.prol.loopexit: +; CHECK-NEXT: [[J_184_UNR:%.*]] = phi i32 [ [[J_0_LCSSA]], [[FOR_BODY29_PREHEADER]] ], [ [[J_184_UNR_PH]], [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[PDEST_ADDR_283_UNR:%.*]] = phi i32* [ [[PDEST_ADDR_1_LCSSA]], [[FOR_BODY29_PREHEADER]] ], [ [[PDEST_ADDR_283_UNR_PH]], [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[PSRCA_ADDR_282_UNR:%.*]] = phi i16* [ [[PSRCA_ADDR_1_LCSSA]], [[FOR_BODY29_PREHEADER]] ], [ [[PSRCA_ADDR_282_UNR_PH]], [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[PSRCB_ADDR_281_UNR:%.*]] = phi i16* [ [[PSRCB_ADDR_1_LCSSA]], [[FOR_BODY29_PREHEADER]] ], [ [[PSRCB_ADDR_281_UNR_PH]], [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] ] +; CHECK-NEXT: [[TMP50:%.*]] = icmp ult i32 [[TMP46]], 3 +; CHECK-NEXT: br i1 [[TMP50]], label [[FOR_END40_LOOPEXIT:%.*]], label [[FOR_BODY29_PREHEADER_NEW:%.*]] +; CHECK: for.body29.preheader.new: +; CHECK-NEXT: br label [[FOR_BODY29:%.*]] +; CHECK: for.body29: +; CHECK-NEXT: [[J_184:%.*]] = phi i32 [ [[J_184_UNR]], [[FOR_BODY29_PREHEADER_NEW]] ], [ [[INC_3:%.*]], [[FOR_BODY29]] ] +; CHECK-NEXT: [[PDEST_ADDR_283:%.*]] = phi i32* [ [[PDEST_ADDR_283_UNR]], [[FOR_BODY29_PREHEADER_NEW]] ], [ [[INCDEC_PTR38_3:%.*]], [[FOR_BODY29]] ] +; CHECK-NEXT: [[PSRCA_ADDR_282:%.*]] = phi i16* [ [[PSRCA_ADDR_282_UNR]], [[FOR_BODY29_PREHEADER_NEW]] ], [ [[INCDEC_PTR36_3:%.*]], [[FOR_BODY29]] ] +; CHECK-NEXT: [[PSRCB_ADDR_281:%.*]] = phi i16* [ [[PSRCB_ADDR_281_UNR]], [[FOR_BODY29_PREHEADER_NEW]] ], [ [[INCDEC_PTR37_3:%.*]], [[FOR_BODY29]] ] +; CHECK-NEXT: [[ARRAYIDX30:%.*]] = getelementptr inbounds i16, i16* [[PSRCA_ADDR_282]], i32 [[J_184]] +; CHECK-NEXT: [[TMP51:%.*]] = load i16, i16* [[ARRAYIDX30]], align 2 +; CHECK-NEXT: [[CONV31:%.*]] = sext i16 [[TMP51]] to i32 +; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_281]], i32 [[J_184]] +; CHECK-NEXT: [[TMP52:%.*]] = load i16, i16* [[ARRAYIDX32]], align 2 +; CHECK-NEXT: [[CONV33:%.*]] = sext i16 [[TMP52]] to i32 +; CHECK-NEXT: [[MUL34:%.*]] = mul nsw i32 [[CONV33]], [[CONV31]] +; CHECK-NEXT: [[TMP53:%.*]] = load i32, i32* [[PDEST_ADDR_283]], align 4 +; CHECK-NEXT: [[ADD35:%.*]] = add nsw i32 [[MUL34]], [[TMP53]] +; CHECK-NEXT: store i32 [[ADD35]], i32* [[PDEST_ADDR_283]], align 4 +; CHECK-NEXT: [[INCDEC_PTR36:%.*]] = getelementptr inbounds i16, 
i16* [[PSRCA_ADDR_282]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR37:%.*]] = getelementptr inbounds i16, i16* [[PSRCB_ADDR_281]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38:%.*]] = getelementptr inbounds i32, i32* [[PDEST_ADDR_283]], i32 1 +; CHECK-NEXT: [[INC:%.*]] = add nuw i32 [[J_184]], 1 +; CHECK-NEXT: [[ARRAYIDX30_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36]], i32 [[INC]] +; CHECK-NEXT: [[TMP54:%.*]] = load i16, i16* [[ARRAYIDX30_1]], align 2 +; CHECK-NEXT: [[CONV31_1:%.*]] = sext i16 [[TMP54]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37]], i32 [[INC]] +; CHECK-NEXT: [[TMP55:%.*]] = load i16, i16* [[ARRAYIDX32_1]], align 2 +; CHECK-NEXT: [[CONV33_1:%.*]] = sext i16 [[TMP55]] to i32 +; CHECK-NEXT: [[MUL34_1:%.*]] = mul nsw i32 [[CONV33_1]], [[CONV31_1]] +; CHECK-NEXT: [[TMP56:%.*]] = load i32, i32* [[INCDEC_PTR38]], align 4 +; CHECK-NEXT: [[ADD35_1:%.*]] = add nsw i32 [[MUL34_1]], [[TMP56]] +; CHECK-NEXT: store i32 [[ADD35_1]], i32* [[INCDEC_PTR38]], align 4 +; CHECK-NEXT: [[INCDEC_PTR36_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR37_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38_1:%.*]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38]], i32 1 +; CHECK-NEXT: [[INC_1:%.*]] = add nuw i32 [[INC]], 1 +; CHECK-NEXT: [[ARRAYIDX30_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_1]], i32 [[INC_1]] +; CHECK-NEXT: [[TMP57:%.*]] = load i16, i16* [[ARRAYIDX30_2]], align 2 +; CHECK-NEXT: [[CONV31_2:%.*]] = sext i16 [[TMP57]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_1]], i32 [[INC_1]] +; CHECK-NEXT: [[TMP58:%.*]] = load i16, i16* [[ARRAYIDX32_2]], align 2 +; CHECK-NEXT: [[CONV33_2:%.*]] = sext i16 [[TMP58]] to i32 +; CHECK-NEXT: [[MUL34_2:%.*]] = mul nsw i32 [[CONV33_2]], [[CONV31_2]] +; CHECK-NEXT: [[TMP59:%.*]] = load i32, i32* [[INCDEC_PTR38_1]], align 4 +; CHECK-NEXT: [[ADD35_2:%.*]] = add nsw i32 [[MUL34_2]], [[TMP59]] +; CHECK-NEXT: store i32 [[ADD35_2]], i32* [[INCDEC_PTR38_1]], align 4 +; CHECK-NEXT: [[INCDEC_PTR36_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_1]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR37_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_1]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38_2:%.*]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38_1]], i32 1 +; CHECK-NEXT: [[INC_2:%.*]] = add nuw i32 [[INC_1]], 1 +; CHECK-NEXT: [[ARRAYIDX30_3:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_2]], i32 [[INC_2]] +; CHECK-NEXT: [[TMP60:%.*]] = load i16, i16* [[ARRAYIDX30_3]], align 2 +; CHECK-NEXT: [[CONV31_3:%.*]] = sext i16 [[TMP60]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_3:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_2]], i32 [[INC_2]] +; CHECK-NEXT: [[TMP61:%.*]] = load i16, i16* [[ARRAYIDX32_3]], align 2 +; CHECK-NEXT: [[CONV33_3:%.*]] = sext i16 [[TMP61]] to i32 +; CHECK-NEXT: [[MUL34_3:%.*]] = mul nsw i32 [[CONV33_3]], [[CONV31_3]] +; CHECK-NEXT: [[TMP62:%.*]] = load i32, i32* [[INCDEC_PTR38_2]], align 4 +; CHECK-NEXT: [[ADD35_3:%.*]] = add nsw i32 [[MUL34_3]], [[TMP62]] +; CHECK-NEXT: store i32 [[ADD35_3]], i32* [[INCDEC_PTR38_2]], align 4 +; CHECK-NEXT: [[INCDEC_PTR36_3]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_2]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR37_3]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_2]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38_3]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38_2]], i32 1 +; CHECK-NEXT: [[INC_3]] = 
add nuw i32 [[INC_2]], 1 +; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[ADD25]] +; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[FOR_END40_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY29]] +; CHECK: for.end40.loopexit.unr-lcssa: +; CHECK-NEXT: br label [[FOR_END40_LOOPEXIT]] +; CHECK: for.end40.loopexit: +; CHECK-NEXT: [[SCEVGEP93:%.*]] = getelementptr i16, i16* [[PSRCB_ADDR_1_LCSSA]], i32 [[TMP43]] +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i16, i16* [[PSRCA_ADDR_1_LCSSA]], i32 [[TMP43]] +; CHECK-NEXT: [[SCEVGEP94:%.*]] = getelementptr i32, i32* [[PDEST_ADDR_1_LCSSA]], i32 [[TMP43]] +; CHECK-NEXT: br label [[FOR_END40]] +; CHECK: for.end40: +; CHECK-NEXT: [[PSRCB_ADDR_2_LCSSA]] = phi i16* [ [[PSRCB_ADDR_1_LCSSA]], [[FOR_END]] ], [ [[SCEVGEP93]], [[FOR_END40_LOOPEXIT]] ] +; CHECK-NEXT: [[PSRCA_ADDR_2_LCSSA]] = phi i16* [ [[PSRCA_ADDR_1_LCSSA]], [[FOR_END]] ], [ [[SCEVGEP]], [[FOR_END40_LOOPEXIT]] ] +; CHECK-NEXT: [[PDEST_ADDR_2_LCSSA]] = phi i32* [ [[PDEST_ADDR_1_LCSSA]], [[FOR_END]] ], [ [[SCEVGEP94]], [[FOR_END40_LOOPEXIT]] ] +; CHECK-NEXT: [[INC42]] = add nuw i32 [[I_092]], 1 +; CHECK-NEXT: [[EXITCOND95:%.*]] = icmp eq i32 [[INC42]], [[BLKCNT]] +; CHECK-NEXT: br i1 [[EXITCOND95]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] +; CHECK: for.body3.epil.1: +; CHECK-NEXT: [[TMP63:%.*]] = load i16, i16* [[ADD_PTR_EPIL]], align 2 +; CHECK-NEXT: [[CONV_EPIL_1:%.*]] = sext i16 [[TMP63]] to i32 +; CHECK-NEXT: [[TMP64:%.*]] = load i16, i16* [[ADD_PTR23_EPIL]], align 2 +; CHECK-NEXT: [[CONV5_EPIL_1:%.*]] = sext i16 [[TMP64]] to i32 +; CHECK-NEXT: [[MUL_EPIL_1:%.*]] = mul nsw i32 [[CONV5_EPIL_1]], [[CONV_EPIL_1]] +; CHECK-NEXT: [[ARRAYIDX6_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL]], i32 1 +; CHECK-NEXT: [[TMP65:%.*]] = load i16, i16* [[ARRAYIDX6_EPIL_1]], align 2 +; CHECK-NEXT: [[CONV7_EPIL_1:%.*]] = sext i16 [[TMP65]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL]], i32 1 +; CHECK-NEXT: [[TMP66:%.*]] = load i16, i16* [[ARRAYIDX8_EPIL_1]], align 2 +; CHECK-NEXT: [[CONV9_EPIL_1:%.*]] = sext i16 [[TMP66]] to i32 +; CHECK-NEXT: [[MUL10_EPIL_1:%.*]] = mul nsw i32 [[CONV9_EPIL_1]], [[CONV7_EPIL_1]] +; CHECK-NEXT: [[ARRAYIDX11_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL]], i32 2 +; CHECK-NEXT: [[TMP67:%.*]] = load i16, i16* [[ARRAYIDX11_EPIL_1]], align 2 +; CHECK-NEXT: [[CONV12_EPIL_1:%.*]] = sext i16 [[TMP67]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL]], i32 3 +; CHECK-NEXT: [[TMP68:%.*]] = load i16, i16* [[ARRAYIDX13_EPIL_1]], align 2 +; CHECK-NEXT: [[CONV14_EPIL_1:%.*]] = sext i16 [[TMP68]] to i32 +; CHECK-NEXT: [[MUL15_EPIL_1:%.*]] = mul nsw i32 [[CONV14_EPIL_1]], [[CONV12_EPIL_1]] +; CHECK-NEXT: [[ARRAYIDX17_EPIL_1:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL]], i32 3 +; CHECK-NEXT: [[TMP69:%.*]] = load i16, i16* [[ARRAYIDX17_EPIL_1]], align 2 +; CHECK-NEXT: [[CONV18_EPIL_1:%.*]] = sext i16 [[TMP69]] to i32 +; CHECK-NEXT: [[ADD21_EPIL_1:%.*]] = add i32 [[MUL10_EPIL_1]], [[MUL_EPIL_1]] +; CHECK-NEXT: [[ADD_EPIL_1:%.*]] = add i32 [[ADD21_EPIL_1]], [[CONV14_EPIL_1]] +; CHECK-NEXT: [[ADD16_EPIL_1:%.*]] = add i32 [[ADD_EPIL_1]], [[MUL15_EPIL_1]] +; CHECK-NEXT: [[ADD22_EPIL_1:%.*]] = add i32 [[ADD16_EPIL_1]], [[CONV18_EPIL_1]] +; CHECK-NEXT: store i32 [[ADD22_EPIL_1]], i32* [[INCDEC_PTR_EPIL]], align 4 +; CHECK-NEXT: [[ADD_PTR_EPIL_1]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL]], i32 4 +; CHECK-NEXT: 
[[ADD_PTR23_EPIL_1]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR_EPIL_1]] = getelementptr inbounds i32, i32* [[INCDEC_PTR_EPIL]], i32 1 +; CHECK-NEXT: [[ADD24_EPIL_1:%.*]] = add nuw nsw i32 [[ADD24_EPIL]], 4 +; CHECK-NEXT: [[EPIL_ITER_SUB_1:%.*]] = sub i32 [[EPIL_ITER_SUB]], 1 +; CHECK-NEXT: [[EPIL_ITER_CMP_1:%.*]] = icmp ne i32 [[EPIL_ITER_SUB_1]], 0 +; CHECK-NEXT: br i1 [[EPIL_ITER_CMP_1]], label [[FOR_BODY3_EPIL_2]], label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] +; CHECK: for.body3.epil.2: +; CHECK-NEXT: [[TMP70:%.*]] = load i16, i16* [[ADD_PTR_EPIL_1]], align 2 +; CHECK-NEXT: [[CONV_EPIL_2:%.*]] = sext i16 [[TMP70]] to i32 +; CHECK-NEXT: [[TMP71:%.*]] = load i16, i16* [[ADD_PTR23_EPIL_1]], align 2 +; CHECK-NEXT: [[CONV5_EPIL_2:%.*]] = sext i16 [[TMP71]] to i32 +; CHECK-NEXT: [[MUL_EPIL_2:%.*]] = mul nsw i32 [[CONV5_EPIL_2]], [[CONV_EPIL_2]] +; CHECK-NEXT: [[ARRAYIDX6_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL_1]], i32 1 +; CHECK-NEXT: [[TMP72:%.*]] = load i16, i16* [[ARRAYIDX6_EPIL_2]], align 2 +; CHECK-NEXT: [[CONV7_EPIL_2:%.*]] = sext i16 [[TMP72]] to i32 +; CHECK-NEXT: [[ARRAYIDX8_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL_1]], i32 1 +; CHECK-NEXT: [[TMP73:%.*]] = load i16, i16* [[ARRAYIDX8_EPIL_2]], align 2 +; CHECK-NEXT: [[CONV9_EPIL_2:%.*]] = sext i16 [[TMP73]] to i32 +; CHECK-NEXT: [[MUL10_EPIL_2:%.*]] = mul nsw i32 [[CONV9_EPIL_2]], [[CONV7_EPIL_2]] +; CHECK-NEXT: [[ARRAYIDX11_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL_1]], i32 2 +; CHECK-NEXT: [[TMP74:%.*]] = load i16, i16* [[ARRAYIDX11_EPIL_2]], align 2 +; CHECK-NEXT: [[CONV12_EPIL_2:%.*]] = sext i16 [[TMP74]] to i32 +; CHECK-NEXT: [[ARRAYIDX13_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL_1]], i32 3 +; CHECK-NEXT: [[TMP75:%.*]] = load i16, i16* [[ARRAYIDX13_EPIL_2]], align 2 +; CHECK-NEXT: [[CONV14_EPIL_2:%.*]] = sext i16 [[TMP75]] to i32 +; CHECK-NEXT: [[MUL15_EPIL_2:%.*]] = mul nsw i32 [[CONV14_EPIL_2]], [[CONV12_EPIL_2]] +; CHECK-NEXT: [[ARRAYIDX17_EPIL_2:%.*]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL_1]], i32 3 +; CHECK-NEXT: [[TMP76:%.*]] = load i16, i16* [[ARRAYIDX17_EPIL_2]], align 2 +; CHECK-NEXT: [[CONV18_EPIL_2:%.*]] = sext i16 [[TMP76]] to i32 +; CHECK-NEXT: [[ADD21_EPIL_2:%.*]] = add i32 [[MUL10_EPIL_2]], [[MUL_EPIL_2]] +; CHECK-NEXT: [[ADD_EPIL_2:%.*]] = add i32 [[ADD21_EPIL_2]], [[CONV14_EPIL_2]] +; CHECK-NEXT: [[ADD16_EPIL_2:%.*]] = add i32 [[ADD_EPIL_2]], [[MUL15_EPIL_2]] +; CHECK-NEXT: [[ADD22_EPIL_2:%.*]] = add i32 [[ADD16_EPIL_2]], [[CONV18_EPIL_2]] +; CHECK-NEXT: store i32 [[ADD22_EPIL_2]], i32* [[INCDEC_PTR_EPIL_1]], align 4 +; CHECK-NEXT: [[ADD_PTR_EPIL_2]] = getelementptr inbounds i16, i16* [[ADD_PTR_EPIL_1]], i32 4 +; CHECK-NEXT: [[ADD_PTR23_EPIL_2]] = getelementptr inbounds i16, i16* [[ADD_PTR23_EPIL_1]], i32 4 +; CHECK-NEXT: [[INCDEC_PTR_EPIL_2]] = getelementptr inbounds i32, i32* [[INCDEC_PTR_EPIL_1]], i32 1 +; CHECK-NEXT: [[ADD24_EPIL_2:%.*]] = add nuw nsw i32 [[ADD24_EPIL_1]], 4 +; CHECK-NEXT: [[EPIL_ITER_SUB_2:%.*]] = sub i32 [[EPIL_ITER_SUB_1]], 1 +; CHECK-NEXT: br label [[FOR_END_LOOPEXIT_EPILOG_LCSSA]] +; CHECK: for.body29.prol.1: +; CHECK-NEXT: [[ARRAYIDX30_PROL_1:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_PROL]], i32 [[INC_PROL]] +; CHECK-NEXT: [[TMP77:%.*]] = load i16, i16* [[ARRAYIDX30_PROL_1]], align 2 +; CHECK-NEXT: [[CONV31_PROL_1:%.*]] = sext i16 [[TMP77]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_PROL_1:%.*]] = getelementptr inbounds i16, i16* 
[[INCDEC_PTR37_PROL]], i32 [[INC_PROL]] +; CHECK-NEXT: [[TMP78:%.*]] = load i16, i16* [[ARRAYIDX32_PROL_1]], align 2 +; CHECK-NEXT: [[CONV33_PROL_1:%.*]] = sext i16 [[TMP78]] to i32 +; CHECK-NEXT: [[MUL34_PROL_1:%.*]] = mul nsw i32 [[CONV33_PROL_1]], [[CONV31_PROL_1]] +; CHECK-NEXT: [[TMP79:%.*]] = load i32, i32* [[INCDEC_PTR38_PROL]], align 4 +; CHECK-NEXT: [[ADD35_PROL_1:%.*]] = add nsw i32 [[MUL34_PROL_1]], [[TMP79]] +; CHECK-NEXT: store i32 [[ADD35_PROL_1]], i32* [[INCDEC_PTR38_PROL]], align 4 +; CHECK-NEXT: [[INCDEC_PTR36_PROL_1]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_PROL]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR37_PROL_1]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_PROL]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38_PROL_1]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38_PROL]], i32 1 +; CHECK-NEXT: [[INC_PROL_1]] = add nuw i32 [[INC_PROL]], 1 +; CHECK-NEXT: [[PROL_ITER_SUB_1:%.*]] = sub i32 [[PROL_ITER_SUB]], 1 +; CHECK-NEXT: [[PROL_ITER_CMP_1:%.*]] = icmp ne i32 [[PROL_ITER_SUB_1]], 0 +; CHECK-NEXT: br i1 [[PROL_ITER_CMP_1]], label [[FOR_BODY29_PROL_2]], label [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] +; CHECK: for.body29.prol.2: +; CHECK-NEXT: [[ARRAYIDX30_PROL_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_PROL_1]], i32 [[INC_PROL_1]] +; CHECK-NEXT: [[TMP80:%.*]] = load i16, i16* [[ARRAYIDX30_PROL_2]], align 2 +; CHECK-NEXT: [[CONV31_PROL_2:%.*]] = sext i16 [[TMP80]] to i32 +; CHECK-NEXT: [[ARRAYIDX32_PROL_2:%.*]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_PROL_1]], i32 [[INC_PROL_1]] +; CHECK-NEXT: [[TMP81:%.*]] = load i16, i16* [[ARRAYIDX32_PROL_2]], align 2 +; CHECK-NEXT: [[CONV33_PROL_2:%.*]] = sext i16 [[TMP81]] to i32 +; CHECK-NEXT: [[MUL34_PROL_2:%.*]] = mul nsw i32 [[CONV33_PROL_2]], [[CONV31_PROL_2]] +; CHECK-NEXT: [[TMP82:%.*]] = load i32, i32* [[INCDEC_PTR38_PROL_1]], align 4 +; CHECK-NEXT: [[ADD35_PROL_2:%.*]] = add nsw i32 [[MUL34_PROL_2]], [[TMP82]] +; CHECK-NEXT: store i32 [[ADD35_PROL_2]], i32* [[INCDEC_PTR38_PROL_1]], align 4 +; CHECK-NEXT: [[INCDEC_PTR36_PROL_2]] = getelementptr inbounds i16, i16* [[INCDEC_PTR36_PROL_1]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR37_PROL_2]] = getelementptr inbounds i16, i16* [[INCDEC_PTR37_PROL_1]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR38_PROL_2]] = getelementptr inbounds i32, i32* [[INCDEC_PTR38_PROL_1]], i32 1 +; CHECK-NEXT: [[INC_PROL_2]] = add nuw i32 [[INC_PROL_1]], 1 +; CHECK-NEXT: [[PROL_ITER_SUB_2:%.*]] = sub i32 [[PROL_ITER_SUB_1]], 1 +; CHECK-NEXT: br label [[FOR_BODY29_PROL_LOOPEXIT_UNR_LCSSA]] +; +entry: + %cmp88 = icmp eq i32 %blkCnt, 0 + br i1 %cmp88, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.end40, %entry + ret void + +for.body: ; preds = %for.end40, %entry + %i.092 = phi i32 [ %inc42, %for.end40 ], [ 0, %entry ] + %pDest.addr.091 = phi i32* [ %pDest.addr.2.lcssa, %for.end40 ], [ %pDest, %entry ] + %pSrcA.addr.090 = phi i16* [ %pSrcA.addr.2.lcssa, %for.end40 ], [ %pSrcA, %entry ] + %pSrcB.addr.089 = phi i16* [ %pSrcB.addr.2.lcssa, %for.end40 ], [ %pSrcB, %entry ] + %0 = lshr i32 %i.092, 2 + %1 = add nuw nsw i32 %0, 3 + %2 = and i32 %1, 2147483644 + %cmp272 = icmp eq i32 %0, 0 + br i1 %cmp272, label %for.end, label %for.body3 + +for.body3: ; preds = %for.body3, %for.body + %j.076 = phi i32 [ %add24, %for.body3 ], [ 0, %for.body ] + %pDest.addr.175 = phi i32* [ %incdec.ptr, %for.body3 ], [ %pDest.addr.091, %for.body ] + %pSrcA.addr.174 = phi i16* [ %add.ptr, %for.body3 ], [ %pSrcA.addr.090, %for.body ] + %pSrcB.addr.173 = phi i16* [ %add.ptr23, %for.body3 ], [ 
%pSrcB.addr.089, %for.body ] + %3 = load i16, i16* %pSrcA.addr.174, align 2 + %conv = sext i16 %3 to i32 + %4 = load i16, i16* %pSrcB.addr.173, align 2 + %conv5 = sext i16 %4 to i32 + %mul = mul nsw i32 %conv5, %conv + %arrayidx6 = getelementptr inbounds i16, i16* %pSrcA.addr.174, i32 1 + %5 = load i16, i16* %arrayidx6, align 2 + %conv7 = sext i16 %5 to i32 + %arrayidx8 = getelementptr inbounds i16, i16* %pSrcB.addr.173, i32 1 + %6 = load i16, i16* %arrayidx8, align 2 + %conv9 = sext i16 %6 to i32 + %mul10 = mul nsw i32 %conv9, %conv7 + %arrayidx11 = getelementptr inbounds i16, i16* %pSrcA.addr.174, i32 2 + %7 = load i16, i16* %arrayidx11, align 2 + %conv12 = sext i16 %7 to i32 + %arrayidx13 = getelementptr inbounds i16, i16* %pSrcB.addr.173, i32 3 + %8 = load i16, i16* %arrayidx13, align 2 + %conv14 = sext i16 %8 to i32 + %mul15 = mul nsw i32 %conv14, %conv12 + %arrayidx17 = getelementptr inbounds i16, i16* %pSrcA.addr.174, i32 3 + %9 = load i16, i16* %arrayidx17, align 2 + %conv18 = sext i16 %9 to i32 + %add21 = add i32 %mul10, %mul + %add = add i32 %add21, %conv14 + %add16 = add i32 %add, %mul15 + %add22 = add i32 %add16, %conv18 + store i32 %add22, i32* %pDest.addr.175, align 4 + %add.ptr = getelementptr inbounds i16, i16* %pSrcA.addr.174, i32 4 + %add.ptr23 = getelementptr inbounds i16, i16* %pSrcB.addr.173, i32 4 + %incdec.ptr = getelementptr inbounds i32, i32* %pDest.addr.175, i32 1 + %add24 = add nuw nsw i32 %j.076, 4 + %cmp2 = icmp ult i32 %add24, %0 + br i1 %cmp2, label %for.body3, label %for.end + +for.end: ; preds = %for.body3, %for.body + %pSrcB.addr.1.lcssa = phi i16* [ %pSrcB.addr.089, %for.body ], [ %add.ptr23, %for.body3 ] + %pSrcA.addr.1.lcssa = phi i16* [ %pSrcA.addr.090, %for.body ], [ %add.ptr, %for.body3 ] + %pDest.addr.1.lcssa = phi i32* [ %pDest.addr.091, %for.body ], [ %incdec.ptr, %for.body3 ] + %j.0.lcssa = phi i32 [ 0, %for.body ], [ %2, %for.body3 ] + %rem = and i32 %0, 3 + %add25 = or i32 %j.0.lcssa, %rem + %cmp2780 = icmp ugt i32 %add25, %j.0.lcssa + br i1 %cmp2780, label %for.body29.preheader, label %for.end40 + +for.body29.preheader: ; preds = %for.end + %10 = sub nsw i32 %add25, %j.0.lcssa + %scevgep93 = getelementptr i16, i16* %pSrcB.addr.1.lcssa, i32 %10 + br label %for.body29 + +for.body29: ; preds = %for.body29, %for.body29.preheader + %j.184 = phi i32 [ %inc, %for.body29 ], [ %j.0.lcssa, %for.body29.preheader ] + %pDest.addr.283 = phi i32* [ %incdec.ptr38, %for.body29 ], [ %pDest.addr.1.lcssa, %for.body29.preheader ] + %pSrcA.addr.282 = phi i16* [ %incdec.ptr36, %for.body29 ], [ %pSrcA.addr.1.lcssa, %for.body29.preheader ] + %pSrcB.addr.281 = phi i16* [ %incdec.ptr37, %for.body29 ], [ %pSrcB.addr.1.lcssa, %for.body29.preheader ] + %arrayidx30 = getelementptr inbounds i16, i16* %pSrcA.addr.282, i32 %j.184 + %11 = load i16, i16* %arrayidx30, align 2 + %conv31 = sext i16 %11 to i32 + %arrayidx32 = getelementptr inbounds i16, i16* %pSrcB.addr.281, i32 %j.184 + %12 = load i16, i16* %arrayidx32, align 2 + %conv33 = sext i16 %12 to i32 + %mul34 = mul nsw i32 %conv33, %conv31 + %13 = load i32, i32* %pDest.addr.283, align 4 + %add35 = add nsw i32 %mul34, %13 + store i32 %add35, i32* %pDest.addr.283, align 4 + %incdec.ptr36 = getelementptr inbounds i16, i16* %pSrcA.addr.282, i32 1 + %incdec.ptr37 = getelementptr inbounds i16, i16* %pSrcB.addr.281, i32 1 + %incdec.ptr38 = getelementptr inbounds i32, i32* %pDest.addr.283, i32 1 + %inc = add nuw i32 %j.184, 1 + %exitcond = icmp eq i32 %inc, %add25 + br i1 %exitcond, label %for.end40.loopexit, label %for.body29 + 
+for.end40.loopexit: ; preds = %for.body29 + %scevgep = getelementptr i16, i16* %pSrcA.addr.1.lcssa, i32 %10 + %scevgep94 = getelementptr i32, i32* %pDest.addr.1.lcssa, i32 %10 + br label %for.end40 + +for.end40: ; preds = %for.end40.loopexit, %for.end + %pSrcB.addr.2.lcssa = phi i16* [ %pSrcB.addr.1.lcssa, %for.end ], [ %scevgep93, %for.end40.loopexit ] + %pSrcA.addr.2.lcssa = phi i16* [ %pSrcA.addr.1.lcssa, %for.end ], [ %scevgep, %for.end40.loopexit ] + %pDest.addr.2.lcssa = phi i32* [ %pDest.addr.1.lcssa, %for.end ], [ %scevgep94, %for.end40.loopexit ] + %inc42 = add nuw i32 %i.092, 1 + %exitcond95 = icmp eq i32 %inc42, %blkCnt + br i1 %exitcond95, label %for.cond.cleanup, label %for.body +} diff --git a/llvm/test/CodeGen/Hexagon/prof-early-if.ll b/llvm/test/CodeGen/Hexagon/prof-early-if.ll index a5215a9b351f6..b0f21110b7dee 100644 --- a/llvm/test/CodeGen/Hexagon/prof-early-if.ll +++ b/llvm/test/CodeGen/Hexagon/prof-early-if.ll @@ -1,8 +1,8 @@ ; RUN: llc -O2 -march=hexagon < %s | FileCheck %s ; Rely on the comments generated by llc. Check that "if.then" was not predicated. +; CHECK: b5 ; CHECK: b2 ; CHECK-NOT: if{{.*}}memd -; CHECK: b5 %s.0 = type { [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [24 x i32], [3 x i32], [24 x i32], [8 x %s.1], [5 x i32] } %s.1 = type { i32, i32 } diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir index 8bc20c9b88c94..e39ff3b350c80 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -13,7 +13,6 @@ # FULL-NEXT: memoryBound: true # FULL-NEXT: waveLimiter: true # FULL-NEXT: scratchRSrcReg: '$sgpr8_sgpr9_sgpr10_sgpr11' -# FULL-NEXT: scratchWaveOffsetReg: '$sgpr12' # FULL-NEXT: frameOffsetReg: '$sgpr12' # FULL-NEXT: stackPtrOffsetReg: '$sgpr13' # FULL-NEXT: argumentInfo: @@ -40,7 +39,6 @@ # SIMPLE-NEXT: memoryBound: true # SIMPLE-NEXT: waveLimiter: true # SIMPLE-NEXT: scratchRSrcReg: '$sgpr8_sgpr9_sgpr10_sgpr11' -# SIMPLE-NEXT: scratchWaveOffsetReg: '$sgpr12' # SIMPLE-NEXT: frameOffsetReg: '$sgpr12' # SIMPLE-NEXT: stackPtrOffsetReg: '$sgpr13' # SIMPLE-NEXT: argumentInfo: @@ -60,7 +58,6 @@ machineFunctionInfo: memoryBound: true waveLimiter: true scratchRSrcReg: '$sgpr8_sgpr9_sgpr10_sgpr11' - scratchWaveOffsetReg: '$sgpr12' frameOffsetReg: '$sgpr12' stackPtrOffsetReg: '$sgpr13' argumentInfo: @@ -87,12 +84,10 @@ body: | # FULL-NEXT: memoryBound: false # FULL-NEXT: waveLimiter: false # FULL-NEXT: scratchRSrcReg: '$private_rsrc_reg' -# FULL-NEXT: scratchWaveOffsetReg: '$scratch_wave_offset_reg' # FULL-NEXT: frameOffsetReg: '$fp_reg' # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -107,7 +102,6 @@ body: | # SIMPLE-NEXT: maxKernArgAlign: 1 # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # SIMPLE-NEXT: body: name: no_mfi @@ -128,12 +122,10 @@ body: | # FULL-NEXT: memoryBound: false # FULL-NEXT: waveLimiter: false # FULL-NEXT: scratchRSrcReg: '$private_rsrc_reg' -# FULL-NEXT: scratchWaveOffsetReg: 
'$scratch_wave_offset_reg' # FULL-NEXT: frameOffsetReg: '$fp_reg' # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -148,7 +140,6 @@ body: | # SIMPLE-NEXT: maxKernArgAlign: 1 # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # SIMPLE-NEXT: body: name: empty_mfi @@ -170,12 +161,10 @@ body: | # FULL-NEXT: memoryBound: false # FULL-NEXT: waveLimiter: false # FULL-NEXT: scratchRSrcReg: '$private_rsrc_reg' -# FULL-NEXT: scratchWaveOffsetReg: '$scratch_wave_offset_reg' # FULL-NEXT: frameOffsetReg: '$fp_reg' # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # FULL-NEXT: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # FULL-NEXT: mode: # FULL-NEXT: ieee: true # FULL-NEXT: dx10-clamp: true @@ -191,7 +180,6 @@ body: | # SIMPLE-NEXT: isEntryFunction: true # SIMPLE-NEXT: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # SIMPLE-NEXT: body: name: empty_mfi_entry_func @@ -207,12 +195,10 @@ body: | # ALL-LABEL: name: default_regs_mfi # FULL: scratchRSrcReg: '$private_rsrc_reg' -# FULL-NEXT: scratchWaveOffsetReg: '$scratch_wave_offset_reg' # FULL-NEXT: frameOffsetReg: '$fp_reg' # FULL-NEXT: stackPtrOffsetReg: '$sp_reg' # SIMPLE-NOT: scratchRSrcReg -# SIMPLE-NOT: scratchWaveOffsetReg # SIMPLE-NOT:: stackPtrOffsetReg name: default_regs_mfi machineFunctionInfo: @@ -230,13 +216,11 @@ body: | # FULL: argumentInfo: # FULL-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } # FULL-NEXT: flatScratchInit: { offset: 4 } -# FULL-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # FULL-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } # SIMPLE: argumentInfo: # SIMPLE-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } # SIMPLE-NEXT: flatScratchInit: { offset: 4 } -# SIMPLE-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } # SIMPLE-NEXT: workItemIDY: { reg: '$vgpr0', mask: 65280 } name: fake_stack_arginfo machineFunctionInfo: diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll index 87629c3ae3db2..975a4ea19af4f 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info.ll @@ -16,9 +16,8 @@ ; CHECK-NEXT: memoryBound: false ; CHECK-NEXT: waveLimiter: false ; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' -; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr101' -; CHECK-NEXT: frameOffsetReg: '$sgpr101' -; CHECK-NEXT: stackPtrOffsetReg: '$sgpr101' +; CHECK-NEXT: frameOffsetReg: '$fp_reg' +; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' ; CHECK-NEXT: argumentInfo: ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' } @@ -50,9 +49,8 @@ define amdgpu_kernel void @kernel(i32 %arg0, i64 %arg1, <16 x i32> %arg2) { ; CHECK-NEXT: memoryBound: false ; CHECK-NEXT: waveLimiter: false ; CHECK-NEXT: scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' -; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr101' -; CHECK-NEXT: frameOffsetReg: '$sgpr101' -; CHECK-NEXT: 
stackPtrOffsetReg: '$sgpr101' +; CHECK-NEXT: frameOffsetReg: '$fp_reg' +; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' ; CHECK-NEXT: argumentInfo: ; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr3' } ; CHECK-NEXT: implicitBufferPtr: { reg: '$sgpr0_sgpr1' } @@ -79,12 +77,10 @@ define amdgpu_ps void @ps_shader(i32 %arg0, i32 inreg %arg1) { ; CHECK-NEXT: memoryBound: false ; CHECK-NEXT: waveLimiter: false ; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' -; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr33' -; CHECK-NEXT: frameOffsetReg: '$sgpr34' +; CHECK-NEXT: frameOffsetReg: '$sgpr33' ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' ; CHECK-NEXT: argumentInfo: ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } ; CHECK-NEXT: mode: ; CHECK-NEXT: ieee: true ; CHECK-NEXT: dx10-clamp: true @@ -108,12 +104,10 @@ define void @function() { ; CHECK-NEXT: memoryBound: false ; CHECK-NEXT: waveLimiter: false ; CHECK-NEXT: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' -; CHECK-NEXT: scratchWaveOffsetReg: '$sgpr33' -; CHECK-NEXT: frameOffsetReg: '$sgpr34' +; CHECK-NEXT: frameOffsetReg: '$sgpr33' ; CHECK-NEXT: stackPtrOffsetReg: '$sgpr32' ; CHECK-NEXT: argumentInfo: ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } -; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr33' } ; CHECK-NEXT: mode: ; CHECK-NEXT: ieee: true ; CHECK-NEXT: dx10-clamp: true diff --git a/llvm/test/CodeGen/MIR/AMDGPU/mfi-parse-error-scratch-wave-offset-reg.mir b/llvm/test/CodeGen/MIR/AMDGPU/mfi-parse-error-scratch-wave-offset-reg.mir deleted file mode 100644 index bbf58085cfa03..0000000000000 --- a/llvm/test/CodeGen/MIR/AMDGPU/mfi-parse-error-scratch-wave-offset-reg.mir +++ /dev/null @@ -1,12 +0,0 @@ -# RUN: not llc -march=amdgcn -run-pass none -o /dev/null %s 2>&1 | FileCheck %s -# CHECK: :7:27: expected a named register -# CHECK: scratchWaveOffsetReg: '' ---- -name: empty_scratch_wave_offset_reg -machineFunctionInfo: - scratchWaveOffsetReg: '' -body: | - bb.0: - - S_ENDPGM -... diff --git a/llvm/test/CodeGen/MIR/AMDGPU/mfi-scratch-wave-offset-reg-class.mir b/llvm/test/CodeGen/MIR/AMDGPU/mfi-scratch-wave-offset-reg-class.mir deleted file mode 100644 index 8e765ba01e32b..0000000000000 --- a/llvm/test/CodeGen/MIR/AMDGPU/mfi-scratch-wave-offset-reg-class.mir +++ /dev/null @@ -1,13 +0,0 @@ -# RUN: not llc -march=amdgcn -run-pass none -o /dev/null %s 2>&1 | FileCheck %s -# CHECK: :8:33: incorrect register class for field -# CHECK: scratchWaveOffsetReg: '$vgpr0' - ---- -name: wrong_reg_class_scratch_wave_offset_reg -machineFunctionInfo: - scratchWaveOffsetReg: '$vgpr0' -body: | - bb.0: - - S_ENDPGM -... 
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir b/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir index c48b13e46e207..8bed8fe6af167 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir @@ -10,7 +10,6 @@ # CHECK: machineFunctionInfo: # CHECK: isEntryFunction: true # CHECK: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' -# CHECK: scratchWaveOffsetReg: '$sgpr50' # CHECK: frameOffsetReg: '$sgpr50' # CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) name: reserve_correct_register @@ -18,7 +17,6 @@ tracksRegLiveness: true machineFunctionInfo: isEntryFunction: true scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - scratchWaveOffsetReg: '$sgpr50' frameOffsetReg: '$sgpr50' stack: - { id: 0, type: default, offset: 0, size: 4, alignment: 4 } diff --git a/llvm/test/CodeGen/MIR/Hexagon/bundled-call-site-info.mir b/llvm/test/CodeGen/MIR/Hexagon/bundled-call-site-info.mir index fec542223fc98..e4100543d3c71 100644 --- a/llvm/test/CodeGen/MIR/Hexagon/bundled-call-site-info.mir +++ b/llvm/test/CodeGen/MIR/Hexagon/bundled-call-site-info.mir @@ -1,3 +1,5 @@ +# We do not support the call site info for the target now, so we use the experimental option (-emit-call-site-info -debug-entry-values). + # RUN: llc -emit-call-site-info -debug-entry-values -run-pass=none -verify-machineinstrs -o - %s | FileCheck %s # Verify that it is possible to read and write MIR where a callSites entry diff --git a/llvm/test/CodeGen/MIR/X86/call-site-info-error4.mir b/llvm/test/CodeGen/MIR/X86/call-site-info-error4.mir index 2472aa707e169..d5bd82c710803 100644 --- a/llvm/test/CodeGen/MIR/X86/call-site-info-error4.mir +++ b/llvm/test/CodeGen/MIR/X86/call-site-info-error4.mir @@ -1,5 +1,5 @@ -# RUN: not llc -mtriple=x86_64-- -run-pass none %s -o - 2>&1 | FileCheck %s -# CHECK: Call site info provided but not used +# RUN: llc -emit-call-site-info -mtriple=x86_64-- -run-pass none %s -o - 2>&1 | FileCheck %s +# CHECK-NOT: Call site info provided but not used --- | define dso_local i32 @baa(i32 %a) local_unnamed_addr { entry: diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll index 0f90da66d60d9..55713dfc70e69 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval.ll @@ -1,27 +1,32 @@ -; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -verify-machineinstrs < %s | \ +; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \ +; RUN: -mattr=-altivec -verify-machineinstrs < %s | \ ; RUN: FileCheck --check-prefixes=CHECK,32BIT %s ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \ ; RUN: -mtriple powerpc-ibm-aix-xcoff < %s | \ -; RUN: FileCheck --check-prefixes=CHECKASM,ASM32PWR4 %s +; RUN: FileCheck --check-prefixes=CHECKASM,ASM32 %s -; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -verify-machineinstrs < %s | \ +; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \ +; RUN: -mattr=-altivec -verify-machineinstrs < %s | \ ; RUN: FileCheck --check-prefixes=CHECK,64BIT %s ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \ ; RUN: -mtriple powerpc64-ibm-aix-xcoff < %s | \ -; RUN: FileCheck --check-prefixes=CHECKASM,ASM64PWR4 %s +; RUN: FileCheck --check-prefixes=CHECKASM,ASM64 %s + +%struct.S0 = type {} %struct.S1 = type { [1 x i8] } @gS1 = external 
global %struct.S1, align 1 define void @call_test_byval_1Byte() { entry: - call void @test_byval_1Byte(%struct.S1* byval(%struct.S1) align 1 @gS1) + %s0 = alloca %struct.S0, align 8 + call void @test_byval_1Byte(%struct.S0* byval(%struct.S0) align 1 %s0, %struct.S1* byval(%struct.S1) align 1 @gS1) ret void } -declare void @test_byval_1Byte(%struct.S1* byval(%struct.S1) align 1) +declare void @test_byval_1Byte(%struct.S0* byval(%struct.S0) align 1, %struct.S1* byval(%struct.S1) align 1) ; CHECK-LABEL: name: call_test_byval_1Byte{{.*}} @@ -34,13 +39,13 @@ declare void @test_byval_1Byte(%struct.S1* byval(%struct.S1) align 1) ; CHECKASM-LABEL: .call_test_byval_1Byte: -; ASM32PWR4: stwu 1, -64(1) -; ASM32PWR4-NEXT: lwz [[REG:[0-9]+]], LC{{[0-9]+}}(2) -; ASM32PWR4-NEXT: lbz 3, 0([[REG]]) -; ASM32PWR4-NEXT: slwi 3, 3, 24 -; ASM32PWR4-NEXT: bl .test_byval_1Byte -; ASM32PWR4-NEXT: nop -; ASM32PWR4-NEXT: addi 1, 1, 64 +; ASM32: stwu 1, -64(1) +; ASM32-NEXT: lwz [[REG:[0-9]+]], LC{{[0-9]+}}(2) +; ASM32-NEXT: lbz 3, 0([[REG]]) +; ASM32-NEXT: slwi 3, 3, 24 +; ASM32-NEXT: bl .test_byval_1Byte +; ASM32-NEXT: nop +; ASM32-NEXT: addi 1, 1, 64 ; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 ; 64BIT-NEXT: renamable $x[[REG:[0-9]+]] = LDtoc @gS1, $x2 :: (load 8 from got) @@ -49,14 +54,14 @@ declare void @test_byval_1Byte(%struct.S1* byval(%struct.S1) align 1) ; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 -; ASM64PWR4: std 0, 16(1) -; ASM64PWR4-NEXT: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REG:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-NEXT: lbz 3, 0([[REG]]) -; ASM64PWR4-NEXT: sldi 3, 3, 56 -; ASM64PWR4-NEXT: bl .test_byval_1Byte -; ASM64PWR4-NEXT: nop -; ASM64PWR4-NEXT: addi 1, 1, 112 +; ASM64: std 0, 16(1) +; ASM64-NEXT: stdu 1, -128(1) +; ASM64-NEXT: ld [[REG:[0-9]+]], LC{{[0-9]+}}(2) +; ASM64-NEXT: lbz 3, 0([[REG]]) +; ASM64-NEXT: sldi 3, 3, 56 +; ASM64-NEXT: bl .test_byval_1Byte +; ASM64-NEXT: nop +; ASM64-NEXT: addi 1, 1, 128 %struct.S2 = type { [2 x i8] } @@ -81,13 +86,13 @@ declare void @test_byval_2Byte(%struct.S2* byval(%struct.S2) align 1) ; CHECKASM-LABEL: .call_test_byval_2Byte: -; ASM32PWR4: stwu 1, -64(1) -; ASM32PWR4-NEXT: lwz [[REG:[0-9]+]], LC{{[0-9]+}}(2) -; ASM32PWR4-NEXT: lhz 3, 0([[REG]]) -; ASM32PWR4-NEXT: slwi 3, 3, 16 -; ASM32PWR4-NEXT: bl .test_byval_2Byte -; ASM32PWR4-NEXT: nop -; ASM32PWR4-NEXT: addi 1, 1, 64 +; ASM32: stwu 1, -64(1) +; ASM32-NEXT: lwz [[REG:[0-9]+]], LC{{[0-9]+}}(2) +; ASM32-NEXT: lhz 3, 0([[REG]]) +; ASM32-NEXT: slwi 3, 3, 16 +; ASM32-NEXT: bl .test_byval_2Byte +; ASM32-NEXT: nop +; ASM32-NEXT: addi 1, 1, 64 ; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 ; 64BIT-NEXT: renamable $x[[REG:[0-9]+]] = LDtoc @gS2, $x2 :: (load 8 from got) @@ -96,16 +101,16 @@ declare void @test_byval_2Byte(%struct.S2* byval(%struct.S2) align 1) ; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 -; ASM64PWR4: std 0, 16(1) -; ASM64PWR4-NEXT: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REG:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-NEXT: lhz 3, 0([[REG]]) -; ASM64PWR4-NEXT: sldi 3, 3, 48 -; ASM64PWR4-NEXT: bl .test_byval_2Byte -; ASM64PWR4-NEXT: nop -; ASM64PWR4-NEXT: addi 1, 1, 112 +; ASM64: std 0, 16(1) +; ASM64-NEXT: stdu 1, -112(1) +; ASM64-NEXT: ld [[REG:[0-9]+]], 
LC{{[0-9]+}}(2) +; ASM64-NEXT: lhz 3, 0([[REG]]) +; ASM64-NEXT: sldi 3, 3, 48 +; ASM64-NEXT: bl .test_byval_2Byte +; ASM64-NEXT: nop +; ASM64-NEXT: addi 1, 1, 112 -%struct.S3 = type { [3 x i8] } +%struct.S3 = type <{ i8, i16 }> @gS3 = external global %struct.S3, align 1 @@ -132,14 +137,14 @@ declare void @test_byval_3Byte(%struct.S3* byval(%struct.S3) align 1) ; CHECKASM-LABEL: .call_test_byval_3Byte: ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM32PWR4: stwu 1, -64(1) -; ASM32PWR4-NEXT: lwz [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) -; ASM32PWR4-DAG: lhz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM32PWR4-DAG: lbz [[REG2:[0-9]+]], 2([[REGADDR]]) -; ASM32PWR4-DAG: rlwinm 3, [[REG2]], 8, 16, 23 -; ASM32PWR4-DAG: rlwimi 3, [[REG1]], 16, 0, 15 -; ASM32PWR4-NEXT: bl .test_byval_3Byte -; ASM32PWR4-NEXT: nop +; ASM32: stwu 1, -64(1) +; ASM32-NEXT: lwz [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) +; ASM32-DAG: lhz [[REG1:[0-9]+]], 0([[REGADDR]]) +; ASM32-DAG: lbz [[REG2:[0-9]+]], 2([[REGADDR]]) +; ASM32-DAG: rlwinm 3, [[REG2]], 8, 16, 23 +; ASM32-DAG: rlwimi 3, [[REG1]], 16, 0, 15 +; ASM32-NEXT: bl .test_byval_3Byte +; ASM32-NEXT: nop ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. ; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 @@ -152,55 +157,65 @@ declare void @test_byval_3Byte(%struct.S3* byval(%struct.S3) align 1) ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM64PWR4: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-DAG: lhz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM64PWR4-DAG: lbz [[REG2:[0-9]+]], 2([[REGADDR]]) -; ASM64PWR4-DAG: rldic 3, [[REG2]], 40, 16 -; ASM64PWR4-DAG: rldimi 3, [[REG1]], 48, 0 -; ASM64PWR4-NEXT: bl .test_byval_3Byte -; ASM64PWR4-NEXT: nop +; ASM64: stdu 1, -112(1) +; ASM64-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) +; ASM64-DAG: lhz [[REG1:[0-9]+]], 0([[REGADDR]]) +; ASM64-DAG: lbz [[REG2:[0-9]+]], 2([[REGADDR]]) +; ASM64-DAG: rldic 3, [[REG2]], 40, 16 +; ASM64-DAG: rldimi 3, [[REG1]], 48, 0 +; ASM64-NEXT: bl .test_byval_3Byte +; ASM64-NEXT: nop %struct.S4 = type { [4 x i8] } +%struct.S4A = type { i32 } @gS4 = external global %struct.S4, align 1 define void @call_test_byval_4Byte() { entry: - call void @test_byval_4Byte(%struct.S4* byval(%struct.S4) align 1 @gS4) + %s0 = alloca %struct.S0, align 8 + %s4a = alloca %struct.S4A, align 4 + call void @test_byval_4Byte(%struct.S4* byval(%struct.S4) align 1 @gS4, %struct.S0* byval(%struct.S0) align 1 %s0, %struct.S4A* byval(%struct.S4A) align 4 %s4a) ret void } -declare void @test_byval_4Byte(%struct.S4* byval(%struct.S4) align 1) +declare void @test_byval_4Byte(%struct.S4* byval(%struct.S4) align 1, %struct.S0* byval(%struct.S0) align 1, %struct.S4A* byval(%struct.S4A) align 4) ; CHECK-LABEL: name: call_test_byval_4Byte{{.*}} ; 32BIT: ADJCALLSTACKDOWN 56, 0, implicit-def dead $r1, implicit $r1 ; 32BIT-NEXT: renamable $r[[REG:[0-9]+]] = LWZtoc @gS4, $r2 :: (load 4 from got) -; 32BIT-NEXT: renamable $r3 = LWZ 0, killed renamable $r[[REG]] :: (load 4) -; 32BIT-NEXT: BL_NOP , csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r2, implicit-def $r1 +; 32BIT-DAG: renamable $r3 = LWZ 0, killed renamable $r[[REG]] :: (load 4) +; 32BIT-DAG: renamable $r4 = LWZ 0, %stack.1.s4a :: (load 4 from %stack.1.s4a, align 8) +; 32BIT-NEXT: BL_NOP , csr_aix32, 
implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r4, implicit $r2, implicit-def $r1 ; 32BIT-NEXT: ADJCALLSTACKUP 56, 0, implicit-def dead $r1, implicit $r1 ; CHECKASM-LABEL: .call_test_byval_4Byte: -; ASM32PWR4: stwu 1, -64(1) -; ASM32PWR4-NEXT: lwz [[REG:[0-9]+]], LC{{[0-9]+}}(2) -; ASM32PWR4-NEXT: lwz 3, 0([[REG]]) -; ASM32PWR4-NEXT: bl .test_byval_4Byte -; ASM32PWR4-NEXT: nop -; ASM32PWR4-NEXT: addi 1, 1, 64 +; ASM32: stwu 1, -80(1) +; ASM32-NEXT: lwz [[REG:[0-9]+]], LC{{[0-9]+}}(2) +; ASM32-DAG: lwz 3, 0([[REG]]) +; ASM32-DAG: lwz 4, 64(1) +; ASM32-NEXT: bl .test_byval_4Byte +; ASM32-NEXT: nop +; ASM32-NEXT: addi 1, 1, 80 ; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REG:[0-9]+]] = LDtoc @gS4, $x2 :: (load 8 from got) -; 64BIT-NEXT: renamable $x3 = LWZ8 0, killed renamable $x[[REG]] :: (load 4) -; 64BIT-NEXT: renamable $x3 = RLDICR killed renamable $x3, 32, 31 -; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 +; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS4, $x2 :: (load 8 from got) +; 64BIT-DAG: renamable $x[[LD1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load 4) +; 64BIT-DAG: renamable $x[[LD2:[0-9]+]] = LWZ8 0, %stack.1.s4a :: (load 4 from %stack.1.s4a, align 8) +; 64BIT-DAG: renamable $x3 = RLDICR killed renamable $x[[LD1]], 32, 31 +; 64BIT-DAG: renamable $x4 = RLDICR killed renamable $x[[LD2]], 32, 31 +; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x4, implicit $x2, implicit-def $r1 ; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 -; ASM64PWR4: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REG:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-NEXT: lwz 3, 0([[REG]]) -; ASM64PWR4-NEXT: sldi 3, 3, 32 -; ASM64PWR4-NEXT: bl .test_byval_4Byte -; ASM64PWR4-NEXT: nop -; ASM64PWR4-NEXT: addi 1, 1, 112 +; ASM64: stdu 1, -128(1) +; ASM64-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) +; ASM64-DAG: lwz [[LD1:[0-9]+]], 0([[REGADDR]]) +; ASM64-DAG: lwz [[LD2:[0-9]+]], 112(1) +; ASM64-DAG: sldi 3, [[LD1]], 32 +; ASM64-DAG: sldi 4, [[LD2]], 32 +; ASM64-NEXT: bl .test_byval_4Byte +; ASM64-NEXT: nop +; ASM64-NEXT: addi 1, 1, 128 + diff --git a/llvm/test/CodeGen/PowerPC/aix64-cc-byval.ll b/llvm/test/CodeGen/PowerPC/aix64-cc-byval.ll index 599ab13530b84..e8135af274b54 100644 --- a/llvm/test/CodeGen/PowerPC/aix64-cc-byval.ll +++ b/llvm/test/CodeGen/PowerPC/aix64-cc-byval.ll @@ -1,9 +1,10 @@ -; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -verify-machineinstrs < %s | \ -; RUN: FileCheck --check-prefixes=CHECK,64BIT %s +; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr4 \ +; RUN: -mattr=-altivec -verify-machineinstrs < %s | \ +; RUN: FileCheck %s ; RUN: llc -verify-machineinstrs -mcpu=pwr4 -mattr=-altivec \ ; RUN: -mtriple powerpc64-ibm-aix-xcoff < %s | \ -; RUN: FileCheck --check-prefixes=CHECKASM,ASM64PWR4 %s +; RUN: FileCheck --check-prefix=ASM %s %struct.S5 = type { [5 x i8] } @@ -19,27 +20,27 @@ declare void @test_byval_5Byte(%struct.S5* byval(%struct.S5) align 1) ; CHECK-LABEL: name: call_test_byval_5Byte{{.*}} -; CHECKASM-LABEL: .call_test_byval_5Byte: +; ASM-LABEL: .call_test_byval_5Byte: ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS5, $x2 :: (load 8 from got) -; 64BIT-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load 4) -; 64BIT-DAG: renamable $x[[REG2:[0-9]+]] = LBZ8 4, renamable $x[[REGADDR]] :: (load 1) -; 64BIT-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG2]], 24, 0, 7 -; 64BIT-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 -; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS5, $x2 :: (load 8 from got) +; CHECK-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load 4) +; CHECK-DAG: renamable $x[[REG2:[0-9]+]] = LBZ8 4, renamable $x[[REGADDR]] :: (load 1) +; CHECK-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG2]], 24, 0, 7 +; CHECK-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 +; CHECK-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 +; CHECK-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM64PWR4: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM64PWR4-DAG: lbz [[REG2:[0-9]+]], 4([[REGADDR]]) -; ASM64PWR4-DAG: rlwinm 3, [[REG2]], 24, 0, 7 -; ASM64PWR4-DAG: rldimi 3, [[REG1]], 32, 0 -; ASM64PWR4-NEXT: bl .test_byval_5Byte -; ASM64PWR4-NEXT: nop +; ASM: stdu 1, -112(1) +; ASM-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) +; ASM-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) +; ASM-DAG: lbz [[REG2:[0-9]+]], 4([[REGADDR]]) +; ASM-DAG: rlwinm 3, [[REG2]], 24, 0, 7 +; ASM-DAG: rldimi 3, [[REG1]], 32, 0 +; ASM-NEXT: bl .test_byval_5Byte +; ASM-NEXT: nop %struct.S6 = type { [6 x i8] } @@ -55,27 +56,27 @@ declare void @test_byval_6Byte(%struct.S6* byval(%struct.S6) align 1) ; CHECK-LABEL: name: call_test_byval_6Byte{{.*}} -; CHECKASM-LABEL: .call_test_byval_6Byte: +; ASM-LABEL: .call_test_byval_6Byte: ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS6, $x2 :: (load 8 from got) -; 64BIT-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load 4) -; 64BIT-DAG: renamable $x[[REG2:[0-9]+]] = LHZ8 4, renamable $x[[REGADDR]] :: (load 2) -; 64BIT-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG2]], 16, 0, 15 -; 64BIT-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 -; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS6, $x2 :: (load 8 from got) +; CHECK-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load 4) +; CHECK-DAG: renamable $x[[REG2:[0-9]+]] = LHZ8 4, renamable $x[[REGADDR]] :: (load 2) +; CHECK-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG2]], 16, 0, 15 +; CHECK-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 +; CHECK-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 +; CHECK-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. -; ASM64PWR4: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM64PWR4-DAG: lhz [[REG2:[0-9]+]], 4([[REGADDR]]) -; ASM64PWR4-DAG: rlwinm 3, [[REG2]], 16, 0, 15 -; ASM64PWR4-DAG: rldimi 3, [[REG1]], 32, 0 -; ASM64PWR4-NEXT: bl .test_byval_6Byte -; ASM64PWR4-NEXT: nop +; ASM: stdu 1, -112(1) +; ASM-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) +; ASM-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) +; ASM-DAG: lhz [[REG2:[0-9]+]], 4([[REGADDR]]) +; ASM-DAG: rlwinm 3, [[REG2]], 16, 0, 15 +; ASM-DAG: rldimi 3, [[REG1]], 32, 0 +; ASM-NEXT: bl .test_byval_6Byte +; ASM-NEXT: nop %struct.S7 = type { [7 x i8] } @@ -91,31 +92,31 @@ declare void @test_byval_7Byte(%struct.S7* byval(%struct.S7) align 1) ; CHECK-LABEL: name: call_test_byval_7Byte{{.*}} -; CHECKASM-LABEL: .call_test_byval_7Byte: +; ASM-LABEL: .call_test_byval_7Byte: ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS7, $x2 :: (load 8 from got) -; 64BIT-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load 4) -; 64BIT-DAG: renamable $x[[REG2:[0-9]+]] = LHZ8 4, renamable $x[[REGADDR]] :: (load 2) -; 64BIT-DAG: renamable $x[[REG3:[0-9]+]] = LBZ8 6, renamable $x[[REGADDR]] :: (load 1) -; 64BIT-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG3]], 8, 16, 23 -; 64BIT-DAG: renamable $x3 = RLWIMI8 killed renamable $x3, killed renamable $x[[REG2]], 16, 0, 15 -; 64BIT-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 -; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS7, $x2 :: (load 8 from got) +; CHECK-DAG: renamable $x[[REG1:[0-9]+]] = LWZ8 0, killed renamable $x[[REGADDR]] :: (load 4) +; CHECK-DAG: renamable $x[[REG2:[0-9]+]] = LHZ8 4, renamable $x[[REGADDR]] :: (load 2) +; CHECK-DAG: renamable $x[[REG3:[0-9]+]] = LBZ8 6, renamable $x[[REGADDR]] :: (load 1) +; CHECK-DAG: renamable $x3 = RLWINM8 killed renamable $x[[REG3]], 8, 16, 23 +; CHECK-DAG: renamable $x3 = RLWIMI8 killed renamable $x3, killed renamable $x[[REG2]], 16, 0, 15 +; CHECK-DAG: renamable $x3 = RLDIMI killed renamable $x3, killed renamable $x[[REG1]], 32, 0 +; CHECK-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 +; CHECK-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 ; The DAG block permits some invalid inputs for the benefit of allowing more valid orderings. 
-; ASM64PWR4: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) -; ASM64PWR4-DAG: lhz [[REG2:[0-9]+]], 4([[REGADDR]]) -; ASM64PWR4-DAG: lbz [[REG3:[0-9]+]], 6([[REGADDR]]) -; ASM64PWR4-DAG: rlwinm 3, [[REG3]], 8, 16, 23 -; ASM64PWR4-DAG: rlwimi 3, [[REG2]], 16, 0, 15 -; ASM64PWR4-DAG: rldimi 3, [[REG1]], 32, 0 -; ASM64PWR4-NEXT: bl .test_byval_7Byte -; ASM64PWR4-NEXT: nop +; ASM: stdu 1, -112(1) +; ASM-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) +; ASM-DAG: lwz [[REG1:[0-9]+]], 0([[REGADDR]]) +; ASM-DAG: lhz [[REG2:[0-9]+]], 4([[REGADDR]]) +; ASM-DAG: lbz [[REG3:[0-9]+]], 6([[REGADDR]]) +; ASM-DAG: rlwinm 3, [[REG3]], 8, 16, 23 +; ASM-DAG: rlwimi 3, [[REG2]], 16, 0, 15 +; ASM-DAG: rldimi 3, [[REG1]], 32, 0 +; ASM-NEXT: bl .test_byval_7Byte +; ASM-NEXT: nop %struct.S8 = type { [8 x i8] } @@ -131,16 +132,16 @@ declare void @test_byval_8Byte(%struct.S8* byval(%struct.S8) align 1) ; CHECK-LABEL: name: call_test_byval_8Byte{{.*}} -; CHECKASM-LABEL: .call_test_byval_8Byte: +; ASM-LABEL: .call_test_byval_8Byte: -; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 -; 64BIT-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS8, $x2 :: (load 8 from got) -; 64BIT-NEXT: renamable $x3 = LD 0, killed renamable $x[[REGADDR]] :: (load 8) -; 64BIT-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 -; 64BIT-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1 +; CHECK-NEXT: renamable $x[[REGADDR:[0-9]+]] = LDtoc @gS8, $x2 :: (load 8 from got) +; CHECK-NEXT: renamable $x3 = LD 0, killed renamable $x[[REGADDR]] :: (load 8) +; CHECK-NEXT: BL8_NOP , csr_aix64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x2, implicit-def $r1 +; CHECK-NEXT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1 -; ASM64PWR4: stdu 1, -112(1) -; ASM64PWR4-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) -; ASM64PWR4-NEXT: ld 3, 0([[REGADDR]]) -; ASM64PWR4-NEXT: bl .test_byval_8Byte -; ASM64PWR4-NEXT: nop +; ASM: stdu 1, -112(1) +; ASM-NEXT: ld [[REGADDR:[0-9]+]], LC{{[0-9]+}}(2) +; ASM-NEXT: ld 3, 0([[REGADDR]]) +; ASM-NEXT: bl .test_byval_8Byte +; ASM-NEXT: nop diff --git a/llvm/test/CodeGen/PowerPC/atomics-fences.ll b/llvm/test/CodeGen/PowerPC/atomics-fences.ll index 3fea72150000c..8fe366307fd1c 100644 --- a/llvm/test/CodeGen/PowerPC/atomics-fences.ll +++ b/llvm/test/CodeGen/PowerPC/atomics-fences.ll @@ -1,6 +1,7 @@ ; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -verify-machineinstrs | FileCheck %s ; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs | FileCheck %s ; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -mcpu=440 | FileCheck %s --check-prefix=PPC440 +; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu -mcpu=e500 | FileCheck %s --check-prefix=PPC440 ; Fences define void @fence_acquire() { diff --git a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll index 4e096b1c5c03b..469cef01094bf 100644 --- a/llvm/test/CodeGen/PowerPC/build-vector-tests.ll +++ b/llvm/test/CodeGen/PowerPC/build-vector-tests.ll @@ -6123,3 +6123,412 @@ entry: %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer ret <2 x i64> %splat.splat } + +; Some additional patterns that come up in real code. 
+define dso_local <2 x double> @sint_to_fp_widen02(<4 x i32> %a) { +; P9BE-LABEL: sint_to_fp_widen02: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: xvcvsxwdp v2, v2 +; P9BE-NEXT: blr +; +; P9LE-LABEL: sint_to_fp_widen02: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: xxsldwi vs0, v2, v2, 1 +; P9LE-NEXT: xvcvsxwdp v2, vs0 +; P9LE-NEXT: blr +; +; P8BE-LABEL: sint_to_fp_widen02: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: xvcvsxwdp v2, v2 +; P8BE-NEXT: blr +; +; P8LE-LABEL: sint_to_fp_widen02: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: xxsldwi vs0, v2, v2, 1 +; P8LE-NEXT: xvcvsxwdp v2, vs0 +; P8LE-NEXT: blr +entry: + %vecext = extractelement <4 x i32> %a, i32 0 + %conv = sitofp i32 %vecext to double + %vecinit = insertelement <2 x double> undef, double %conv, i32 0 + %vecext1 = extractelement <4 x i32> %a, i32 2 + %conv2 = sitofp i32 %vecext1 to double + %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1 + ret <2 x double> %vecinit3 +} + +define dso_local <2 x double> @sint_to_fp_widen13(<4 x i32> %a) { +; P9BE-LABEL: sint_to_fp_widen13: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: xxsldwi vs0, v2, v2, 3 +; P9BE-NEXT: xvcvsxwdp v2, vs0 +; P9BE-NEXT: blr +; +; P9LE-LABEL: sint_to_fp_widen13: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: xvcvsxwdp v2, v2 +; P9LE-NEXT: blr +; +; P8BE-LABEL: sint_to_fp_widen13: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: xxsldwi vs0, v2, v2, 3 +; P8BE-NEXT: xvcvsxwdp v2, vs0 +; P8BE-NEXT: blr +; +; P8LE-LABEL: sint_to_fp_widen13: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: xvcvsxwdp v2, v2 +; P8LE-NEXT: blr +entry: + %vecext = extractelement <4 x i32> %a, i32 1 + %conv = sitofp i32 %vecext to double + %vecinit = insertelement <2 x double> undef, double %conv, i32 0 + %vecext1 = extractelement <4 x i32> %a, i32 3 + %conv2 = sitofp i32 %vecext1 to double + %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1 + ret <2 x double> %vecinit3 +} + +define dso_local <2 x double> @uint_to_fp_widen02(<4 x i32> %a) { +; P9BE-LABEL: uint_to_fp_widen02: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: xvcvuxwdp v2, v2 +; P9BE-NEXT: blr +; +; P9LE-LABEL: uint_to_fp_widen02: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: xxsldwi vs0, v2, v2, 1 +; P9LE-NEXT: xvcvuxwdp v2, vs0 +; P9LE-NEXT: blr +; +; P8BE-LABEL: uint_to_fp_widen02: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: xvcvuxwdp v2, v2 +; P8BE-NEXT: blr +; +; P8LE-LABEL: uint_to_fp_widen02: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: xxsldwi vs0, v2, v2, 1 +; P8LE-NEXT: xvcvuxwdp v2, vs0 +; P8LE-NEXT: blr +entry: + %vecext = extractelement <4 x i32> %a, i32 0 + %conv = uitofp i32 %vecext to double + %vecinit = insertelement <2 x double> undef, double %conv, i32 0 + %vecext1 = extractelement <4 x i32> %a, i32 2 + %conv2 = uitofp i32 %vecext1 to double + %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1 + ret <2 x double> %vecinit3 +} + +define dso_local <2 x double> @uint_to_fp_widen13(<4 x i32> %a) { +; P9BE-LABEL: uint_to_fp_widen13: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: xxsldwi vs0, v2, v2, 3 +; P9BE-NEXT: xvcvuxwdp v2, vs0 +; P9BE-NEXT: blr +; +; P9LE-LABEL: uint_to_fp_widen13: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: xvcvuxwdp v2, v2 +; P9LE-NEXT: blr +; +; P8BE-LABEL: uint_to_fp_widen13: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: xxsldwi vs0, v2, v2, 3 +; P8BE-NEXT: xvcvuxwdp v2, vs0 +; P8BE-NEXT: blr +; +; P8LE-LABEL: uint_to_fp_widen13: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: xvcvuxwdp v2, v2 +; P8LE-NEXT: blr +entry: + %vecext = extractelement <4 x i32> %a, i32 1 + %conv = uitofp i32 %vecext to double + 
%vecinit = insertelement <2 x double> undef, double %conv, i32 0 + %vecext1 = extractelement <4 x i32> %a, i32 3 + %conv2 = uitofp i32 %vecext1 to double + %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1 + ret <2 x double> %vecinit3 +} + +define dso_local <2 x double> @fp_extend01(<4 x float> %a) { +; P9BE-LABEL: fp_extend01: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: xxmrghw vs0, v2, v2 +; P9BE-NEXT: xvcvspdp v2, vs0 +; P9BE-NEXT: blr +; +; P9LE-LABEL: fp_extend01: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: xxmrglw vs0, v2, v2 +; P9LE-NEXT: xvcvspdp v2, vs0 +; P9LE-NEXT: blr +; +; P8BE-LABEL: fp_extend01: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: xxmrghw vs0, v2, v2 +; P8BE-NEXT: xvcvspdp v2, vs0 +; P8BE-NEXT: blr +; +; P8LE-LABEL: fp_extend01: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: xxmrglw vs0, v2, v2 +; P8LE-NEXT: xvcvspdp v2, vs0 +; P8LE-NEXT: blr +entry: + %vecext = extractelement <4 x float> %a, i32 0 + %conv = fpext float %vecext to double + %vecinit = insertelement <2 x double> undef, double %conv, i32 0 + %vecext1 = extractelement <4 x float> %a, i32 1 + %conv2 = fpext float %vecext1 to double + %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1 + ret <2 x double> %vecinit3 +} + +define dso_local <2 x double> @fp_extend10(<4 x float> %a) { +; P9BE-LABEL: fp_extend10: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: xxmrghw vs0, v2, v2 +; P9BE-NEXT: xvcvspdp vs0, vs0 +; P9BE-NEXT: xxswapd v2, vs0 +; P9BE-NEXT: blr +; +; P9LE-LABEL: fp_extend10: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: xxmrglw vs0, v2, v2 +; P9LE-NEXT: xvcvspdp vs0, vs0 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: blr +; +; P8BE-LABEL: fp_extend10: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: xxmrghw vs0, v2, v2 +; P8BE-NEXT: xvcvspdp vs0, vs0 +; P8BE-NEXT: xxswapd v2, vs0 +; P8BE-NEXT: blr +; +; P8LE-LABEL: fp_extend10: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: xxmrglw vs0, v2, v2 +; P8LE-NEXT: xvcvspdp vs0, vs0 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: blr +entry: + %vecext = extractelement <4 x float> %a, i32 1 + %conv = fpext float %vecext to double + %vecinit = insertelement <2 x double> undef, double %conv, i32 0 + %vecext1 = extractelement <4 x float> %a, i32 0 + %conv2 = fpext float %vecext1 to double + %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1 + ret <2 x double> %vecinit3 +} + +define dso_local <2 x double> @fp_extend02(<4 x float> %a) { +; P9BE-LABEL: fp_extend02: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: xvcvspdp v2, v2 +; P9BE-NEXT: blr +; +; P9LE-LABEL: fp_extend02: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: xxsldwi vs0, v2, v2, 1 +; P9LE-NEXT: xvcvspdp v2, vs0 +; P9LE-NEXT: blr +; +; P8BE-LABEL: fp_extend02: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: xvcvspdp v2, v2 +; P8BE-NEXT: blr +; +; P8LE-LABEL: fp_extend02: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: xxsldwi vs0, v2, v2, 1 +; P8LE-NEXT: xvcvspdp v2, vs0 +; P8LE-NEXT: blr +entry: + %vecext = extractelement <4 x float> %a, i32 0 + %conv = fpext float %vecext to double + %vecinit = insertelement <2 x double> undef, double %conv, i32 0 + %vecext1 = extractelement <4 x float> %a, i32 2 + %conv2 = fpext float %vecext1 to double + %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1 + ret <2 x double> %vecinit3 +} + +define dso_local <2 x double> @fp_extend13(<4 x float> %a) { +; P9BE-LABEL: fp_extend13: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: xxsldwi vs0, v2, v2, 3 +; P9BE-NEXT: xvcvspdp v2, vs0 +; P9BE-NEXT: blr +; +; P9LE-LABEL: fp_extend13: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: 
xvcvspdp v2, v2 +; P9LE-NEXT: blr +; +; P8BE-LABEL: fp_extend13: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: xxsldwi vs0, v2, v2, 3 +; P8BE-NEXT: xvcvspdp v2, vs0 +; P8BE-NEXT: blr +; +; P8LE-LABEL: fp_extend13: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: xvcvspdp v2, v2 +; P8LE-NEXT: blr +entry: + %vecext = extractelement <4 x float> %a, i32 1 + %conv = fpext float %vecext to double + %vecinit = insertelement <2 x double> undef, double %conv, i32 0 + %vecext1 = extractelement <4 x float> %a, i32 3 + %conv2 = fpext float %vecext1 to double + %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1 + ret <2 x double> %vecinit3 +} + +define dso_local <2 x double> @fp_extend23(<4 x float> %a) { +; P9BE-LABEL: fp_extend23: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: xxmrglw vs0, v2, v2 +; P9BE-NEXT: xvcvspdp v2, vs0 +; P9BE-NEXT: blr +; +; P9LE-LABEL: fp_extend23: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: xxmrghw vs0, v2, v2 +; P9LE-NEXT: xvcvspdp v2, vs0 +; P9LE-NEXT: blr +; +; P8BE-LABEL: fp_extend23: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: xxmrglw vs0, v2, v2 +; P8BE-NEXT: xvcvspdp v2, vs0 +; P8BE-NEXT: blr +; +; P8LE-LABEL: fp_extend23: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: xxmrghw vs0, v2, v2 +; P8LE-NEXT: xvcvspdp v2, vs0 +; P8LE-NEXT: blr +entry: + %vecext = extractelement <4 x float> %a, i32 2 + %conv = fpext float %vecext to double + %vecinit = insertelement <2 x double> undef, double %conv, i32 0 + %vecext1 = extractelement <4 x float> %a, i32 3 + %conv2 = fpext float %vecext1 to double + %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1 + ret <2 x double> %vecinit3 +} + +define dso_local <2 x double> @fp_extend32(<4 x float> %a) { +; P9BE-LABEL: fp_extend32: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: xxmrglw vs0, v2, v2 +; P9BE-NEXT: xvcvspdp vs0, vs0 +; P9BE-NEXT: xxswapd v2, vs0 +; P9BE-NEXT: blr +; +; P9LE-LABEL: fp_extend32: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: xxmrghw vs0, v2, v2 +; P9LE-NEXT: xvcvspdp vs0, vs0 +; P9LE-NEXT: xxswapd v2, vs0 +; P9LE-NEXT: blr +; +; P8BE-LABEL: fp_extend32: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: xxmrglw vs0, v2, v2 +; P8BE-NEXT: xvcvspdp vs0, vs0 +; P8BE-NEXT: xxswapd v2, vs0 +; P8BE-NEXT: blr +; +; P8LE-LABEL: fp_extend32: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: xxmrghw vs0, v2, v2 +; P8LE-NEXT: xvcvspdp vs0, vs0 +; P8LE-NEXT: xxswapd v2, vs0 +; P8LE-NEXT: blr +entry: + %vecext = extractelement <4 x float> %a, i32 3 + %conv = fpext float %vecext to double + %vecinit = insertelement <2 x double> undef, double %conv, i32 0 + %vecext1 = extractelement <4 x float> %a, i32 2 + %conv2 = fpext float %vecext1 to double + %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1 + ret <2 x double> %vecinit3 +} + +define dso_local <2 x double> @fp_extend_two00(<4 x float> %a, <4 x float> %b) { +; P9BE-LABEL: fp_extend_two00: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: xxmrghd vs0, v2, v3 +; P9BE-NEXT: xvcvspdp v2, vs0 +; P9BE-NEXT: blr +; +; P9LE-LABEL: fp_extend_two00: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: xxmrgld vs0, v3, v2 +; P9LE-NEXT: xxsldwi vs0, vs0, vs0, 1 +; P9LE-NEXT: xvcvspdp v2, vs0 +; P9LE-NEXT: blr +; +; P8BE-LABEL: fp_extend_two00: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: xxmrghd vs0, v2, v3 +; P8BE-NEXT: xvcvspdp v2, vs0 +; P8BE-NEXT: blr +; +; P8LE-LABEL: fp_extend_two00: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: xxmrgld vs0, v3, v2 +; P8LE-NEXT: xxsldwi vs0, vs0, vs0, 1 +; P8LE-NEXT: xvcvspdp v2, vs0 +; P8LE-NEXT: blr +entry: + %vecext = extractelement <4 x float> %a, i32 0 + %conv = fpext 
float %vecext to double + %vecinit = insertelement <2 x double> undef, double %conv, i32 0 + %vecext1 = extractelement <4 x float> %b, i32 0 + %conv2 = fpext float %vecext1 to double + %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1 + ret <2 x double> %vecinit3 +} + +define dso_local <2 x double> @fp_extend_two33(<4 x float> %a, <4 x float> %b) { +; P9BE-LABEL: fp_extend_two33: +; P9BE: # %bb.0: # %entry +; P9BE-NEXT: xxmrgld vs0, v2, v3 +; P9BE-NEXT: xxsldwi vs0, vs0, vs0, 1 +; P9BE-NEXT: xvcvspdp v2, vs0 +; P9BE-NEXT: blr +; +; P9LE-LABEL: fp_extend_two33: +; P9LE: # %bb.0: # %entry +; P9LE-NEXT: xxmrghd vs0, v3, v2 +; P9LE-NEXT: xvcvspdp v2, vs0 +; P9LE-NEXT: blr +; +; P8BE-LABEL: fp_extend_two33: +; P8BE: # %bb.0: # %entry +; P8BE-NEXT: xxmrgld vs0, v2, v3 +; P8BE-NEXT: xxsldwi vs0, vs0, vs0, 1 +; P8BE-NEXT: xvcvspdp v2, vs0 +; P8BE-NEXT: blr +; +; P8LE-LABEL: fp_extend_two33: +; P8LE: # %bb.0: # %entry +; P8LE-NEXT: xxmrghd vs0, v3, v2 +; P8LE-NEXT: xvcvspdp v2, vs0 +; P8LE-NEXT: blr +entry: + %vecext = extractelement <4 x float> %a, i32 3 + %conv = fpext float %vecext to double + %vecinit = insertelement <2 x double> undef, double %conv, i32 0 + %vecext1 = extractelement <4 x float> %b, i32 3 + %conv2 = fpext float %vecext1 to double + %vecinit3 = insertelement <2 x double> %vecinit, double %conv2, i32 1 + ret <2 x double> %vecinit3 +} diff --git a/llvm/test/CodeGen/PowerPC/float-vector-gather.ll b/llvm/test/CodeGen/PowerPC/float-vector-gather.ll index b7bb622a1f907..672a8f3e82aa7 100644 --- a/llvm/test/CodeGen/PowerPC/float-vector-gather.ll +++ b/llvm/test/CodeGen/PowerPC/float-vector-gather.ll @@ -1,4 +1,4 @@ -; NOTE: This test ensures that for both Big and Little Endian cases a set of +; NOTE: This test ensures that, for both Big and Little Endian cases, a set of ; NOTE: 4 floats is gathered into a v4f32 register using xxmrghw and xxmrgld ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -ppc-vsr-nums-as-vr \ ; RUN: -ppc-asm-full-reg-names -mtriple=powerpc64le-unknown-linux-gnu < %s \ diff --git a/llvm/test/CodeGen/PowerPC/fma-assoc.ll b/llvm/test/CodeGen/PowerPC/fma-assoc.ll index 9bca280015d52..a899729188620 100644 --- a/llvm/test/CodeGen/PowerPC/fma-assoc.ll +++ b/llvm/test/CodeGen/PowerPC/fma-assoc.ll @@ -1,133 +1,110 @@ -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -fp-contract=fast -mattr=-vsx -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SAFE %s -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK-VSX -check-prefix=CHECK-VSX-SAFE %s -; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -fp-contract=fast -enable-unsafe-fp-math -mattr=-vsx -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK -check-prefix=CHECK-UNSAFE %s -; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -enable-unsafe-fp-math -mattr=+vsx -mcpu=pwr7 -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK-VSX -check-prefix=CHECK-UNSAFE-VSX %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -fp-contract=fast \ +; RUN: -mattr=-vsx -disable-ppc-vsx-fma-mutation=false | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu \ +; RUN: -fp-contract=fast -mattr=+vsx -disable-ppc-vsx-fma-mutation=false \ +; RUN: -mcpu=pwr7 | 
FileCheck -check-prefix=CHECK-VSX %s define double @test_FMADD_ASSOC1(double %A, double %B, double %C, +; CHECK-LABEL: test_FMADD_ASSOC1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmul 0, 3, 4 +; CHECK-NEXT: fmadd 0, 1, 2, 0 +; CHECK-NEXT: fadd 1, 0, 5 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMADD_ASSOC1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmuldp 0, 3, 4 +; CHECK-VSX-NEXT: xsmaddadp 0, 1, 2 +; CHECK-VSX-NEXT: xsadddp 1, 0, 5 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul double %A, %B ; [#uses=1] %G = fmul double %C, %D ; [#uses=1] %H = fadd double %F, %G ; [#uses=1] %I = fadd double %H, %E ; [#uses=1] ret double %I -; CHECK-SAFE-LABEL: test_FMADD_ASSOC1: -; CHECK-SAFE: fmul -; CHECK-SAFE-NEXT: fmadd -; CHECK-SAFE-NEXT: fadd -; CHECK-SAFE-NEXT: blr - -; CHECK-UNSAFE-LABEL: test_FMADD_ASSOC1: -; CHECK-UNSAFE: fmadd -; CHECK-UNSAFE-NEXT: fmadd -; CHECK-UNSAFE-NEXT: blr - -; CHECK-VSX-SAFE-LABEL: test_FMADD_ASSOC1: -; CHECK-VSX-SAFE: xsmuldp -; CHECK-VSX-SAFE-NEXT: xsmaddadp -; CHECK-VSX-SAFE-NEXT: xsadddp -; CHECK-VSX-SAFE-NEXT: blr - -; CHECK-VSX-UNSAFE-LABEL: test_FMADD_ASSOC1: -; CHECK-VSX-UNSAFE: xsmaddmdp -; CHECK-VSX-UNSAFE-NEXT: xsmaddadp -; CHECK-VSX-UNSAFE-NEXT: fmr -; CHECK-VSX-UNSAFE-NEXT: blr } define double @test_FMADD_ASSOC2(double %A, double %B, double %C, +; CHECK-LABEL: test_FMADD_ASSOC2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmul 0, 3, 4 +; CHECK-NEXT: fmadd 0, 1, 2, 0 +; CHECK-NEXT: fadd 1, 5, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMADD_ASSOC2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmuldp 0, 3, 4 +; CHECK-VSX-NEXT: xsmaddadp 0, 1, 2 +; CHECK-VSX-NEXT: xsadddp 1, 5, 0 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul double %A, %B ; [#uses=1] %G = fmul double %C, %D ; [#uses=1] %H = fadd double %F, %G ; [#uses=1] %I = fadd double %E, %H ; [#uses=1] ret double %I -; CHECK-SAFE-LABEL: test_FMADD_ASSOC2: -; CHECK-SAFE: fmul -; CHECK-SAFE-NEXT: fmadd -; CHECK-SAFE-NEXT: fadd -; CHECK-SAFE-NEXT: blr - -; CHECK-UNSAFE-LABEL: test_FMADD_ASSOC2: -; CHECK-UNSAFE: fmadd -; CHECK-UNSAFE-NEXT: fmadd -; CHECK-UNSAFE-NEXT: blr - -; CHECK-VSX-SAFE-LABEL: test_FMADD_ASSOC2: -; CHECK-VSX-SAFE: xsmuldp -; CHECK-VSX-SAFE-NEXT: xsmaddadp -; CHECK-VSX-SAFE-NEXT: xsadddp -; CHECK-VSX-SAFE-NEXT: blr - -; CHECK-VSX-UNSAFE-LABEL: test_FMADD_ASSOC2: -; CHECK-VSX-UNSAFE: xsmaddmdp -; CHECK-VSX-UNSAFE-NEXT: xsmaddadp -; CHECK-VSX-UNSAFE-NEXT: fmr -; CHECK-VSX-UNSAFE-NEXT: blr } define double @test_FMSUB_ASSOC1(double %A, double %B, double %C, +; CHECK-LABEL: test_FMSUB_ASSOC1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmul 0, 3, 4 +; CHECK-NEXT: fmadd 0, 1, 2, 0 +; CHECK-NEXT: fsub 1, 0, 5 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMSUB_ASSOC1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmuldp 0, 3, 4 +; CHECK-VSX-NEXT: xsmaddadp 0, 1, 2 +; CHECK-VSX-NEXT: xssubdp 1, 0, 5 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul double %A, %B ; [#uses=1] %G = fmul double %C, %D ; [#uses=1] %H = fadd double %F, %G ; [#uses=1] %I = fsub double %H, %E ; [#uses=1] ret double %I -; CHECK-SAFE-LABEL: test_FMSUB_ASSOC1: -; CHECK-SAFE: fmul -; CHECK-SAFE-NEXT: fmadd -; CHECK-SAFE-NEXT: fsub -; CHECK-SAFE-NEXT: blr - -; CHECK-UNSAFE-LABEL: test_FMSUB_ASSOC1: -; CHECK-UNSAFE: fmsub -; CHECK-UNSAFE-NEXT: fmadd -; CHECK-UNSAFE-NEXT: blr - -; CHECK-SAFE-VSX-LABEL: test_FMSUB_ASSOC1: -; CHECK-SAFE-VSX: xsmuldp -; CHECK-SAFE-VSX-NEXT: xsmaddadp -; CHECK-SAFE-VSX-NEXT: xssubdp -; CHECK-SAFE-VSX-NEXT: blr - -; CHECK-UNSAFE-VSX-LABEL: test_FMSUB_ASSOC1: -; CHECK-UNSAFE-VSX: xsmsubmdp -; 
CHECK-UNSAFE-VSX-NEXT: xsmaddadp -; CHECK-UNSAFE-VSX-NEXT: fmr -; CHECK-UNSAFE-VSX-NEXT: blr } define double @test_FMSUB_ASSOC2(double %A, double %B, double %C, +; CHECK-LABEL: test_FMSUB_ASSOC2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmul 0, 3, 4 +; CHECK-NEXT: fmadd 0, 1, 2, 0 +; CHECK-NEXT: fsub 1, 5, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMSUB_ASSOC2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmuldp 0, 3, 4 +; CHECK-VSX-NEXT: xsmaddadp 0, 1, 2 +; CHECK-VSX-NEXT: xssubdp 1, 5, 0 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul double %A, %B ; [#uses=1] %G = fmul double %C, %D ; [#uses=1] %H = fadd double %F, %G ; [#uses=1] %I = fsub double %E, %H ; [#uses=1] ret double %I -; CHECK-SAFE-LABEL: test_FMSUB_ASSOC2: -; CHECK-SAFE: fmul -; CHECK-SAFE-NEXT: fmadd -; CHECK-SAFE-NEXT: fsub -; CHECK-SAFE-NEXT: blr - -; CHECK-UNSAFE-LABEL: test_FMSUB_ASSOC2: -; CHECK-UNSAFE: fnmsub -; CHECK-UNSAFE-NEXT: fnmsub -; CHECK-UNSAFE-NEXT: blr - -; CHECK-SAFE-VSX-LABEL: test_FMSUB_ASSOC2: -; CHECK-SAFE-VSX: xsmuldp -; CHECK-SAFE-VSX-NEXT: xsmaddadp -; CHECK-SAFE-VSX-NEXT: xssubdp -; CHECK-SAFE-VSX-NEXT: blr - -; CHECK-UNSAFE-VSX-LABEL: test_FMSUB_ASSOC2: -; CHECK-UNSAFE-VSX: xsnmsubmdp -; CHECK-UNSAFE-VSX-NEXT: xsnmsubadp -; CHECK-UNSAFE-VSX-NEXT: fmr -; CHECK-UNSAFE-VSX-NEXT: blr } define double @test_FMADD_ASSOC_EXT1(float %A, float %B, double %C, +; CHECK-LABEL: test_FMADD_ASSOC_EXT1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 1, 2, 5 +; CHECK-NEXT: fmadd 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsmaddadp 1, 3, 4 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fpext float %F to double ; [#uses=1] @@ -135,18 +112,21 @@ define double @test_FMADD_ASSOC_EXT1(float %A, float %B, double %C, %I = fadd double %H, %G ; [#uses=1] %J = fadd double %I, %E ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMADD_ASSOC_EXT1: -; CHECK: fmadd -; CHECK-NEXT: fmadd -; CHECK-NEXT: blr - -; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT1: -; CHECK-VSX: xsmaddmdp -; CHECK-VSX-NEXT: xsmaddadp -; CHECK-VSX-NEXT: blr } define double @test_FMADD_ASSOC_EXT2(float %A, float %B, float %C, +; CHECK-LABEL: test_FMADD_ASSOC_EXT2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr float %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fmul float %C, %D ; [#uses=1] @@ -154,19 +134,20 @@ define double @test_FMADD_ASSOC_EXT2(float %A, float %B, float %C, %I = fpext float %H to double ; [#uses=1] %J = fadd double %I, %E ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMADD_ASSOC_EXT2: -; CHECK: fmadd -; CHECK-NEXT: fmadd -; CHECK-NEXT: blr - -; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT2: -; CHECK-VSX: xsmaddmdp -; CHECK-VSX-NEXT: xsmaddadp -; CHECK-VSX-NEXT: fmr -; CHECK-VSX-NEXT: blr } define double @test_FMADD_ASSOC_EXT3(float %A, float %B, double %C, +; CHECK-LABEL: test_FMADD_ASSOC_EXT3: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 1, 2, 5 +; CHECK-NEXT: fmadd 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT3: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsmaddadp 1, 3, 4 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fpext float %F to double 
; [#uses=1] @@ -174,18 +155,21 @@ define double @test_FMADD_ASSOC_EXT3(float %A, float %B, double %C, %I = fadd double %H, %G ; [#uses=1] %J = fadd double %E, %I ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMADD_ASSOC_EXT3: -; CHECK: fmadd -; CHECK-NEXT: fmadd -; CHECK-NEXT: blr - -; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT3: -; CHECK-VSX: xsmaddmdp -; CHECK-VSX-NEXT: xsmaddadp -; CHECK-VSX-NEXT: blr } define double @test_FMADD_ASSOC_EXT4(float %A, float %B, float %C, +; CHECK-LABEL: test_FMADD_ASSOC_EXT4: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT4: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr float %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fmul float %C, %D ; [#uses=1] @@ -193,19 +177,20 @@ define double @test_FMADD_ASSOC_EXT4(float %A, float %B, float %C, %I = fpext float %H to double ; [#uses=1] %J = fadd double %E, %I ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMADD_ASSOC_EXT4: -; CHECK: fmadd -; CHECK-NEXT: fmadd -; CHECK-NEXT: blr - -; CHECK-VSX-LABEL: test_FMADD_ASSOC_EXT4: -; CHECK-VSX: xsmaddmdp -; CHECK-VSX-NEXT: xsmaddadp -; CHECK-VSX-NEXT: fmr -; CHECK-VSX-NEXT: blr } define double @test_FMSUB_ASSOC_EXT1(float %A, float %B, double %C, +; CHECK-LABEL: test_FMSUB_ASSOC_EXT1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmsub 0, 1, 2, 5 +; CHECK-NEXT: fmadd 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmsubmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsmaddadp 1, 3, 4 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fpext float %F to double ; [#uses=1] @@ -213,18 +198,21 @@ define double @test_FMSUB_ASSOC_EXT1(float %A, float %B, double %C, %I = fadd double %H, %G ; [#uses=1] %J = fsub double %I, %E ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMSUB_ASSOC_EXT1: -; CHECK: fmsub -; CHECK-NEXT: fmadd -; CHECK-NEXT: blr - -; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT1: -; CHECK-VSX: xsmsubmdp -; CHECK-VSX-NEXT: xsmaddadp -; CHECK-VSX-NEXT: blr } define double @test_FMSUB_ASSOC_EXT2(float %A, float %B, float %C, +; CHECK-LABEL: test_FMSUB_ASSOC_EXT2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmsub 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmsubmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr float %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fmul float %C, %D ; [#uses=1] @@ -232,19 +220,20 @@ define double @test_FMSUB_ASSOC_EXT2(float %A, float %B, float %C, %I = fpext float %H to double ; [#uses=1] %J = fsub double %I, %E ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMSUB_ASSOC_EXT2: -; CHECK: fmsub -; CHECK-NEXT: fmadd -; CHECK-NEXT: blr - -; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT2: -; CHECK-VSX: xsmsubmdp -; CHECK-VSX-NEXT: xsmaddadp -; CHECK-VSX-NEXT: fmr -; CHECK-VSX-NEXT: blr } define double @test_FMSUB_ASSOC_EXT3(float %A, float %B, double %C, +; CHECK-LABEL: test_FMSUB_ASSOC_EXT3: +; CHECK: # %bb.0: +; CHECK-NEXT: fnmsub 0, 1, 2, 5 +; CHECK-NEXT: fnmsub 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT3: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsnmsubmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsnmsubadp 1, 3, 4 +; CHECK-VSX-NEXT: blr double %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = 
fpext float %F to double ; [#uses=1] @@ -252,18 +241,21 @@ define double @test_FMSUB_ASSOC_EXT3(float %A, float %B, double %C, %I = fadd double %H, %G ; [#uses=1] %J = fsub double %E, %I ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMSUB_ASSOC_EXT3: -; CHECK: fnmsub -; CHECK-NEXT: fnmsub -; CHECK-NEXT: blr - -; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT3: -; CHECK-VSX: xsnmsubmdp -; CHECK-VSX-NEXT: xsnmsubadp -; CHECK-VSX-NEXT: blr } define double @test_FMSUB_ASSOC_EXT4(float %A, float %B, float %C, +; CHECK-LABEL: test_FMSUB_ASSOC_EXT4: +; CHECK: # %bb.0: +; CHECK-NEXT: fnmsub 0, 3, 4, 5 +; CHECK-NEXT: fnmsub 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT4: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsnmsubmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsnmsubadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr float %D, double %E) { %F = fmul float %A, %B ; [#uses=1] %G = fmul float %C, %D ; [#uses=1] @@ -271,14 +263,282 @@ define double @test_FMSUB_ASSOC_EXT4(float %A, float %B, float %C, %I = fpext float %H to double ; [#uses=1] %J = fsub double %E, %I ; [#uses=1] ret double %J -; CHECK-LABEL: test_FMSUB_ASSOC_EXT4: -; CHECK: fnmsub -; CHECK-NEXT: fnmsub -; CHECK-NEXT: blr +} -; CHECK-VSX-LABEL: test_FMSUB_ASSOC_EXT4: -; CHECK-VSX: xsnmsubmdp -; CHECK-VSX-NEXT: xsnmsubadp -; CHECK-VSX-NEXT: fmr -; CHECK-VSX-NEXT: blr +define double @test_reassoc_FMADD_ASSOC1(double %A, double %B, double %C, +; CHECK-LABEL: test_reassoc_FMADD_ASSOC1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMADD_ASSOC1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc double %A, %B ; [#uses=1] + %G = fmul reassoc double %C, %D ; [#uses=1] + %H = fadd reassoc double %F, %G ; [#uses=1] + %I = fadd reassoc double %H, %E ; [#uses=1] + ret double %I +} + +define double @test_reassoc_FMADD_ASSOC2(double %A, double %B, double %C, +; CHECK-LABEL: test_reassoc_FMADD_ASSOC2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMADD_ASSOC2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc double %A, %B ; [#uses=1] + %G = fmul reassoc double %C, %D ; [#uses=1] + %H = fadd reassoc double %F, %G ; [#uses=1] + %I = fadd reassoc double %E, %H ; [#uses=1] + ret double %I +} + +define double @test_reassoc_FMSUB_ASSOC1(double %A, double %B, double %C, +; CHECK-LABEL: test_reassoc_FMSUB_ASSOC1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmsub 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmsubmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc double %A, %B ; [#uses=1] + %G = fmul reassoc double %C, %D ; [#uses=1] + %H = fadd reassoc double %F, %G ; [#uses=1] + %I = fsub reassoc double %H, %E ; [#uses=1] + ret double %I +} + +define double @test_reassoc_FMSUB_ASSOC2(double %A, double %B, double %C, +; CHECK-LABEL: test_reassoc_FMSUB_ASSOC2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmul 0, 3, 4 +; CHECK-NEXT: fmadd 0, 1, 2, 0 +; CHECK-NEXT: fsub 1, 5, 0 +; CHECK-NEXT: blr +; 
+; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmuldp 0, 3, 4 +; CHECK-VSX-NEXT: xsmaddadp 0, 1, 2 +; CHECK-VSX-NEXT: xssubdp 1, 5, 0 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc double %A, %B ; [#uses=1] + %G = fmul reassoc double %C, %D ; [#uses=1] + %H = fadd reassoc double %F, %G ; [#uses=1] + %I = fsub reassoc double %E, %H ; [#uses=1] + ret double %I +} + +define double @test_fast_FMSUB_ASSOC2(double %A, double %B, double %C, +; CHECK-LABEL: test_fast_FMSUB_ASSOC2: +; CHECK: # %bb.0: +; CHECK-NEXT: fnmsub 0, 3, 4, 5 +; CHECK-NEXT: fnmsub 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_fast_FMSUB_ASSOC2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsnmsubmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsnmsubadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul fast double %A, %B ; [#uses=1] + %G = fmul fast double %C, %D ; [#uses=1] + %H = fadd fast double %F, %G ; [#uses=1] + %I = fsub fast double %E, %H ; [#uses=1] + ret double %I +} + +define double @test_reassoc_FMADD_ASSOC_EXT1(float %A, float %B, double %C, +; CHECK-LABEL: test_reassoc_FMADD_ASSOC_EXT1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 1, 2, 5 +; CHECK-NEXT: fmadd 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMADD_ASSOC_EXT1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsmaddadp 1, 3, 4 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fpext float %F to double ; [#uses=1] + %H = fmul reassoc double %C, %D ; [#uses=1] + %I = fadd reassoc double %H, %G ; [#uses=1] + %J = fadd reassoc double %I, %E ; [#uses=1] + ret double %J +} + +define double @test_reassoc_FMADD_ASSOC_EXT2(float %A, float %B, float %C, +; CHECK-LABEL: test_reassoc_FMADD_ASSOC_EXT2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMADD_ASSOC_EXT2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + float %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fmul reassoc float %C, %D ; [#uses=1] + %H = fadd reassoc float %F, %G ; [#uses=1] + %I = fpext float %H to double ; [#uses=1] + %J = fadd reassoc double %I, %E ; [#uses=1] + ret double %J +} + +define double @test_reassoc_FMADD_ASSOC_EXT3(float %A, float %B, double %C, +; CHECK-LABEL: test_reassoc_FMADD_ASSOC_EXT3: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 1, 2, 5 +; CHECK-NEXT: fmadd 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMADD_ASSOC_EXT3: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsmaddadp 1, 3, 4 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fpext float %F to double ; [#uses=1] + %H = fmul reassoc double %C, %D ; [#uses=1] + %I = fadd reassoc double %H, %G ; [#uses=1] + %J = fadd reassoc double %E, %I ; [#uses=1] + ret double %J +} + +define double @test_reassoc_FMADD_ASSOC_EXT4(float %A, float %B, float %C, +; CHECK-LABEL: test_reassoc_FMADD_ASSOC_EXT4: +; CHECK: # %bb.0: +; CHECK-NEXT: fmadd 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMADD_ASSOC_EXT4: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmaddmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + float %D, double 
%E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fmul reassoc float %C, %D ; [#uses=1] + %H = fadd reassoc float %F, %G ; [#uses=1] + %I = fpext float %H to double ; [#uses=1] + %J = fadd reassoc double %E, %I ; [#uses=1] + ret double %J +} + +define double @test_reassoc_FMSUB_ASSOC_EXT1(float %A, float %B, double %C, +; CHECK-LABEL: test_reassoc_FMSUB_ASSOC_EXT1: +; CHECK: # %bb.0: +; CHECK-NEXT: fmsub 0, 1, 2, 5 +; CHECK-NEXT: fmadd 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC_EXT1: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmsubmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsmaddadp 1, 3, 4 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fpext float %F to double ; [#uses=1] + %H = fmul reassoc double %C, %D ; [#uses=1] + %I = fadd reassoc double %H, %G ; [#uses=1] + %J = fsub reassoc double %I, %E ; [#uses=1] + ret double %J +} + +define double @test_reassoc_FMSUB_ASSOC_EXT2(float %A, float %B, float %C, +; CHECK-LABEL: test_reassoc_FMSUB_ASSOC_EXT2: +; CHECK: # %bb.0: +; CHECK-NEXT: fmsub 0, 3, 4, 5 +; CHECK-NEXT: fmadd 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC_EXT2: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsmsubmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsmaddadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + float %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fmul reassoc float %C, %D ; [#uses=1] + %H = fadd reassoc float %F, %G ; [#uses=1] + %I = fpext float %H to double ; [#uses=1] + %J = fsub reassoc double %I, %E ; [#uses=1] + ret double %J +} + +define double @test_reassoc_FMSUB_ASSOC_EXT3(float %A, float %B, double %C, +; CHECK-LABEL: test_reassoc_FMSUB_ASSOC_EXT3: +; CHECK: # %bb.0: +; CHECK-NEXT: fnmsub 0, 1, 2, 5 +; CHECK-NEXT: fnmsub 1, 3, 4, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC_EXT3: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsnmsubmdp 1, 2, 5 +; CHECK-VSX-NEXT: xsnmsubadp 1, 3, 4 +; CHECK-VSX-NEXT: blr + double %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fpext float %F to double ; [#uses=1] + %H = fmul reassoc double %C, %D ; [#uses=1] + %I = fadd reassoc double %H, %G ; [#uses=1] + %J = fsub reassoc double %E, %I ; [#uses=1] + ret double %J +} + +define double @test_reassoc_FMSUB_ASSOC_EXT4(float %A, float %B, float %C, +; CHECK-LABEL: test_reassoc_FMSUB_ASSOC_EXT4: +; CHECK: # %bb.0: +; CHECK-NEXT: fnmsub 0, 3, 4, 5 +; CHECK-NEXT: fnmsub 1, 1, 2, 0 +; CHECK-NEXT: blr +; +; CHECK-VSX-LABEL: test_reassoc_FMSUB_ASSOC_EXT4: +; CHECK-VSX: # %bb.0: +; CHECK-VSX-NEXT: xsnmsubmdp 3, 4, 5 +; CHECK-VSX-NEXT: xsnmsubadp 3, 1, 2 +; CHECK-VSX-NEXT: fmr 1, 3 +; CHECK-VSX-NEXT: blr + float %D, double %E) { + %F = fmul reassoc float %A, %B ; [#uses=1] + %G = fmul reassoc float %C, %D ; [#uses=1] + %H = fadd reassoc float %F, %G ; [#uses=1] + %I = fpext float %H to double ; [#uses=1] + %J = fsub reassoc double %E, %I ; [#uses=1] + ret double %J } diff --git a/llvm/test/CodeGen/PowerPC/fma-negate.ll b/llvm/test/CodeGen/PowerPC/fma-negate.ll new file mode 100644 index 0000000000000..cb260532b494f --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/fma-negate.ll @@ -0,0 +1,314 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple powerpc64le -verify-machineinstrs \ +; RUN: | FileCheck -check-prefix=VSX %s +; RUN: llc < %s -mtriple powerpc64le -verify-machineinstrs -mattr=-vsx \ +; RUN: | FileCheck -check-prefix=NO-VSX %s + +define double 
@test_mul_sub_f64(double %a, double %b, double %c) { +; VSX-LABEL: test_mul_sub_f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubadp 1, 2, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_mul_sub_f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsub 1, 2, 3, 1 +; NO-VSX-NEXT: blr +entry: + %0 = fmul contract reassoc double %b, %c + %1 = fsub contract reassoc double %a, %0 + ret double %1 +} + +define double @test_2mul_sub_f64(double %a, double %b, double %c, double %d) { +; VSX-LABEL: test_2mul_sub_f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsmuldp 0, 3, 4 +; VSX-NEXT: xsmsubadp 0, 1, 2 +; VSX-NEXT: fmr 1, 0 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_2mul_sub_f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fmul 0, 3, 4 +; NO-VSX-NEXT: fmsub 1, 1, 2, 0 +; NO-VSX-NEXT: blr +entry: + %0 = fmul contract reassoc double %a, %b + %1 = fmul contract reassoc double %c, %d + %2 = fsub contract reassoc double %0, %1 + ret double %2 +} + +define double @test_neg_fma_f64(double %a, double %b, double %c) { +; VSX-LABEL: test_neg_fma_f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubadp 3, 1, 2 +; VSX-NEXT: fmr 1, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_neg_fma_f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsub 1, 1, 2, 3 +; NO-VSX-NEXT: blr +entry: + %0 = fsub contract reassoc double -0.0, %a + %1 = call contract reassoc double @llvm.fma.f64(double %0, double %b, + double %c) + ret double %1 +} + +define float @test_mul_sub_f32(float %a, float %b, float %c) { +; VSX-LABEL: test_mul_sub_f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubasp 1, 2, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_mul_sub_f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsubs 1, 2, 3, 1 +; NO-VSX-NEXT: blr +entry: + %0 = fmul contract reassoc float %b, %c + %1 = fsub contract reassoc float %a, %0 + ret float %1 +} + +define float @test_2mul_sub_f32(float %a, float %b, float %c, float %d) { +; VSX-LABEL: test_2mul_sub_f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsmulsp 0, 3, 4 +; VSX-NEXT: xsmsubasp 0, 1, 2 +; VSX-NEXT: fmr 1, 0 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_2mul_sub_f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fmuls 0, 3, 4 +; NO-VSX-NEXT: fmsubs 1, 1, 2, 0 +; NO-VSX-NEXT: blr +entry: + %0 = fmul contract reassoc float %a, %b + %1 = fmul contract reassoc float %c, %d + %2 = fsub contract reassoc float %0, %1 + ret float %2 +} + +define float @test_neg_fma_f32(float %a, float %b, float %c) { +; VSX-LABEL: test_neg_fma_f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubasp 3, 1, 2 +; VSX-NEXT: fmr 1, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_neg_fma_f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsubs 1, 1, 2, 3 +; NO-VSX-NEXT: blr +entry: + %0 = fsub contract reassoc float -0.0, %a + %1 = call contract reassoc float @llvm.fma.f32(float %0, float %b, float %c) + ret float %1 +} + +define <2 x double> @test_neg_fma_v2f64(<2 x double> %a, <2 x double> %b, +; VSX-LABEL: test_neg_fma_v2f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xvnmsubadp 36, 34, 35 +; VSX-NEXT: vmr 2, 4 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_neg_fma_v2f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsub 1, 1, 3, 5 +; NO-VSX-NEXT: fnmsub 2, 2, 4, 6 +; NO-VSX-NEXT: blr + <2 x double> %c) { +entry: + %0 = fsub contract reassoc <2 x double> , %a + %1 = call contract reassoc <2 x double> @llvm.fma.v2f64(<2 x double> %0, + <2 x double> %b, + <2 x double> %c) + ret <2 x double> %1 +} + +define <4 x float> @test_neg_fma_v4f32(<4 x float> %a, <4 x float> %b, +; VSX-LABEL: test_neg_fma_v4f32: +; VSX: # %bb.0: # %entry +; 
VSX-NEXT: xvnmsubasp 36, 34, 35 +; VSX-NEXT: vmr 2, 4 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_neg_fma_v4f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: vspltisb 5, -1 +; NO-VSX-NEXT: vslw 5, 5, 5 +; NO-VSX-NEXT: vsubfp 2, 5, 2 +; NO-VSX-NEXT: vmaddfp 2, 2, 3, 4 +; NO-VSX-NEXT: blr + <4 x float> %c) { +entry: + %0 = fsub contract reassoc <4 x float> , %a + %1 = call contract reassoc <4 x float> @llvm.fma.v4f32(<4 x float> %0, + <4 x float> %b, + <4 x float> %c) + ret <4 x float> %1 +} + +define double @test_fast_mul_sub_f64(double %a, double %b, double %c) { +; VSX-LABEL: test_fast_mul_sub_f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubadp 1, 2, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_mul_sub_f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsub 1, 2, 3, 1 +; NO-VSX-NEXT: blr +entry: + %0 = fmul fast double %b, %c + %1 = fsub fast double %a, %0 + ret double %1 +} + +define double @test_fast_2mul_sub_f64(double %a, double %b, double %c, +; VSX-LABEL: test_fast_2mul_sub_f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsmuldp 0, 3, 4 +; VSX-NEXT: xsmsubadp 0, 1, 2 +; VSX-NEXT: fmr 1, 0 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_2mul_sub_f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fmul 0, 3, 4 +; NO-VSX-NEXT: fmsub 1, 1, 2, 0 +; NO-VSX-NEXT: blr + double %d) { +entry: + %0 = fmul fast double %a, %b + %1 = fmul fast double %c, %d + %2 = fsub fast double %0, %1 + ret double %2 +} + +define double @test_fast_neg_fma_f64(double %a, double %b, double %c) { +; VSX-LABEL: test_fast_neg_fma_f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubadp 3, 1, 2 +; VSX-NEXT: fmr 1, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_neg_fma_f64: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsub 1, 1, 2, 3 +; NO-VSX-NEXT: blr +entry: + %0 = fsub fast double -0.0, %a + %1 = call fast double @llvm.fma.f64(double %0, double %b, double %c) + ret double %1 +} + +define float @test_fast_mul_sub_f32(float %a, float %b, float %c) { +; VSX-LABEL: test_fast_mul_sub_f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubasp 1, 2, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_mul_sub_f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsubs 1, 2, 3, 1 +; NO-VSX-NEXT: blr +entry: + %0 = fmul fast float %b, %c + %1 = fsub fast float %a, %0 + ret float %1 +} + +define float @test_fast_2mul_sub_f32(float %a, float %b, float %c, float %d) { +; VSX-LABEL: test_fast_2mul_sub_f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsmulsp 0, 3, 4 +; VSX-NEXT: xsmsubasp 0, 1, 2 +; VSX-NEXT: fmr 1, 0 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_2mul_sub_f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fmuls 0, 3, 4 +; NO-VSX-NEXT: fmsubs 1, 1, 2, 0 +; NO-VSX-NEXT: blr +entry: + %0 = fmul fast float %a, %b + %1 = fmul fast float %c, %d + %2 = fsub fast float %0, %1 + ret float %2 +} + +define float @test_fast_neg_fma_f32(float %a, float %b, float %c) { +; VSX-LABEL: test_fast_neg_fma_f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xsnmsubasp 3, 1, 2 +; VSX-NEXT: fmr 1, 3 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_neg_fma_f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: fnmsubs 1, 1, 2, 3 +; NO-VSX-NEXT: blr +entry: + %0 = fsub fast float -0.0, %a + %1 = call fast float @llvm.fma.f32(float %0, float %b, float %c) + ret float %1 +} + +define <2 x double> @test_fast_neg_fma_v2f64(<2 x double> %a, <2 x double> %b, +; VSX-LABEL: test_fast_neg_fma_v2f64: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xvnmsubadp 36, 34, 35 +; VSX-NEXT: vmr 2, 4 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_neg_fma_v2f64: +; NO-VSX: # %bb.0: # %entry 
+; NO-VSX-NEXT: fnmsub 1, 1, 3, 5 +; NO-VSX-NEXT: fnmsub 2, 2, 4, 6 +; NO-VSX-NEXT: blr + <2 x double> %c) { +entry: + %0 = fsub fast <2 x double> , %a + %1 = call fast <2 x double> @llvm.fma.v2f64(<2 x double> %0, <2 x double> %b, + <2 x double> %c) + ret <2 x double> %1 +} + +define <4 x float> @test_fast_neg_fma_v4f32(<4 x float> %a, <4 x float> %b, +; VSX-LABEL: test_fast_neg_fma_v4f32: +; VSX: # %bb.0: # %entry +; VSX-NEXT: xvnmsubasp 36, 34, 35 +; VSX-NEXT: vmr 2, 4 +; VSX-NEXT: blr +; +; NO-VSX-LABEL: test_fast_neg_fma_v4f32: +; NO-VSX: # %bb.0: # %entry +; NO-VSX-NEXT: vspltisb 5, -1 +; NO-VSX-NEXT: vslw 5, 5, 5 +; NO-VSX-NEXT: vsubfp 2, 5, 2 +; NO-VSX-NEXT: vmaddfp 2, 2, 3, 4 +; NO-VSX-NEXT: blr + <4 x float> %c) { +entry: + %0 = fsub fast <4 x float> , %a + %1 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %b, + <4 x float> %c) + ret <4 x float> %1 +} + +declare float @llvm.fma.f32(float %a, float %b, float %c) +declare double @llvm.fma.f64(double %a, double %b, double %c) +declare <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, + <4 x float> %c) +declare <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, + <2 x double> %c) diff --git a/llvm/test/CodeGen/PowerPC/pr45297.ll b/llvm/test/CodeGen/PowerPC/pr45297.ll new file mode 100644 index 0000000000000..71c19744fb706 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/pr45297.ll @@ -0,0 +1,10 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \ +; RUN: -mattr=+altivec -mattr=-power8-vector -mattr=-vsx < %s | FileCheck %s +; XFAIL: * + +define dso_local void @test(float %0) local_unnamed_addr { +entry: + %1 = fptosi float %0 to i32 + store i32 %1, i32* undef, align 4 + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/reduce_scalarization02.ll b/llvm/test/CodeGen/PowerPC/reduce_scalarization02.ll index f7727d6f4ea10..1dc40edf71464 100644 --- a/llvm/test/CodeGen/PowerPC/reduce_scalarization02.ll +++ b/llvm/test/CodeGen/PowerPC/reduce_scalarization02.ll @@ -47,33 +47,23 @@ define dso_local void @test2(<16 x float>* nocapture readonly %a, <2 x double>* ; CHECK-LABEL: test2: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lxv vs0, 0(r3) -; CHECK-NEXT: xxsldwi vs1, vs0, vs0, 1 -; CHECK-NEXT: xscvspdpn f2, vs0 -; CHECK-NEXT: xxsldwi vs3, vs0, vs0, 3 -; CHECK-NEXT: xxswapd vs0, vs0 -; CHECK-NEXT: xscvspdpn f1, vs1 -; CHECK-NEXT: xscvspdpn f3, vs3 -; CHECK-NEXT: xscvspdpn f0, vs0 -; CHECK-NEXT: xxmrghd vs0, vs0, vs3 -; CHECK-NEXT: xxmrghd vs1, vs2, vs1 -; CHECK-NEXT: stxv vs0, 0(r4) -; CHECK-NEXT: stxv vs1, 0(r5) +; CHECK-NEXT: xxmrglw vs1, vs0, vs0 +; CHECK-NEXT: xxmrghw vs0, vs0, vs0 +; CHECK-NEXT: xvcvspdp vs1, vs1 +; CHECK-NEXT: xvcvspdp vs0, vs0 +; CHECK-NEXT: stxv vs1, 0(r4) +; CHECK-NEXT: stxv vs0, 0(r5) ; CHECK-NEXT: blr ; ; CHECK-BE-LABEL: test2: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: lxv vs0, 0(r3) -; CHECK-BE-NEXT: xxswapd vs1, vs0 -; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 3 -; CHECK-BE-NEXT: xscvspdpn f3, vs0 -; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xscvspdpn f0, vs0 -; CHECK-BE-NEXT: xxmrghd vs0, vs3, vs0 -; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs2 -; CHECK-BE-NEXT: stxv vs0, 0(r4) -; CHECK-BE-NEXT: stxv vs1, 0(r5) +; CHECK-BE-NEXT: xxmrghw vs1, vs0, vs0 +; CHECK-BE-NEXT: xxmrglw vs0, vs0, vs0 +; CHECK-BE-NEXT: xvcvspdp vs1, vs1 +; CHECK-BE-NEXT: xvcvspdp vs0, vs0 +; CHECK-BE-NEXT: stxv vs1, 0(r4) +; CHECK-BE-NEXT: stxv vs0, 0(r5) ; CHECK-BE-NEXT: blr entry: %0 = load <16 x 
float>, <16 x float>* %a, align 16 diff --git a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll index cf4a6d6362079..d355dcd08b0f4 100644 --- a/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll +++ b/llvm/test/CodeGen/PowerPC/vec_conv_fp32_to_i64_elts.ll @@ -14,10 +14,8 @@ define <2 x i64> @test2elt(i64 %a.coerce) local_unnamed_addr #0 { ; CHECK-P8: # %bb.0: # %entry ; CHECK-P8-NEXT: mtvsrd f0, r3 ; CHECK-P8-NEXT: xxswapd v2, vs0 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: xxsldwi vs1, v2, v2, 3 -; CHECK-P8-NEXT: xscvspdpn f1, vs1 -; CHECK-P8-NEXT: xxmrghd vs0, vs0, vs1 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v2 +; CHECK-P8-NEXT: xvcvspdp vs0, vs0 ; CHECK-P8-NEXT: xvcvdpuxds v2, vs0 ; CHECK-P8-NEXT: blr ; @@ -25,20 +23,16 @@ define <2 x i64> @test2elt(i64 %a.coerce) local_unnamed_addr #0 { ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v2, vs0 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xxsldwi vs1, v2, v2, 3 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xxmrghd vs0, vs0, vs1 +; CHECK-P9-NEXT: xxmrglw vs0, v2, v2 +; CHECK-P9-NEXT: xvcvspdp vs0, vs0 ; CHECK-P9-NEXT: xvcvdpuxds v2, vs0 ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test2elt: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd f0, r3 -; CHECK-BE-NEXT: xscvspdpn f1, vs0 -; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 -; CHECK-BE-NEXT: xscvspdpn f0, vs0 -; CHECK-BE-NEXT: xxmrghd vs0, vs1, vs0 +; CHECK-BE-NEXT: xxmrghw vs0, vs0, vs0 +; CHECK-BE-NEXT: xvcvspdp vs0, vs0 ; CHECK-BE-NEXT: xvcvdpuxds v2, vs0 ; CHECK-BE-NEXT: blr entry: @@ -50,16 +44,11 @@ entry: define void @test4elt(<4 x i64>* noalias nocapture sret %agg.result, <4 x float> %a) local_unnamed_addr #1 { ; CHECK-P8-LABEL: test4elt: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: xxsldwi vs0, v2, v2, 3 -; CHECK-P8-NEXT: xxswapd vs1, v2 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v2 +; CHECK-P8-NEXT: xxmrghw vs1, v2, v2 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 -; CHECK-P8-NEXT: xscvspdpn f2, v2 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: xscvspdpn f1, vs1 -; CHECK-P8-NEXT: xscvspdpn f3, vs3 -; CHECK-P8-NEXT: xxmrghd vs0, vs1, vs0 -; CHECK-P8-NEXT: xxmrghd vs1, vs2, vs3 +; CHECK-P8-NEXT: xvcvspdp vs0, vs0 +; CHECK-P8-NEXT: xvcvspdp vs1, vs1 ; CHECK-P8-NEXT: xvcvdpuxds v2, vs0 ; CHECK-P8-NEXT: xvcvdpuxds v3, vs1 ; CHECK-P8-NEXT: xxswapd vs1, v2 @@ -70,36 +59,26 @@ define void @test4elt(<4 x i64>* noalias nocapture sret %agg.result, <4 x float> ; ; CHECK-P9-LABEL: test4elt: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: xxsldwi vs0, v2, v2, 3 -; CHECK-P9-NEXT: xxswapd vs1, v2 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xxsldwi vs2, v2, v2, 1 -; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xxmrghd vs0, vs1, vs0 -; CHECK-P9-NEXT: xscvspdpn f1, v2 -; CHECK-P9-NEXT: xxmrghd vs1, vs1, vs2 +; CHECK-P9-NEXT: xxmrglw vs0, v2, v2 +; CHECK-P9-NEXT: xxmrghw vs1, v2, v2 +; CHECK-P9-NEXT: xvcvspdp vs0, vs0 +; CHECK-P9-NEXT: xvcvspdp vs1, vs1 ; CHECK-P9-NEXT: xvcvdpuxds vs0, vs0 ; CHECK-P9-NEXT: xvcvdpuxds vs1, vs1 -; CHECK-P9-NEXT: stxv vs0, 0(r3) ; CHECK-P9-NEXT: stxv vs1, 16(r3) +; CHECK-P9-NEXT: stxv vs0, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test4elt: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxsldwi vs1, v2, v2, 1 -; CHECK-BE-NEXT: xscvspdpn f0, v2 -; CHECK-BE-NEXT: xxswapd vs2, v2 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs1 -; 
CHECK-BE-NEXT: xxsldwi vs1, v2, v2, 3 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: xxmrghw vs0, v2, v2 +; CHECK-BE-NEXT: xxmrglw vs1, v2, v2 +; CHECK-BE-NEXT: xvcvspdp vs0, vs0 +; CHECK-BE-NEXT: xvcvspdp vs1, vs1 ; CHECK-BE-NEXT: xvcvdpuxds vs0, vs0 -; CHECK-BE-NEXT: xxmrghd vs1, vs2, vs1 ; CHECK-BE-NEXT: xvcvdpuxds vs1, vs1 -; CHECK-BE-NEXT: stxv vs0, 0(r3) ; CHECK-BE-NEXT: stxv vs1, 16(r3) +; CHECK-BE-NEXT: stxv vs0, 0(r3) ; CHECK-BE-NEXT: blr entry: %0 = fptoui <4 x float> %a to <4 x i64> @@ -115,31 +94,21 @@ define void @test8elt(<8 x i64>* noalias nocapture sret %agg.result, <8 x float> ; CHECK-P8-NEXT: li r6, 32 ; CHECK-P8-NEXT: lvx v2, r4, r5 ; CHECK-P8-NEXT: li r4, 48 -; CHECK-P8-NEXT: xxsldwi vs5, v3, v3, 3 -; CHECK-P8-NEXT: xxswapd vs6, v3 -; CHECK-P8-NEXT: xxsldwi vs0, v2, v2, 3 -; CHECK-P8-NEXT: xxswapd vs1, v2 -; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 -; CHECK-P8-NEXT: xxsldwi vs7, v3, v3, 1 -; CHECK-P8-NEXT: xscvspdpn f2, v2 -; CHECK-P8-NEXT: xscvspdpn f4, v3 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: xscvspdpn f1, vs1 -; CHECK-P8-NEXT: xscvspdpn f3, vs3 -; CHECK-P8-NEXT: xscvspdpn f5, vs5 -; CHECK-P8-NEXT: xscvspdpn f6, vs6 -; CHECK-P8-NEXT: xscvspdpn f7, vs7 -; CHECK-P8-NEXT: xxmrghd vs0, vs1, vs0 -; CHECK-P8-NEXT: xxmrghd vs1, vs2, vs3 -; CHECK-P8-NEXT: xxmrghd vs2, vs6, vs5 +; CHECK-P8-NEXT: xxmrglw vs2, v3, v3 +; CHECK-P8-NEXT: xxmrghw vs3, v3, v3 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v2 +; CHECK-P8-NEXT: xxmrghw vs1, v2, v2 +; CHECK-P8-NEXT: xvcvspdp vs2, vs2 +; CHECK-P8-NEXT: xvcvspdp vs0, vs0 +; CHECK-P8-NEXT: xvcvspdp vs1, vs1 +; CHECK-P8-NEXT: xvcvspdp vs3, vs3 +; CHECK-P8-NEXT: xvcvdpuxds v4, vs2 ; CHECK-P8-NEXT: xvcvdpuxds v2, vs0 -; CHECK-P8-NEXT: xxmrghd vs3, vs4, vs7 ; CHECK-P8-NEXT: xvcvdpuxds v3, vs1 -; CHECK-P8-NEXT: xvcvdpuxds v4, vs2 ; CHECK-P8-NEXT: xvcvdpuxds v5, vs3 +; CHECK-P8-NEXT: xxswapd vs3, v4 ; CHECK-P8-NEXT: xxswapd vs1, v2 ; CHECK-P8-NEXT: xxswapd vs0, v3 -; CHECK-P8-NEXT: xxswapd vs3, v4 ; CHECK-P8-NEXT: xxswapd vs2, v5 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r4 ; CHECK-P8-NEXT: stxvd2x vs1, r3, r6 @@ -149,65 +118,45 @@ define void @test8elt(<8 x i64>* noalias nocapture sret %agg.result, <8 x float> ; ; CHECK-P9-LABEL: test8elt: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: lxv vs0, 0(r4) -; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 -; CHECK-P9-NEXT: xxswapd vs2, vs0 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xscvspdpn f3, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xxmrghd vs1, vs2, vs1 -; CHECK-P9-NEXT: lxv vs2, 16(r4) -; CHECK-P9-NEXT: xxmrghd vs0, vs3, vs0 +; CHECK-P9-NEXT: lxv vs0, 16(r4) +; CHECK-P9-NEXT: lxv vs1, 0(r4) +; CHECK-P9-NEXT: xxmrglw vs2, vs1, vs1 +; CHECK-P9-NEXT: xxmrghw vs1, vs1, vs1 +; CHECK-P9-NEXT: xxmrglw vs3, vs0, vs0 +; CHECK-P9-NEXT: xxmrghw vs0, vs0, vs0 +; CHECK-P9-NEXT: xvcvspdp vs2, vs2 +; CHECK-P9-NEXT: xvcvspdp vs1, vs1 +; CHECK-P9-NEXT: xvcvspdp vs3, vs3 +; CHECK-P9-NEXT: xvcvspdp vs0, vs0 +; CHECK-P9-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-P9-NEXT: xvcvdpuxds vs1, vs1 -; CHECK-P9-NEXT: xvcvdpuxds vs0, vs0 -; CHECK-P9-NEXT: xxsldwi vs3, vs2, vs2, 3 -; CHECK-P9-NEXT: xxswapd vs4, vs2 -; CHECK-P9-NEXT: xscvspdpn f3, vs3 -; CHECK-P9-NEXT: xscvspdpn f4, vs4 -; CHECK-P9-NEXT: stxv vs0, 16(r3) -; CHECK-P9-NEXT: xxmrghd vs3, vs4, vs3 -; CHECK-P9-NEXT: xscvspdpn f4, vs2 -; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 -; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xvcvdpuxds 
vs3, vs3 -; CHECK-P9-NEXT: xxmrghd vs2, vs4, vs2 -; CHECK-P9-NEXT: xvcvdpuxds vs2, vs2 +; CHECK-P9-NEXT: xvcvdpuxds vs0, vs0 +; CHECK-P9-NEXT: stxv vs0, 48(r3) ; CHECK-P9-NEXT: stxv vs3, 32(r3) -; CHECK-P9-NEXT: stxv vs2, 48(r3) -; CHECK-P9-NEXT: stxv vs1, 0(r3) +; CHECK-P9-NEXT: stxv vs1, 16(r3) +; CHECK-P9-NEXT: stxv vs2, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test8elt: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxv vs1, 0(r4) -; CHECK-BE-NEXT: xxsldwi vs3, vs1, vs1, 1 -; CHECK-BE-NEXT: xscvspdpn f2, vs1 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: lxv vs0, 16(r4) -; CHECK-BE-NEXT: xxsldwi vs4, vs0, vs0, 1 -; CHECK-BE-NEXT: xscvspdpn f4, vs4 -; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs3 -; CHECK-BE-NEXT: xxsldwi vs3, vs1, vs1, 3 -; CHECK-BE-NEXT: xxswapd vs1, vs1 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs3 -; CHECK-BE-NEXT: xscvspdpn f3, vs0 -; CHECK-BE-NEXT: xxmrghd vs3, vs3, vs4 -; CHECK-BE-NEXT: xxsldwi vs4, vs0, vs0, 3 -; CHECK-BE-NEXT: xxswapd vs0, vs0 -; CHECK-BE-NEXT: xscvspdpn f0, vs0 -; CHECK-BE-NEXT: xscvspdpn f4, vs4 -; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs4 +; CHECK-BE-NEXT: lxv vs1, 0(r4) +; CHECK-BE-NEXT: xxmrghw vs2, vs1, vs1 +; CHECK-BE-NEXT: xxmrglw vs1, vs1, vs1 +; CHECK-BE-NEXT: xxmrghw vs3, vs0, vs0 +; CHECK-BE-NEXT: xxmrglw vs0, vs0, vs0 +; CHECK-BE-NEXT: xvcvspdp vs2, vs2 +; CHECK-BE-NEXT: xvcvspdp vs1, vs1 +; CHECK-BE-NEXT: xvcvspdp vs3, vs3 +; CHECK-BE-NEXT: xvcvspdp vs0, vs0 ; CHECK-BE-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-BE-NEXT: xvcvdpuxds vs1, vs1 ; CHECK-BE-NEXT: xvcvdpuxds vs3, vs3 -; CHECK-BE-NEXT: stxv vs1, 16(r3) ; CHECK-BE-NEXT: xvcvdpuxds vs0, vs0 -; CHECK-BE-NEXT: stxv vs3, 32(r3) ; CHECK-BE-NEXT: stxv vs0, 48(r3) +; CHECK-BE-NEXT: stxv vs3, 32(r3) +; CHECK-BE-NEXT: stxv vs1, 16(r3) ; CHECK-BE-NEXT: stxv vs2, 0(r3) ; CHECK-BE-NEXT: blr entry: @@ -220,70 +169,50 @@ entry: define void @test16elt(<16 x i64>* noalias nocapture sret %agg.result, <16 x float>* nocapture readonly) local_unnamed_addr #2 { ; CHECK-P8-LABEL: test16elt: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: li r5, 16 ; CHECK-P8-NEXT: li r7, 48 +; CHECK-P8-NEXT: li r5, 16 ; CHECK-P8-NEXT: li r6, 32 -; CHECK-P8-NEXT: lvx v4, 0, r4 ; CHECK-P8-NEXT: li r8, 64 -; CHECK-P8-NEXT: lvx v5, r4, r5 -; CHECK-P8-NEXT: lvx v3, r4, r7 -; CHECK-P8-NEXT: lvx v2, r4, r6 +; CHECK-P8-NEXT: lvx v4, r4, r7 +; CHECK-P8-NEXT: lvx v2, r4, r5 +; CHECK-P8-NEXT: lvx v3, r4, r6 +; CHECK-P8-NEXT: xxmrghw vs3, v4, v4 +; CHECK-P8-NEXT: xxmrglw vs5, v4, v4 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v2 +; CHECK-P8-NEXT: xxmrghw vs1, v2, v2 +; CHECK-P8-NEXT: lvx v2, 0, r4 ; CHECK-P8-NEXT: li r4, 112 -; CHECK-P8-NEXT: xxsldwi vs13, v4, v4, 3 -; CHECK-P8-NEXT: xscvspdpn f6, v4 -; CHECK-P8-NEXT: xxsldwi vs1, v5, v5, 3 -; CHECK-P8-NEXT: xxswapd vs3, v5 -; CHECK-P8-NEXT: xxsldwi vs9, v3, v3, 1 -; CHECK-P8-NEXT: xscvspdpn f4, v3 -; CHECK-P8-NEXT: xxsldwi vs5, v5, v5, 1 -; CHECK-P8-NEXT: xxsldwi vs10, v3, v3, 3 -; CHECK-P8-NEXT: xscvspdpn f1, vs1 -; CHECK-P8-NEXT: xxswapd vs11, v3 -; CHECK-P8-NEXT: xscvspdpn f3, vs3 -; CHECK-P8-NEXT: xxsldwi vs7, v2, v2, 3 -; CHECK-P8-NEXT: xscvspdpn f9, vs9 -; CHECK-P8-NEXT: xxswapd vs8, v2 -; CHECK-P8-NEXT: xscvspdpn f0, v5 -; CHECK-P8-NEXT: xxsldwi vs12, v2, v2, 1 -; CHECK-P8-NEXT: xscvspdpn f2, v2 -; CHECK-P8-NEXT: xxswapd v2, v4 -; CHECK-P8-NEXT: xscvspdpn f5, vs5 -; CHECK-P8-NEXT: xxsldwi v3, v4, v4, 1 -; CHECK-P8-NEXT: xscvspdpn f10, vs10 -; CHECK-P8-NEXT: xscvspdpn f11, vs11 -; CHECK-P8-NEXT: xxmrghd vs1, vs3, vs1 -; 
CHECK-P8-NEXT: xscvspdpn f7, vs7 -; CHECK-P8-NEXT: xxmrghd vs4, vs4, vs9 -; CHECK-P8-NEXT: xscvspdpn f8, vs8 -; CHECK-P8-NEXT: xscvspdpn f12, vs12 -; CHECK-P8-NEXT: xscvspdpn f13, vs13 -; CHECK-P8-NEXT: xxmrghd vs0, vs0, vs5 -; CHECK-P8-NEXT: xscvspdpn f3, v2 -; CHECK-P8-NEXT: xscvspdpn f9, v3 -; CHECK-P8-NEXT: xxmrghd vs5, vs11, vs10 -; CHECK-P8-NEXT: xvcvdpuxds v3, vs4 -; CHECK-P8-NEXT: xvcvdpuxds v2, vs1 -; CHECK-P8-NEXT: xxmrghd vs1, vs2, vs12 -; CHECK-P8-NEXT: xxmrghd vs2, vs8, vs7 -; CHECK-P8-NEXT: xvcvdpuxds v4, vs0 -; CHECK-P8-NEXT: xxmrghd vs0, vs3, vs13 +; CHECK-P8-NEXT: xxmrglw vs2, v3, v3 +; CHECK-P8-NEXT: xxmrghw vs4, v3, v3 +; CHECK-P8-NEXT: xvcvspdp vs3, vs3 +; CHECK-P8-NEXT: xxmrglw vs6, v2, v2 +; CHECK-P8-NEXT: xxmrghw vs7, v2, v2 +; CHECK-P8-NEXT: xvcvspdp vs5, vs5 +; CHECK-P8-NEXT: xvcvspdp vs0, vs0 +; CHECK-P8-NEXT: xvcvspdp vs1, vs1 +; CHECK-P8-NEXT: xvcvspdp vs2, vs2 +; CHECK-P8-NEXT: xvcvspdp vs4, vs4 +; CHECK-P8-NEXT: xvcvspdp vs6, vs6 +; CHECK-P8-NEXT: xvcvspdp vs7, vs7 +; CHECK-P8-NEXT: xvcvdpuxds v3, vs3 ; CHECK-P8-NEXT: xvcvdpuxds v5, vs5 -; CHECK-P8-NEXT: xxmrghd vs3, vs6, vs9 -; CHECK-P8-NEXT: xvcvdpuxds v0, vs1 +; CHECK-P8-NEXT: xvcvdpuxds v2, vs0 +; CHECK-P8-NEXT: xvcvdpuxds v4, vs1 +; CHECK-P8-NEXT: xvcvdpuxds v0, vs4 ; CHECK-P8-NEXT: xvcvdpuxds v1, vs2 -; CHECK-P8-NEXT: xvcvdpuxds v6, vs0 +; CHECK-P8-NEXT: xvcvdpuxds v6, vs6 ; CHECK-P8-NEXT: xxswapd vs0, v3 -; CHECK-P8-NEXT: xvcvdpuxds v7, vs3 -; CHECK-P8-NEXT: xxswapd vs4, v2 -; CHECK-P8-NEXT: xxswapd vs3, v4 +; CHECK-P8-NEXT: xvcvdpuxds v7, vs7 ; CHECK-P8-NEXT: xxswapd vs1, v5 +; CHECK-P8-NEXT: xxswapd vs4, v2 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r4 ; CHECK-P8-NEXT: li r4, 96 +; CHECK-P8-NEXT: xxswapd vs3, v4 ; CHECK-P8-NEXT: xxswapd vs2, v0 -; CHECK-P8-NEXT: xxswapd vs0, v1 ; CHECK-P8-NEXT: stxvd2x vs1, r3, r4 -; CHECK-P8-NEXT: xxswapd vs5, v6 ; CHECK-P8-NEXT: li r4, 80 +; CHECK-P8-NEXT: xxswapd vs0, v1 +; CHECK-P8-NEXT: xxswapd vs5, v6 ; CHECK-P8-NEXT: xxswapd vs1, v7 ; CHECK-P8-NEXT: stxvd2x vs2, r3, r4 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r8 @@ -295,122 +224,82 @@ define void @test16elt(<16 x i64>* noalias nocapture sret %agg.result, <16 x flo ; ; CHECK-P9-LABEL: test16elt: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: lxv vs4, 16(r4) -; CHECK-P9-NEXT: xxsldwi vs5, vs4, vs4, 3 -; CHECK-P9-NEXT: xxswapd vs6, vs4 -; CHECK-P9-NEXT: lxv vs0, 0(r4) -; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 -; CHECK-P9-NEXT: xxswapd vs2, vs0 -; CHECK-P9-NEXT: xscvspdpn f5, vs5 -; CHECK-P9-NEXT: xscvspdpn f6, vs6 -; CHECK-P9-NEXT: xxmrghd vs5, vs6, vs5 -; CHECK-P9-NEXT: xscvspdpn f6, vs4 -; CHECK-P9-NEXT: xxsldwi vs4, vs4, vs4, 1 -; CHECK-P9-NEXT: lxv vs3, 32(r4) -; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xxswapd vs7, vs3 -; CHECK-P9-NEXT: xscvspdpn f7, vs7 -; CHECK-P9-NEXT: xscvspdpn f4, vs4 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xxmrghd vs1, vs2, vs1 -; CHECK-P9-NEXT: xscvspdpn f2, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xxmrghd vs0, vs2, vs0 -; CHECK-P9-NEXT: xxmrghd vs4, vs6, vs4 -; CHECK-P9-NEXT: xxsldwi vs6, vs3, vs3, 3 +; CHECK-P9-NEXT: lxv vs0, 48(r4) +; CHECK-P9-NEXT: lxv vs1, 0(r4) +; CHECK-P9-NEXT: lxv vs3, 16(r4) +; CHECK-P9-NEXT: lxv vs5, 32(r4) +; CHECK-P9-NEXT: xxmrglw vs2, vs1, vs1 +; CHECK-P9-NEXT: xxmrghw vs1, vs1, vs1 +; CHECK-P9-NEXT: xxmrglw vs4, vs3, vs3 +; CHECK-P9-NEXT: xxmrghw vs3, vs3, vs3 +; CHECK-P9-NEXT: xxmrglw vs6, vs5, vs5 +; CHECK-P9-NEXT: xxmrghw vs5, vs5, vs5 +; CHECK-P9-NEXT: xxmrglw vs7, vs0, vs0 +; 
CHECK-P9-NEXT: xxmrghw vs0, vs0, vs0 +; CHECK-P9-NEXT: xvcvspdp vs2, vs2 +; CHECK-P9-NEXT: xvcvspdp vs1, vs1 +; CHECK-P9-NEXT: xvcvspdp vs4, vs4 +; CHECK-P9-NEXT: xvcvspdp vs3, vs3 +; CHECK-P9-NEXT: xvcvspdp vs6, vs6 +; CHECK-P9-NEXT: xvcvspdp vs5, vs5 +; CHECK-P9-NEXT: xvcvspdp vs7, vs7 +; CHECK-P9-NEXT: xvcvspdp vs0, vs0 +; CHECK-P9-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-P9-NEXT: xvcvdpuxds vs1, vs1 -; CHECK-P9-NEXT: xvcvdpuxds vs5, vs5 -; CHECK-P9-NEXT: xscvspdpn f6, vs6 -; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6 -; CHECK-P9-NEXT: xscvspdpn f7, vs3 -; CHECK-P9-NEXT: xxsldwi vs3, vs3, vs3, 1 -; CHECK-P9-NEXT: lxv vs2, 48(r4) -; CHECK-P9-NEXT: xxswapd vs8, vs2 -; CHECK-P9-NEXT: xscvspdpn f8, vs8 ; CHECK-P9-NEXT: xvcvdpuxds vs4, vs4 -; CHECK-P9-NEXT: xscvspdpn f3, vs3 -; CHECK-P9-NEXT: xxmrghd vs3, vs7, vs3 -; CHECK-P9-NEXT: xxsldwi vs7, vs2, vs2, 3 -; CHECK-P9-NEXT: xvcvdpuxds vs0, vs0 -; CHECK-P9-NEXT: xvcvdpuxds vs6, vs6 -; CHECK-P9-NEXT: stxv vs6, 64(r3) -; CHECK-P9-NEXT: xscvspdpn f7, vs7 -; CHECK-P9-NEXT: xxmrghd vs7, vs8, vs7 -; CHECK-P9-NEXT: xscvspdpn f8, vs2 -; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 -; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xxmrghd vs2, vs8, vs2 ; CHECK-P9-NEXT: xvcvdpuxds vs3, vs3 +; CHECK-P9-NEXT: xvcvdpuxds vs6, vs6 +; CHECK-P9-NEXT: xvcvdpuxds vs5, vs5 ; CHECK-P9-NEXT: xvcvdpuxds vs7, vs7 -; CHECK-P9-NEXT: stxv vs3, 80(r3) -; CHECK-P9-NEXT: xvcvdpuxds vs2, vs2 +; CHECK-P9-NEXT: xvcvdpuxds vs0, vs0 +; CHECK-P9-NEXT: stxv vs0, 112(r3) ; CHECK-P9-NEXT: stxv vs7, 96(r3) -; CHECK-P9-NEXT: stxv vs2, 112(r3) -; CHECK-P9-NEXT: stxv vs4, 48(r3) -; CHECK-P9-NEXT: stxv vs5, 32(r3) -; CHECK-P9-NEXT: stxv vs0, 16(r3) -; CHECK-P9-NEXT: stxv vs1, 0(r3) +; CHECK-P9-NEXT: stxv vs5, 80(r3) +; CHECK-P9-NEXT: stxv vs6, 64(r3) +; CHECK-P9-NEXT: stxv vs3, 48(r3) +; CHECK-P9-NEXT: stxv vs4, 32(r3) +; CHECK-P9-NEXT: stxv vs1, 16(r3) +; CHECK-P9-NEXT: stxv vs2, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test16elt: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxv vs0, 0(r4) -; CHECK-BE-NEXT: lxv vs4, 16(r4) -; CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 1 -; CHECK-BE-NEXT: xscvspdpn f1, vs0 -; CHECK-BE-NEXT: xxsldwi vs5, vs0, vs0, 3 -; CHECK-BE-NEXT: xxswapd vs0, vs0 -; CHECK-BE-NEXT: xscvspdpn f5, vs5 -; CHECK-BE-NEXT: xscvspdpn f0, vs0 -; CHECK-BE-NEXT: xxsldwi vs6, vs4, vs4, 1 -; CHECK-BE-NEXT: xscvspdpn f6, vs6 -; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs5 -; CHECK-BE-NEXT: xscvspdpn f5, vs4 -; CHECK-BE-NEXT: lxv vs3, 32(r4) -; CHECK-BE-NEXT: xxsldwi vs7, vs3, vs3, 1 -; CHECK-BE-NEXT: xscvspdpn f7, vs7 -; CHECK-BE-NEXT: xxmrghd vs5, vs5, vs6 -; CHECK-BE-NEXT: xxsldwi vs6, vs4, vs4, 3 -; CHECK-BE-NEXT: xxswapd vs4, vs4 -; CHECK-BE-NEXT: xscvspdpn f6, vs6 -; CHECK-BE-NEXT: xscvspdpn f4, vs4 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs2 -; CHECK-BE-NEXT: lxv vs2, 48(r4) -; CHECK-BE-NEXT: xxsldwi vs8, vs2, vs2, 1 +; CHECK-BE-NEXT: lxv vs0, 48(r4) +; CHECK-BE-NEXT: lxv vs1, 0(r4) +; CHECK-BE-NEXT: lxv vs3, 16(r4) +; CHECK-BE-NEXT: lxv vs5, 32(r4) +; CHECK-BE-NEXT: xxmrghw vs2, vs1, vs1 +; CHECK-BE-NEXT: xxmrglw vs1, vs1, vs1 +; CHECK-BE-NEXT: xxmrghw vs4, vs3, vs3 +; CHECK-BE-NEXT: xxmrglw vs3, vs3, vs3 +; CHECK-BE-NEXT: xxmrghw vs6, vs5, vs5 +; CHECK-BE-NEXT: xxmrglw vs5, vs5, vs5 +; CHECK-BE-NEXT: xxmrghw vs7, vs0, vs0 +; CHECK-BE-NEXT: xxmrglw vs0, vs0, vs0 +; CHECK-BE-NEXT: xvcvspdp vs2, vs2 +; CHECK-BE-NEXT: xvcvspdp vs1, vs1 +; CHECK-BE-NEXT: xvcvspdp vs4, vs4 +; CHECK-BE-NEXT: xvcvspdp vs3, vs3 +; CHECK-BE-NEXT: xvcvspdp vs6, vs6 +; CHECK-BE-NEXT: 
xvcvspdp vs5, vs5 +; CHECK-BE-NEXT: xvcvspdp vs7, vs7 +; CHECK-BE-NEXT: xvcvspdp vs0, vs0 +; CHECK-BE-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-BE-NEXT: xvcvdpuxds vs1, vs1 -; CHECK-BE-NEXT: xvcvdpuxds vs0, vs0 -; CHECK-BE-NEXT: xvcvdpuxds vs5, vs5 -; CHECK-BE-NEXT: xscvspdpn f8, vs8 -; CHECK-BE-NEXT: xxmrghd vs4, vs4, vs6 -; CHECK-BE-NEXT: xscvspdpn f6, vs3 -; CHECK-BE-NEXT: stxv vs0, 16(r3) -; CHECK-BE-NEXT: xxmrghd vs6, vs6, vs7 -; CHECK-BE-NEXT: xxsldwi vs7, vs3, vs3, 3 -; CHECK-BE-NEXT: xxswapd vs3, vs3 -; CHECK-BE-NEXT: xscvspdpn f7, vs7 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xxmrghd vs3, vs3, vs7 -; CHECK-BE-NEXT: xscvspdpn f7, vs2 -; CHECK-BE-NEXT: xxmrghd vs7, vs7, vs8 -; CHECK-BE-NEXT: xxsldwi vs8, vs2, vs2, 3 -; CHECK-BE-NEXT: xxswapd vs2, vs2 -; CHECK-BE-NEXT: xscvspdpn f8, vs8 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs8 -; CHECK-BE-NEXT: stxv vs5, 32(r3) ; CHECK-BE-NEXT: xvcvdpuxds vs4, vs4 -; CHECK-BE-NEXT: xvcvdpuxds vs6, vs6 ; CHECK-BE-NEXT: xvcvdpuxds vs3, vs3 +; CHECK-BE-NEXT: xvcvdpuxds vs6, vs6 +; CHECK-BE-NEXT: xvcvdpuxds vs5, vs5 ; CHECK-BE-NEXT: xvcvdpuxds vs7, vs7 -; CHECK-BE-NEXT: stxv vs3, 80(r3) +; CHECK-BE-NEXT: xvcvdpuxds vs0, vs0 +; CHECK-BE-NEXT: stxv vs0, 112(r3) ; CHECK-BE-NEXT: stxv vs7, 96(r3) -; CHECK-BE-NEXT: xvcvdpuxds vs2, vs2 -; CHECK-BE-NEXT: stxv vs2, 112(r3) +; CHECK-BE-NEXT: stxv vs5, 80(r3) ; CHECK-BE-NEXT: stxv vs6, 64(r3) -; CHECK-BE-NEXT: stxv vs4, 48(r3) -; CHECK-BE-NEXT: stxv vs1, 0(r3) +; CHECK-BE-NEXT: stxv vs3, 48(r3) +; CHECK-BE-NEXT: stxv vs4, 32(r3) +; CHECK-BE-NEXT: stxv vs1, 16(r3) +; CHECK-BE-NEXT: stxv vs2, 0(r3) ; CHECK-BE-NEXT: blr entry: %a = load <16 x float>, <16 x float>* %0, align 64 @@ -424,10 +313,8 @@ define <2 x i64> @test2elt_signed(i64 %a.coerce) local_unnamed_addr #0 { ; CHECK-P8: # %bb.0: # %entry ; CHECK-P8-NEXT: mtvsrd f0, r3 ; CHECK-P8-NEXT: xxswapd v2, vs0 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: xxsldwi vs1, v2, v2, 3 -; CHECK-P8-NEXT: xscvspdpn f1, vs1 -; CHECK-P8-NEXT: xxmrghd vs0, vs0, vs1 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v2 +; CHECK-P8-NEXT: xvcvspdp vs0, vs0 ; CHECK-P8-NEXT: xvcvdpuxds v2, vs0 ; CHECK-P8-NEXT: blr ; @@ -435,20 +322,16 @@ define <2 x i64> @test2elt_signed(i64 %a.coerce) local_unnamed_addr #0 { ; CHECK-P9: # %bb.0: # %entry ; CHECK-P9-NEXT: mtvsrd f0, r3 ; CHECK-P9-NEXT: xxswapd v2, vs0 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xxsldwi vs1, v2, v2, 3 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xxmrghd vs0, vs0, vs1 +; CHECK-P9-NEXT: xxmrglw vs0, v2, v2 +; CHECK-P9-NEXT: xvcvspdp vs0, vs0 ; CHECK-P9-NEXT: xvcvdpuxds v2, vs0 ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test2elt_signed: ; CHECK-BE: # %bb.0: # %entry ; CHECK-BE-NEXT: mtvsrd f0, r3 -; CHECK-BE-NEXT: xscvspdpn f1, vs0 -; CHECK-BE-NEXT: xxsldwi vs0, vs0, vs0, 1 -; CHECK-BE-NEXT: xscvspdpn f0, vs0 -; CHECK-BE-NEXT: xxmrghd vs0, vs1, vs0 +; CHECK-BE-NEXT: xxmrghw vs0, vs0, vs0 +; CHECK-BE-NEXT: xvcvspdp vs0, vs0 ; CHECK-BE-NEXT: xvcvdpuxds v2, vs0 ; CHECK-BE-NEXT: blr entry: @@ -460,16 +343,11 @@ entry: define void @test4elt_signed(<4 x i64>* noalias nocapture sret %agg.result, <4 x float> %a) local_unnamed_addr #1 { ; CHECK-P8-LABEL: test4elt_signed: ; CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: xxsldwi vs0, v2, v2, 3 -; CHECK-P8-NEXT: xxswapd vs1, v2 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v2 +; CHECK-P8-NEXT: xxmrghw vs1, v2, v2 ; CHECK-P8-NEXT: li r4, 16 -; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 -; CHECK-P8-NEXT: xscvspdpn f2, v2 -; CHECK-P8-NEXT: xscvspdpn f0, 
vs0 -; CHECK-P8-NEXT: xscvspdpn f1, vs1 -; CHECK-P8-NEXT: xscvspdpn f3, vs3 -; CHECK-P8-NEXT: xxmrghd vs0, vs1, vs0 -; CHECK-P8-NEXT: xxmrghd vs1, vs2, vs3 +; CHECK-P8-NEXT: xvcvspdp vs0, vs0 +; CHECK-P8-NEXT: xvcvspdp vs1, vs1 ; CHECK-P8-NEXT: xvcvdpuxds v2, vs0 ; CHECK-P8-NEXT: xvcvdpuxds v3, vs1 ; CHECK-P8-NEXT: xxswapd vs1, v2 @@ -480,36 +358,26 @@ define void @test4elt_signed(<4 x i64>* noalias nocapture sret %agg.result, <4 x ; ; CHECK-P9-LABEL: test4elt_signed: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: xxsldwi vs0, v2, v2, 3 -; CHECK-P9-NEXT: xxswapd vs1, v2 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xxsldwi vs2, v2, v2, 1 -; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xxmrghd vs0, vs1, vs0 -; CHECK-P9-NEXT: xscvspdpn f1, v2 -; CHECK-P9-NEXT: xxmrghd vs1, vs1, vs2 +; CHECK-P9-NEXT: xxmrglw vs0, v2, v2 +; CHECK-P9-NEXT: xxmrghw vs1, v2, v2 +; CHECK-P9-NEXT: xvcvspdp vs0, vs0 +; CHECK-P9-NEXT: xvcvspdp vs1, vs1 ; CHECK-P9-NEXT: xvcvdpuxds vs0, vs0 ; CHECK-P9-NEXT: xvcvdpuxds vs1, vs1 -; CHECK-P9-NEXT: stxv vs0, 0(r3) ; CHECK-P9-NEXT: stxv vs1, 16(r3) +; CHECK-P9-NEXT: stxv vs0, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test4elt_signed: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: xxsldwi vs1, v2, v2, 1 -; CHECK-BE-NEXT: xscvspdpn f0, v2 -; CHECK-BE-NEXT: xxswapd vs2, v2 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs1 -; CHECK-BE-NEXT: xxsldwi vs1, v2, v2, 3 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 +; CHECK-BE-NEXT: xxmrghw vs0, v2, v2 +; CHECK-BE-NEXT: xxmrglw vs1, v2, v2 +; CHECK-BE-NEXT: xvcvspdp vs0, vs0 +; CHECK-BE-NEXT: xvcvspdp vs1, vs1 ; CHECK-BE-NEXT: xvcvdpuxds vs0, vs0 -; CHECK-BE-NEXT: xxmrghd vs1, vs2, vs1 ; CHECK-BE-NEXT: xvcvdpuxds vs1, vs1 -; CHECK-BE-NEXT: stxv vs0, 0(r3) ; CHECK-BE-NEXT: stxv vs1, 16(r3) +; CHECK-BE-NEXT: stxv vs0, 0(r3) ; CHECK-BE-NEXT: blr entry: %0 = fptoui <4 x float> %a to <4 x i64> @@ -525,31 +393,21 @@ define void @test8elt_signed(<8 x i64>* noalias nocapture sret %agg.result, <8 x ; CHECK-P8-NEXT: li r6, 32 ; CHECK-P8-NEXT: lvx v2, r4, r5 ; CHECK-P8-NEXT: li r4, 48 -; CHECK-P8-NEXT: xxsldwi vs5, v3, v3, 3 -; CHECK-P8-NEXT: xxswapd vs6, v3 -; CHECK-P8-NEXT: xxsldwi vs0, v2, v2, 3 -; CHECK-P8-NEXT: xxswapd vs1, v2 -; CHECK-P8-NEXT: xxsldwi vs3, v2, v2, 1 -; CHECK-P8-NEXT: xxsldwi vs7, v3, v3, 1 -; CHECK-P8-NEXT: xscvspdpn f2, v2 -; CHECK-P8-NEXT: xscvspdpn f4, v3 -; CHECK-P8-NEXT: xscvspdpn f0, vs0 -; CHECK-P8-NEXT: xscvspdpn f1, vs1 -; CHECK-P8-NEXT: xscvspdpn f3, vs3 -; CHECK-P8-NEXT: xscvspdpn f5, vs5 -; CHECK-P8-NEXT: xscvspdpn f6, vs6 -; CHECK-P8-NEXT: xscvspdpn f7, vs7 -; CHECK-P8-NEXT: xxmrghd vs0, vs1, vs0 -; CHECK-P8-NEXT: xxmrghd vs1, vs2, vs3 -; CHECK-P8-NEXT: xxmrghd vs2, vs6, vs5 +; CHECK-P8-NEXT: xxmrglw vs2, v3, v3 +; CHECK-P8-NEXT: xxmrghw vs3, v3, v3 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v2 +; CHECK-P8-NEXT: xxmrghw vs1, v2, v2 +; CHECK-P8-NEXT: xvcvspdp vs2, vs2 +; CHECK-P8-NEXT: xvcvspdp vs0, vs0 +; CHECK-P8-NEXT: xvcvspdp vs1, vs1 +; CHECK-P8-NEXT: xvcvspdp vs3, vs3 +; CHECK-P8-NEXT: xvcvdpuxds v4, vs2 ; CHECK-P8-NEXT: xvcvdpuxds v2, vs0 -; CHECK-P8-NEXT: xxmrghd vs3, vs4, vs7 ; CHECK-P8-NEXT: xvcvdpuxds v3, vs1 -; CHECK-P8-NEXT: xvcvdpuxds v4, vs2 ; CHECK-P8-NEXT: xvcvdpuxds v5, vs3 +; CHECK-P8-NEXT: xxswapd vs3, v4 ; CHECK-P8-NEXT: xxswapd vs1, v2 ; CHECK-P8-NEXT: xxswapd vs0, v3 -; CHECK-P8-NEXT: xxswapd vs3, v4 ; CHECK-P8-NEXT: xxswapd vs2, v5 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r4 ; CHECK-P8-NEXT: 
stxvd2x vs1, r3, r6 @@ -559,65 +417,45 @@ define void @test8elt_signed(<8 x i64>* noalias nocapture sret %agg.result, <8 x ; ; CHECK-P9-LABEL: test8elt_signed: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: lxv vs0, 0(r4) -; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 -; CHECK-P9-NEXT: xxswapd vs2, vs0 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xscvspdpn f3, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xxmrghd vs1, vs2, vs1 -; CHECK-P9-NEXT: lxv vs2, 16(r4) -; CHECK-P9-NEXT: xxmrghd vs0, vs3, vs0 +; CHECK-P9-NEXT: lxv vs0, 16(r4) +; CHECK-P9-NEXT: lxv vs1, 0(r4) +; CHECK-P9-NEXT: xxmrglw vs2, vs1, vs1 +; CHECK-P9-NEXT: xxmrghw vs1, vs1, vs1 +; CHECK-P9-NEXT: xxmrglw vs3, vs0, vs0 +; CHECK-P9-NEXT: xxmrghw vs0, vs0, vs0 +; CHECK-P9-NEXT: xvcvspdp vs2, vs2 +; CHECK-P9-NEXT: xvcvspdp vs1, vs1 +; CHECK-P9-NEXT: xvcvspdp vs3, vs3 +; CHECK-P9-NEXT: xvcvspdp vs0, vs0 +; CHECK-P9-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-P9-NEXT: xvcvdpuxds vs1, vs1 -; CHECK-P9-NEXT: xvcvdpuxds vs0, vs0 -; CHECK-P9-NEXT: xxsldwi vs3, vs2, vs2, 3 -; CHECK-P9-NEXT: xxswapd vs4, vs2 -; CHECK-P9-NEXT: xscvspdpn f3, vs3 -; CHECK-P9-NEXT: xscvspdpn f4, vs4 -; CHECK-P9-NEXT: stxv vs0, 16(r3) -; CHECK-P9-NEXT: xxmrghd vs3, vs4, vs3 -; CHECK-P9-NEXT: xscvspdpn f4, vs2 -; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 -; CHECK-P9-NEXT: xscvspdpn f2, vs2 ; CHECK-P9-NEXT: xvcvdpuxds vs3, vs3 -; CHECK-P9-NEXT: xxmrghd vs2, vs4, vs2 -; CHECK-P9-NEXT: xvcvdpuxds vs2, vs2 +; CHECK-P9-NEXT: xvcvdpuxds vs0, vs0 +; CHECK-P9-NEXT: stxv vs0, 48(r3) ; CHECK-P9-NEXT: stxv vs3, 32(r3) -; CHECK-P9-NEXT: stxv vs2, 48(r3) -; CHECK-P9-NEXT: stxv vs1, 0(r3) +; CHECK-P9-NEXT: stxv vs1, 16(r3) +; CHECK-P9-NEXT: stxv vs2, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test8elt_signed: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxv vs1, 0(r4) -; CHECK-BE-NEXT: xxsldwi vs3, vs1, vs1, 1 -; CHECK-BE-NEXT: xscvspdpn f2, vs1 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 ; CHECK-BE-NEXT: lxv vs0, 16(r4) -; CHECK-BE-NEXT: xxsldwi vs4, vs0, vs0, 1 -; CHECK-BE-NEXT: xscvspdpn f4, vs4 -; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs3 -; CHECK-BE-NEXT: xxsldwi vs3, vs1, vs1, 3 -; CHECK-BE-NEXT: xxswapd vs1, vs1 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xscvspdpn f1, vs1 -; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs3 -; CHECK-BE-NEXT: xscvspdpn f3, vs0 -; CHECK-BE-NEXT: xxmrghd vs3, vs3, vs4 -; CHECK-BE-NEXT: xxsldwi vs4, vs0, vs0, 3 -; CHECK-BE-NEXT: xxswapd vs0, vs0 -; CHECK-BE-NEXT: xscvspdpn f0, vs0 -; CHECK-BE-NEXT: xscvspdpn f4, vs4 -; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs4 +; CHECK-BE-NEXT: lxv vs1, 0(r4) +; CHECK-BE-NEXT: xxmrghw vs2, vs1, vs1 +; CHECK-BE-NEXT: xxmrglw vs1, vs1, vs1 +; CHECK-BE-NEXT: xxmrghw vs3, vs0, vs0 +; CHECK-BE-NEXT: xxmrglw vs0, vs0, vs0 +; CHECK-BE-NEXT: xvcvspdp vs2, vs2 +; CHECK-BE-NEXT: xvcvspdp vs1, vs1 +; CHECK-BE-NEXT: xvcvspdp vs3, vs3 +; CHECK-BE-NEXT: xvcvspdp vs0, vs0 ; CHECK-BE-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-BE-NEXT: xvcvdpuxds vs1, vs1 ; CHECK-BE-NEXT: xvcvdpuxds vs3, vs3 -; CHECK-BE-NEXT: stxv vs1, 16(r3) ; CHECK-BE-NEXT: xvcvdpuxds vs0, vs0 -; CHECK-BE-NEXT: stxv vs3, 32(r3) ; CHECK-BE-NEXT: stxv vs0, 48(r3) +; CHECK-BE-NEXT: stxv vs3, 32(r3) +; CHECK-BE-NEXT: stxv vs1, 16(r3) ; CHECK-BE-NEXT: stxv vs2, 0(r3) ; CHECK-BE-NEXT: blr entry: @@ -630,70 +468,50 @@ entry: define void @test16elt_signed(<16 x i64>* noalias nocapture sret %agg.result, <16 x float>* nocapture readonly) local_unnamed_addr #2 { ; CHECK-P8-LABEL: test16elt_signed: ; 
CHECK-P8: # %bb.0: # %entry -; CHECK-P8-NEXT: li r5, 16 ; CHECK-P8-NEXT: li r7, 48 +; CHECK-P8-NEXT: li r5, 16 ; CHECK-P8-NEXT: li r6, 32 -; CHECK-P8-NEXT: lvx v4, 0, r4 ; CHECK-P8-NEXT: li r8, 64 -; CHECK-P8-NEXT: lvx v5, r4, r5 -; CHECK-P8-NEXT: lvx v3, r4, r7 -; CHECK-P8-NEXT: lvx v2, r4, r6 +; CHECK-P8-NEXT: lvx v4, r4, r7 +; CHECK-P8-NEXT: lvx v2, r4, r5 +; CHECK-P8-NEXT: lvx v3, r4, r6 +; CHECK-P8-NEXT: xxmrghw vs3, v4, v4 +; CHECK-P8-NEXT: xxmrglw vs5, v4, v4 +; CHECK-P8-NEXT: xxmrglw vs0, v2, v2 +; CHECK-P8-NEXT: xxmrghw vs1, v2, v2 +; CHECK-P8-NEXT: lvx v2, 0, r4 ; CHECK-P8-NEXT: li r4, 112 -; CHECK-P8-NEXT: xxsldwi vs13, v4, v4, 3 -; CHECK-P8-NEXT: xscvspdpn f6, v4 -; CHECK-P8-NEXT: xxsldwi vs1, v5, v5, 3 -; CHECK-P8-NEXT: xxswapd vs3, v5 -; CHECK-P8-NEXT: xxsldwi vs9, v3, v3, 1 -; CHECK-P8-NEXT: xscvspdpn f4, v3 -; CHECK-P8-NEXT: xxsldwi vs5, v5, v5, 1 -; CHECK-P8-NEXT: xxsldwi vs10, v3, v3, 3 -; CHECK-P8-NEXT: xscvspdpn f1, vs1 -; CHECK-P8-NEXT: xxswapd vs11, v3 -; CHECK-P8-NEXT: xscvspdpn f3, vs3 -; CHECK-P8-NEXT: xxsldwi vs7, v2, v2, 3 -; CHECK-P8-NEXT: xscvspdpn f9, vs9 -; CHECK-P8-NEXT: xxswapd vs8, v2 -; CHECK-P8-NEXT: xscvspdpn f0, v5 -; CHECK-P8-NEXT: xxsldwi vs12, v2, v2, 1 -; CHECK-P8-NEXT: xscvspdpn f2, v2 -; CHECK-P8-NEXT: xxswapd v2, v4 -; CHECK-P8-NEXT: xscvspdpn f5, vs5 -; CHECK-P8-NEXT: xxsldwi v3, v4, v4, 1 -; CHECK-P8-NEXT: xscvspdpn f10, vs10 -; CHECK-P8-NEXT: xscvspdpn f11, vs11 -; CHECK-P8-NEXT: xxmrghd vs1, vs3, vs1 -; CHECK-P8-NEXT: xscvspdpn f7, vs7 -; CHECK-P8-NEXT: xxmrghd vs4, vs4, vs9 -; CHECK-P8-NEXT: xscvspdpn f8, vs8 -; CHECK-P8-NEXT: xscvspdpn f12, vs12 -; CHECK-P8-NEXT: xscvspdpn f13, vs13 -; CHECK-P8-NEXT: xxmrghd vs0, vs0, vs5 -; CHECK-P8-NEXT: xscvspdpn f3, v2 -; CHECK-P8-NEXT: xscvspdpn f9, v3 -; CHECK-P8-NEXT: xxmrghd vs5, vs11, vs10 -; CHECK-P8-NEXT: xvcvdpuxds v3, vs4 -; CHECK-P8-NEXT: xvcvdpuxds v2, vs1 -; CHECK-P8-NEXT: xxmrghd vs1, vs2, vs12 -; CHECK-P8-NEXT: xxmrghd vs2, vs8, vs7 -; CHECK-P8-NEXT: xvcvdpuxds v4, vs0 -; CHECK-P8-NEXT: xxmrghd vs0, vs3, vs13 +; CHECK-P8-NEXT: xxmrglw vs2, v3, v3 +; CHECK-P8-NEXT: xxmrghw vs4, v3, v3 +; CHECK-P8-NEXT: xvcvspdp vs3, vs3 +; CHECK-P8-NEXT: xxmrglw vs6, v2, v2 +; CHECK-P8-NEXT: xxmrghw vs7, v2, v2 +; CHECK-P8-NEXT: xvcvspdp vs5, vs5 +; CHECK-P8-NEXT: xvcvspdp vs0, vs0 +; CHECK-P8-NEXT: xvcvspdp vs1, vs1 +; CHECK-P8-NEXT: xvcvspdp vs2, vs2 +; CHECK-P8-NEXT: xvcvspdp vs4, vs4 +; CHECK-P8-NEXT: xvcvspdp vs6, vs6 +; CHECK-P8-NEXT: xvcvspdp vs7, vs7 +; CHECK-P8-NEXT: xvcvdpuxds v3, vs3 ; CHECK-P8-NEXT: xvcvdpuxds v5, vs5 -; CHECK-P8-NEXT: xxmrghd vs3, vs6, vs9 -; CHECK-P8-NEXT: xvcvdpuxds v0, vs1 +; CHECK-P8-NEXT: xvcvdpuxds v2, vs0 +; CHECK-P8-NEXT: xvcvdpuxds v4, vs1 +; CHECK-P8-NEXT: xvcvdpuxds v0, vs4 ; CHECK-P8-NEXT: xvcvdpuxds v1, vs2 -; CHECK-P8-NEXT: xvcvdpuxds v6, vs0 +; CHECK-P8-NEXT: xvcvdpuxds v6, vs6 ; CHECK-P8-NEXT: xxswapd vs0, v3 -; CHECK-P8-NEXT: xvcvdpuxds v7, vs3 -; CHECK-P8-NEXT: xxswapd vs4, v2 -; CHECK-P8-NEXT: xxswapd vs3, v4 +; CHECK-P8-NEXT: xvcvdpuxds v7, vs7 ; CHECK-P8-NEXT: xxswapd vs1, v5 +; CHECK-P8-NEXT: xxswapd vs4, v2 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r4 ; CHECK-P8-NEXT: li r4, 96 +; CHECK-P8-NEXT: xxswapd vs3, v4 ; CHECK-P8-NEXT: xxswapd vs2, v0 -; CHECK-P8-NEXT: xxswapd vs0, v1 ; CHECK-P8-NEXT: stxvd2x vs1, r3, r4 -; CHECK-P8-NEXT: xxswapd vs5, v6 ; CHECK-P8-NEXT: li r4, 80 +; CHECK-P8-NEXT: xxswapd vs0, v1 +; CHECK-P8-NEXT: xxswapd vs5, v6 ; CHECK-P8-NEXT: xxswapd vs1, v7 ; CHECK-P8-NEXT: stxvd2x vs2, r3, r4 ; CHECK-P8-NEXT: stxvd2x vs0, r3, r8 @@ -705,122 
+523,82 @@ define void @test16elt_signed(<16 x i64>* noalias nocapture sret %agg.result, <1 ; ; CHECK-P9-LABEL: test16elt_signed: ; CHECK-P9: # %bb.0: # %entry -; CHECK-P9-NEXT: lxv vs4, 16(r4) -; CHECK-P9-NEXT: xxsldwi vs5, vs4, vs4, 3 -; CHECK-P9-NEXT: xxswapd vs6, vs4 -; CHECK-P9-NEXT: lxv vs0, 0(r4) -; CHECK-P9-NEXT: xxsldwi vs1, vs0, vs0, 3 -; CHECK-P9-NEXT: xxswapd vs2, vs0 -; CHECK-P9-NEXT: xscvspdpn f5, vs5 -; CHECK-P9-NEXT: xscvspdpn f6, vs6 -; CHECK-P9-NEXT: xxmrghd vs5, vs6, vs5 -; CHECK-P9-NEXT: xscvspdpn f6, vs4 -; CHECK-P9-NEXT: xxsldwi vs4, vs4, vs4, 1 -; CHECK-P9-NEXT: lxv vs3, 32(r4) -; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xxswapd vs7, vs3 -; CHECK-P9-NEXT: xscvspdpn f7, vs7 -; CHECK-P9-NEXT: xscvspdpn f4, vs4 -; CHECK-P9-NEXT: xscvspdpn f1, vs1 -; CHECK-P9-NEXT: xxmrghd vs1, vs2, vs1 -; CHECK-P9-NEXT: xscvspdpn f2, vs0 -; CHECK-P9-NEXT: xxsldwi vs0, vs0, vs0, 1 -; CHECK-P9-NEXT: xscvspdpn f0, vs0 -; CHECK-P9-NEXT: xxmrghd vs0, vs2, vs0 -; CHECK-P9-NEXT: xxmrghd vs4, vs6, vs4 -; CHECK-P9-NEXT: xxsldwi vs6, vs3, vs3, 3 +; CHECK-P9-NEXT: lxv vs0, 48(r4) +; CHECK-P9-NEXT: lxv vs1, 0(r4) +; CHECK-P9-NEXT: lxv vs3, 16(r4) +; CHECK-P9-NEXT: lxv vs5, 32(r4) +; CHECK-P9-NEXT: xxmrglw vs2, vs1, vs1 +; CHECK-P9-NEXT: xxmrghw vs1, vs1, vs1 +; CHECK-P9-NEXT: xxmrglw vs4, vs3, vs3 +; CHECK-P9-NEXT: xxmrghw vs3, vs3, vs3 +; CHECK-P9-NEXT: xxmrglw vs6, vs5, vs5 +; CHECK-P9-NEXT: xxmrghw vs5, vs5, vs5 +; CHECK-P9-NEXT: xxmrglw vs7, vs0, vs0 +; CHECK-P9-NEXT: xxmrghw vs0, vs0, vs0 +; CHECK-P9-NEXT: xvcvspdp vs2, vs2 +; CHECK-P9-NEXT: xvcvspdp vs1, vs1 +; CHECK-P9-NEXT: xvcvspdp vs4, vs4 +; CHECK-P9-NEXT: xvcvspdp vs3, vs3 +; CHECK-P9-NEXT: xvcvspdp vs6, vs6 +; CHECK-P9-NEXT: xvcvspdp vs5, vs5 +; CHECK-P9-NEXT: xvcvspdp vs7, vs7 +; CHECK-P9-NEXT: xvcvspdp vs0, vs0 +; CHECK-P9-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-P9-NEXT: xvcvdpuxds vs1, vs1 -; CHECK-P9-NEXT: xvcvdpuxds vs5, vs5 -; CHECK-P9-NEXT: xscvspdpn f6, vs6 -; CHECK-P9-NEXT: xxmrghd vs6, vs7, vs6 -; CHECK-P9-NEXT: xscvspdpn f7, vs3 -; CHECK-P9-NEXT: xxsldwi vs3, vs3, vs3, 1 -; CHECK-P9-NEXT: lxv vs2, 48(r4) -; CHECK-P9-NEXT: xxswapd vs8, vs2 -; CHECK-P9-NEXT: xscvspdpn f8, vs8 ; CHECK-P9-NEXT: xvcvdpuxds vs4, vs4 -; CHECK-P9-NEXT: xscvspdpn f3, vs3 -; CHECK-P9-NEXT: xxmrghd vs3, vs7, vs3 -; CHECK-P9-NEXT: xxsldwi vs7, vs2, vs2, 3 -; CHECK-P9-NEXT: xvcvdpuxds vs0, vs0 -; CHECK-P9-NEXT: xvcvdpuxds vs6, vs6 -; CHECK-P9-NEXT: stxv vs6, 64(r3) -; CHECK-P9-NEXT: xscvspdpn f7, vs7 -; CHECK-P9-NEXT: xxmrghd vs7, vs8, vs7 -; CHECK-P9-NEXT: xscvspdpn f8, vs2 -; CHECK-P9-NEXT: xxsldwi vs2, vs2, vs2, 1 -; CHECK-P9-NEXT: xscvspdpn f2, vs2 -; CHECK-P9-NEXT: xxmrghd vs2, vs8, vs2 ; CHECK-P9-NEXT: xvcvdpuxds vs3, vs3 +; CHECK-P9-NEXT: xvcvdpuxds vs6, vs6 +; CHECK-P9-NEXT: xvcvdpuxds vs5, vs5 ; CHECK-P9-NEXT: xvcvdpuxds vs7, vs7 -; CHECK-P9-NEXT: stxv vs3, 80(r3) -; CHECK-P9-NEXT: xvcvdpuxds vs2, vs2 +; CHECK-P9-NEXT: xvcvdpuxds vs0, vs0 +; CHECK-P9-NEXT: stxv vs0, 112(r3) ; CHECK-P9-NEXT: stxv vs7, 96(r3) -; CHECK-P9-NEXT: stxv vs2, 112(r3) -; CHECK-P9-NEXT: stxv vs4, 48(r3) -; CHECK-P9-NEXT: stxv vs5, 32(r3) -; CHECK-P9-NEXT: stxv vs0, 16(r3) -; CHECK-P9-NEXT: stxv vs1, 0(r3) +; CHECK-P9-NEXT: stxv vs5, 80(r3) +; CHECK-P9-NEXT: stxv vs6, 64(r3) +; CHECK-P9-NEXT: stxv vs3, 48(r3) +; CHECK-P9-NEXT: stxv vs4, 32(r3) +; CHECK-P9-NEXT: stxv vs1, 16(r3) +; CHECK-P9-NEXT: stxv vs2, 0(r3) ; CHECK-P9-NEXT: blr ; ; CHECK-BE-LABEL: test16elt_signed: ; CHECK-BE: # %bb.0: # %entry -; CHECK-BE-NEXT: lxv vs0, 0(r4) -; CHECK-BE-NEXT: lxv vs4, 16(r4) -; 
CHECK-BE-NEXT: xxsldwi vs2, vs0, vs0, 1 -; CHECK-BE-NEXT: xscvspdpn f1, vs0 -; CHECK-BE-NEXT: xxsldwi vs5, vs0, vs0, 3 -; CHECK-BE-NEXT: xxswapd vs0, vs0 -; CHECK-BE-NEXT: xscvspdpn f5, vs5 -; CHECK-BE-NEXT: xscvspdpn f0, vs0 -; CHECK-BE-NEXT: xxsldwi vs6, vs4, vs4, 1 -; CHECK-BE-NEXT: xscvspdpn f6, vs6 -; CHECK-BE-NEXT: xxmrghd vs0, vs0, vs5 -; CHECK-BE-NEXT: xscvspdpn f5, vs4 -; CHECK-BE-NEXT: lxv vs3, 32(r4) -; CHECK-BE-NEXT: xxsldwi vs7, vs3, vs3, 1 -; CHECK-BE-NEXT: xscvspdpn f7, vs7 -; CHECK-BE-NEXT: xxmrghd vs5, vs5, vs6 -; CHECK-BE-NEXT: xxsldwi vs6, vs4, vs4, 3 -; CHECK-BE-NEXT: xxswapd vs4, vs4 -; CHECK-BE-NEXT: xscvspdpn f6, vs6 -; CHECK-BE-NEXT: xscvspdpn f4, vs4 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xxmrghd vs1, vs1, vs2 -; CHECK-BE-NEXT: lxv vs2, 48(r4) -; CHECK-BE-NEXT: xxsldwi vs8, vs2, vs2, 1 +; CHECK-BE-NEXT: lxv vs0, 48(r4) +; CHECK-BE-NEXT: lxv vs1, 0(r4) +; CHECK-BE-NEXT: lxv vs3, 16(r4) +; CHECK-BE-NEXT: lxv vs5, 32(r4) +; CHECK-BE-NEXT: xxmrghw vs2, vs1, vs1 +; CHECK-BE-NEXT: xxmrglw vs1, vs1, vs1 +; CHECK-BE-NEXT: xxmrghw vs4, vs3, vs3 +; CHECK-BE-NEXT: xxmrglw vs3, vs3, vs3 +; CHECK-BE-NEXT: xxmrghw vs6, vs5, vs5 +; CHECK-BE-NEXT: xxmrglw vs5, vs5, vs5 +; CHECK-BE-NEXT: xxmrghw vs7, vs0, vs0 +; CHECK-BE-NEXT: xxmrglw vs0, vs0, vs0 +; CHECK-BE-NEXT: xvcvspdp vs2, vs2 +; CHECK-BE-NEXT: xvcvspdp vs1, vs1 +; CHECK-BE-NEXT: xvcvspdp vs4, vs4 +; CHECK-BE-NEXT: xvcvspdp vs3, vs3 +; CHECK-BE-NEXT: xvcvspdp vs6, vs6 +; CHECK-BE-NEXT: xvcvspdp vs5, vs5 +; CHECK-BE-NEXT: xvcvspdp vs7, vs7 +; CHECK-BE-NEXT: xvcvspdp vs0, vs0 +; CHECK-BE-NEXT: xvcvdpuxds vs2, vs2 ; CHECK-BE-NEXT: xvcvdpuxds vs1, vs1 -; CHECK-BE-NEXT: xvcvdpuxds vs0, vs0 -; CHECK-BE-NEXT: xvcvdpuxds vs5, vs5 -; CHECK-BE-NEXT: xscvspdpn f8, vs8 -; CHECK-BE-NEXT: xxmrghd vs4, vs4, vs6 -; CHECK-BE-NEXT: xscvspdpn f6, vs3 -; CHECK-BE-NEXT: stxv vs0, 16(r3) -; CHECK-BE-NEXT: xxmrghd vs6, vs6, vs7 -; CHECK-BE-NEXT: xxsldwi vs7, vs3, vs3, 3 -; CHECK-BE-NEXT: xxswapd vs3, vs3 -; CHECK-BE-NEXT: xscvspdpn f7, vs7 -; CHECK-BE-NEXT: xscvspdpn f3, vs3 -; CHECK-BE-NEXT: xxmrghd vs3, vs3, vs7 -; CHECK-BE-NEXT: xscvspdpn f7, vs2 -; CHECK-BE-NEXT: xxmrghd vs7, vs7, vs8 -; CHECK-BE-NEXT: xxsldwi vs8, vs2, vs2, 3 -; CHECK-BE-NEXT: xxswapd vs2, vs2 -; CHECK-BE-NEXT: xscvspdpn f8, vs8 -; CHECK-BE-NEXT: xscvspdpn f2, vs2 -; CHECK-BE-NEXT: xxmrghd vs2, vs2, vs8 -; CHECK-BE-NEXT: stxv vs5, 32(r3) ; CHECK-BE-NEXT: xvcvdpuxds vs4, vs4 -; CHECK-BE-NEXT: xvcvdpuxds vs6, vs6 ; CHECK-BE-NEXT: xvcvdpuxds vs3, vs3 +; CHECK-BE-NEXT: xvcvdpuxds vs6, vs6 +; CHECK-BE-NEXT: xvcvdpuxds vs5, vs5 ; CHECK-BE-NEXT: xvcvdpuxds vs7, vs7 -; CHECK-BE-NEXT: stxv vs3, 80(r3) +; CHECK-BE-NEXT: xvcvdpuxds vs0, vs0 +; CHECK-BE-NEXT: stxv vs0, 112(r3) ; CHECK-BE-NEXT: stxv vs7, 96(r3) -; CHECK-BE-NEXT: xvcvdpuxds vs2, vs2 -; CHECK-BE-NEXT: stxv vs2, 112(r3) +; CHECK-BE-NEXT: stxv vs5, 80(r3) ; CHECK-BE-NEXT: stxv vs6, 64(r3) -; CHECK-BE-NEXT: stxv vs4, 48(r3) -; CHECK-BE-NEXT: stxv vs1, 0(r3) +; CHECK-BE-NEXT: stxv vs3, 48(r3) +; CHECK-BE-NEXT: stxv vs4, 32(r3) +; CHECK-BE-NEXT: stxv vs1, 16(r3) +; CHECK-BE-NEXT: stxv vs2, 0(r3) ; CHECK-BE-NEXT: blr entry: %a = load <16 x float>, <16 x float>* %0, align 64 diff --git a/llvm/test/CodeGen/PowerPC/vmladduhm.ll b/llvm/test/CodeGen/PowerPC/vmladduhm.ll new file mode 100644 index 0000000000000..f2475d9e24907 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/vmladduhm.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs 
-mcpu=pwr9 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-P9 +; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-P8 +define <8 x i16> @mul(<8 x i16> %m, <8 x i16> %n) { +; CHECK-LABEL: mul: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor 36, 36, 36 +; CHECK-NEXT: vmladduhm 2, 2, 3, 4 +; CHECK-NEXT: blr +entry: + %0 = mul <8 x i16> %m, %n + ret <8 x i16> %0 +} + +define <8 x i16> @madd(<8 x i16> %m, <8 x i16> %n, <8 x i16> %o) { +; CHECK-LABEL: madd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor 37, 37, 37 +; CHECK-NEXT: vmladduhm 2, 2, 3, 5 +; CHECK-NEXT: vadduhm 2, 2, 4 +; CHECK-NEXT: blr +entry: + %0 = mul <8 x i16> %m, %n + %1 = add <8 x i16> %0, %o + ret <8 x i16> %1 +} diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll index 9923cb02cc8d8..7cf43a92a5dc1 100644 --- a/llvm/test/CodeGen/PowerPC/vsx.ll +++ b/llvm/test/CodeGen/PowerPC/vsx.ll @@ -1554,11 +1554,8 @@ define <2 x i64> @test46(<2 x float> %a) { ; ; CHECK-LE-LABEL: test46: ; CHECK-LE: # %bb.0: -; CHECK-LE-NEXT: xxsldwi vs0, v2, v2, 3 -; CHECK-LE-NEXT: xxswapd vs1, v2 -; CHECK-LE-NEXT: xscvspdpn f0, vs0 -; CHECK-LE-NEXT: xscvspdpn f1, vs1 -; CHECK-LE-NEXT: xxmrghd vs0, vs1, vs0 +; CHECK-LE-NEXT: xxmrglw vs0, v2, v2 +; CHECK-LE-NEXT: xvcvspdp vs0, vs0 ; CHECK-LE-NEXT: xvcvdpuxds v2, vs0 ; CHECK-LE-NEXT: blr %v = fptoui <2 x float> %a to <2 x i64> @@ -1625,11 +1622,8 @@ define <2 x i64> @test47(<2 x float> %a) { ; ; CHECK-LE-LABEL: test47: ; CHECK-LE: # %bb.0: -; CHECK-LE-NEXT: xxsldwi vs0, v2, v2, 3 -; CHECK-LE-NEXT: xxswapd vs1, v2 -; CHECK-LE-NEXT: xscvspdpn f0, vs0 -; CHECK-LE-NEXT: xscvspdpn f1, vs1 -; CHECK-LE-NEXT: xxmrghd vs0, vs1, vs0 +; CHECK-LE-NEXT: xxmrglw vs0, v2, v2 +; CHECK-LE-NEXT: xvcvspdp vs0, vs0 ; CHECK-LE-NEXT: xvcvdpsxds v2, vs0 ; CHECK-LE-NEXT: blr %v = fptosi <2 x float> %a to <2 x i64> diff --git a/llvm/test/CodeGen/RISCV/double-arith.ll b/llvm/test/CodeGen/RISCV/double-arith.ll index ad68dca154dc2..604911ae49f2c 100644 --- a/llvm/test/CodeGen/RISCV/double-arith.ll +++ b/llvm/test/CodeGen/RISCV/double-arith.ll @@ -460,9 +460,7 @@ define double @fmsub_d(double %a, double %b, double %c) nounwind { ; RV32IFD-NEXT: sw a4, 8(sp) ; RV32IFD-NEXT: sw a5, 12(sp) ; RV32IFD-NEXT: fld ft2, 8(sp) -; RV32IFD-NEXT: lui a0, %hi(.LCPI15_0) -; RV32IFD-NEXT: addi a0, a0, %lo(.LCPI15_0) -; RV32IFD-NEXT: fld ft3, 0(a0) +; RV32IFD-NEXT: fcvt.d.w ft3, zero ; RV32IFD-NEXT: fadd.d ft2, ft2, ft3 ; RV32IFD-NEXT: fmsub.d ft0, ft1, ft0, ft2 ; RV32IFD-NEXT: fsd ft0, 8(sp) @@ -473,14 +471,12 @@ define double @fmsub_d(double %a, double %b, double %c) nounwind { ; ; RV64IFD-LABEL: fmsub_d: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lui a3, %hi(.LCPI15_0) -; RV64IFD-NEXT: addi a3, a3, %lo(.LCPI15_0) -; RV64IFD-NEXT: fld ft0, 0(a3) -; RV64IFD-NEXT: fmv.d.x ft1, a1 -; RV64IFD-NEXT: fmv.d.x ft2, a0 -; RV64IFD-NEXT: fmv.d.x ft3, a2 -; RV64IFD-NEXT: fadd.d ft0, ft3, ft0 -; RV64IFD-NEXT: fmsub.d ft0, ft2, ft1, ft0 +; RV64IFD-NEXT: fmv.d.x ft0, a1 +; RV64IFD-NEXT: fmv.d.x ft1, a0 +; RV64IFD-NEXT: fmv.d.x ft2, a2 +; RV64IFD-NEXT: fmv.d.x ft3, zero +; RV64IFD-NEXT: fadd.d ft2, ft2, ft3 +; RV64IFD-NEXT: fmsub.d ft0, ft1, ft0, ft2 ; RV64IFD-NEXT: fmv.x.d a0, ft0 ; RV64IFD-NEXT: ret %c_ = fadd double 0.0, %c ; avoid negation using xor @@ -502,9 +498,7 @@ define double @fnmadd_d(double %a, double %b, double %c) nounwind { ; RV32IFD-NEXT: sw a0, 8(sp) ; RV32IFD-NEXT: sw a1, 12(sp) ; RV32IFD-NEXT: fld 
ft2, 8(sp) -; RV32IFD-NEXT: lui a0, %hi(.LCPI16_0) -; RV32IFD-NEXT: addi a0, a0, %lo(.LCPI16_0) -; RV32IFD-NEXT: fld ft3, 0(a0) +; RV32IFD-NEXT: fcvt.d.w ft3, zero ; RV32IFD-NEXT: fadd.d ft2, ft2, ft3 ; RV32IFD-NEXT: fadd.d ft1, ft1, ft3 ; RV32IFD-NEXT: fnmadd.d ft0, ft2, ft0, ft1 @@ -516,15 +510,13 @@ define double @fnmadd_d(double %a, double %b, double %c) nounwind { ; ; RV64IFD-LABEL: fnmadd_d: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lui a3, %hi(.LCPI16_0) -; RV64IFD-NEXT: addi a3, a3, %lo(.LCPI16_0) -; RV64IFD-NEXT: fld ft0, 0(a3) -; RV64IFD-NEXT: fmv.d.x ft1, a1 -; RV64IFD-NEXT: fmv.d.x ft2, a2 -; RV64IFD-NEXT: fmv.d.x ft3, a0 -; RV64IFD-NEXT: fadd.d ft3, ft3, ft0 -; RV64IFD-NEXT: fadd.d ft0, ft2, ft0 -; RV64IFD-NEXT: fnmadd.d ft0, ft3, ft1, ft0 +; RV64IFD-NEXT: fmv.d.x ft0, a1 +; RV64IFD-NEXT: fmv.d.x ft1, a2 +; RV64IFD-NEXT: fmv.d.x ft2, a0 +; RV64IFD-NEXT: fmv.d.x ft3, zero +; RV64IFD-NEXT: fadd.d ft2, ft2, ft3 +; RV64IFD-NEXT: fadd.d ft1, ft1, ft3 +; RV64IFD-NEXT: fnmadd.d ft0, ft2, ft0, ft1 ; RV64IFD-NEXT: fmv.x.d a0, ft0 ; RV64IFD-NEXT: ret %a_ = fadd double 0.0, %a @@ -548,9 +540,7 @@ define double @fnmsub_d(double %a, double %b, double %c) nounwind { ; RV32IFD-NEXT: sw a0, 8(sp) ; RV32IFD-NEXT: sw a1, 12(sp) ; RV32IFD-NEXT: fld ft2, 8(sp) -; RV32IFD-NEXT: lui a0, %hi(.LCPI17_0) -; RV32IFD-NEXT: addi a0, a0, %lo(.LCPI17_0) -; RV32IFD-NEXT: fld ft3, 0(a0) +; RV32IFD-NEXT: fcvt.d.w ft3, zero ; RV32IFD-NEXT: fadd.d ft2, ft2, ft3 ; RV32IFD-NEXT: fnmsub.d ft0, ft2, ft1, ft0 ; RV32IFD-NEXT: fsd ft0, 8(sp) @@ -561,14 +551,12 @@ define double @fnmsub_d(double %a, double %b, double %c) nounwind { ; ; RV64IFD-LABEL: fnmsub_d: ; RV64IFD: # %bb.0: -; RV64IFD-NEXT: lui a3, %hi(.LCPI17_0) -; RV64IFD-NEXT: addi a3, a3, %lo(.LCPI17_0) -; RV64IFD-NEXT: fld ft0, 0(a3) -; RV64IFD-NEXT: fmv.d.x ft1, a2 -; RV64IFD-NEXT: fmv.d.x ft2, a1 -; RV64IFD-NEXT: fmv.d.x ft3, a0 -; RV64IFD-NEXT: fadd.d ft0, ft3, ft0 -; RV64IFD-NEXT: fnmsub.d ft0, ft0, ft2, ft1 +; RV64IFD-NEXT: fmv.d.x ft0, a2 +; RV64IFD-NEXT: fmv.d.x ft1, a1 +; RV64IFD-NEXT: fmv.d.x ft2, a0 +; RV64IFD-NEXT: fmv.d.x ft3, zero +; RV64IFD-NEXT: fadd.d ft2, ft2, ft3 +; RV64IFD-NEXT: fnmsub.d ft0, ft2, ft1, ft0 ; RV64IFD-NEXT: fmv.x.d a0, ft0 ; RV64IFD-NEXT: ret %a_ = fadd double 0.0, %a diff --git a/llvm/test/CodeGen/RISCV/float-arith.ll b/llvm/test/CodeGen/RISCV/float-arith.ll index 5244a69a6fad0..f22f85d5d7908 100644 --- a/llvm/test/CodeGen/RISCV/float-arith.ll +++ b/llvm/test/CodeGen/RISCV/float-arith.ll @@ -339,27 +339,23 @@ define float @fmadd_s(float %a, float %b, float %c) nounwind { define float @fmsub_s(float %a, float %b, float %c) nounwind { ; RV32IF-LABEL: fmsub_s: ; RV32IF: # %bb.0: -; RV32IF-NEXT: lui a3, %hi(.LCPI15_0) -; RV32IF-NEXT: addi a3, a3, %lo(.LCPI15_0) -; RV32IF-NEXT: flw ft0, 0(a3) -; RV32IF-NEXT: fmv.w.x ft1, a1 -; RV32IF-NEXT: fmv.w.x ft2, a0 -; RV32IF-NEXT: fmv.w.x ft3, a2 -; RV32IF-NEXT: fadd.s ft0, ft3, ft0 -; RV32IF-NEXT: fmsub.s ft0, ft2, ft1, ft0 +; RV32IF-NEXT: fmv.w.x ft0, a1 +; RV32IF-NEXT: fmv.w.x ft1, a0 +; RV32IF-NEXT: fmv.w.x ft2, a2 +; RV32IF-NEXT: fmv.w.x ft3, zero +; RV32IF-NEXT: fadd.s ft2, ft2, ft3 +; RV32IF-NEXT: fmsub.s ft0, ft1, ft0, ft2 ; RV32IF-NEXT: fmv.x.w a0, ft0 ; RV32IF-NEXT: ret ; ; RV64IF-LABEL: fmsub_s: ; RV64IF: # %bb.0: -; RV64IF-NEXT: lui a3, %hi(.LCPI15_0) -; RV64IF-NEXT: addi a3, a3, %lo(.LCPI15_0) -; RV64IF-NEXT: flw ft0, 0(a3) -; RV64IF-NEXT: fmv.w.x ft1, a1 -; RV64IF-NEXT: fmv.w.x ft2, a0 -; RV64IF-NEXT: fmv.w.x ft3, a2 -; RV64IF-NEXT: fadd.s ft0, ft3, ft0 -; RV64IF-NEXT: fmsub.s 
ft0, ft2, ft1, ft0 +; RV64IF-NEXT: fmv.w.x ft0, a1 +; RV64IF-NEXT: fmv.w.x ft1, a0 +; RV64IF-NEXT: fmv.w.x ft2, a2 +; RV64IF-NEXT: fmv.w.x ft3, zero +; RV64IF-NEXT: fadd.s ft2, ft2, ft3 +; RV64IF-NEXT: fmsub.s ft0, ft1, ft0, ft2 ; RV64IF-NEXT: fmv.x.w a0, ft0 ; RV64IF-NEXT: ret %c_ = fadd float 0.0, %c ; avoid negation using xor @@ -371,29 +367,25 @@ define float @fmsub_s(float %a, float %b, float %c) nounwind { define float @fnmadd_s(float %a, float %b, float %c) nounwind { ; RV32IF-LABEL: fnmadd_s: ; RV32IF: # %bb.0: -; RV32IF-NEXT: lui a3, %hi(.LCPI16_0) -; RV32IF-NEXT: addi a3, a3, %lo(.LCPI16_0) -; RV32IF-NEXT: flw ft0, 0(a3) -; RV32IF-NEXT: fmv.w.x ft1, a1 -; RV32IF-NEXT: fmv.w.x ft2, a2 -; RV32IF-NEXT: fmv.w.x ft3, a0 -; RV32IF-NEXT: fadd.s ft3, ft3, ft0 -; RV32IF-NEXT: fadd.s ft0, ft2, ft0 -; RV32IF-NEXT: fnmadd.s ft0, ft3, ft1, ft0 +; RV32IF-NEXT: fmv.w.x ft0, a1 +; RV32IF-NEXT: fmv.w.x ft1, a2 +; RV32IF-NEXT: fmv.w.x ft2, a0 +; RV32IF-NEXT: fmv.w.x ft3, zero +; RV32IF-NEXT: fadd.s ft2, ft2, ft3 +; RV32IF-NEXT: fadd.s ft1, ft1, ft3 +; RV32IF-NEXT: fnmadd.s ft0, ft2, ft0, ft1 ; RV32IF-NEXT: fmv.x.w a0, ft0 ; RV32IF-NEXT: ret ; ; RV64IF-LABEL: fnmadd_s: ; RV64IF: # %bb.0: -; RV64IF-NEXT: lui a3, %hi(.LCPI16_0) -; RV64IF-NEXT: addi a3, a3, %lo(.LCPI16_0) -; RV64IF-NEXT: flw ft0, 0(a3) -; RV64IF-NEXT: fmv.w.x ft1, a1 -; RV64IF-NEXT: fmv.w.x ft2, a2 -; RV64IF-NEXT: fmv.w.x ft3, a0 -; RV64IF-NEXT: fadd.s ft3, ft3, ft0 -; RV64IF-NEXT: fadd.s ft0, ft2, ft0 -; RV64IF-NEXT: fnmadd.s ft0, ft3, ft1, ft0 +; RV64IF-NEXT: fmv.w.x ft0, a1 +; RV64IF-NEXT: fmv.w.x ft1, a2 +; RV64IF-NEXT: fmv.w.x ft2, a0 +; RV64IF-NEXT: fmv.w.x ft3, zero +; RV64IF-NEXT: fadd.s ft2, ft2, ft3 +; RV64IF-NEXT: fadd.s ft1, ft1, ft3 +; RV64IF-NEXT: fnmadd.s ft0, ft2, ft0, ft1 ; RV64IF-NEXT: fmv.x.w a0, ft0 ; RV64IF-NEXT: ret %a_ = fadd float 0.0, %a @@ -407,27 +399,23 @@ define float @fnmadd_s(float %a, float %b, float %c) nounwind { define float @fnmsub_s(float %a, float %b, float %c) nounwind { ; RV32IF-LABEL: fnmsub_s: ; RV32IF: # %bb.0: -; RV32IF-NEXT: lui a3, %hi(.LCPI17_0) -; RV32IF-NEXT: addi a3, a3, %lo(.LCPI17_0) -; RV32IF-NEXT: flw ft0, 0(a3) -; RV32IF-NEXT: fmv.w.x ft1, a2 -; RV32IF-NEXT: fmv.w.x ft2, a1 -; RV32IF-NEXT: fmv.w.x ft3, a0 -; RV32IF-NEXT: fadd.s ft0, ft3, ft0 -; RV32IF-NEXT: fnmsub.s ft0, ft0, ft2, ft1 +; RV32IF-NEXT: fmv.w.x ft0, a2 +; RV32IF-NEXT: fmv.w.x ft1, a1 +; RV32IF-NEXT: fmv.w.x ft2, a0 +; RV32IF-NEXT: fmv.w.x ft3, zero +; RV32IF-NEXT: fadd.s ft2, ft2, ft3 +; RV32IF-NEXT: fnmsub.s ft0, ft2, ft1, ft0 ; RV32IF-NEXT: fmv.x.w a0, ft0 ; RV32IF-NEXT: ret ; ; RV64IF-LABEL: fnmsub_s: ; RV64IF: # %bb.0: -; RV64IF-NEXT: lui a3, %hi(.LCPI17_0) -; RV64IF-NEXT: addi a3, a3, %lo(.LCPI17_0) -; RV64IF-NEXT: flw ft0, 0(a3) -; RV64IF-NEXT: fmv.w.x ft1, a2 -; RV64IF-NEXT: fmv.w.x ft2, a1 -; RV64IF-NEXT: fmv.w.x ft3, a0 -; RV64IF-NEXT: fadd.s ft0, ft3, ft0 -; RV64IF-NEXT: fnmsub.s ft0, ft0, ft2, ft1 +; RV64IF-NEXT: fmv.w.x ft0, a2 +; RV64IF-NEXT: fmv.w.x ft1, a1 +; RV64IF-NEXT: fmv.w.x ft2, a0 +; RV64IF-NEXT: fmv.w.x ft3, zero +; RV64IF-NEXT: fadd.s ft2, ft2, ft3 +; RV64IF-NEXT: fnmsub.s ft0, ft2, ft1, ft0 ; RV64IF-NEXT: fmv.x.w a0, ft0 ; RV64IF-NEXT: ret %a_ = fadd float 0.0, %a diff --git a/llvm/test/CodeGen/RISCV/float-br-fcmp.ll b/llvm/test/CodeGen/RISCV/float-br-fcmp.ll index a1e5b32eeaffc..dede086fed853 100644 --- a/llvm/test/CodeGen/RISCV/float-br-fcmp.ll +++ b/llvm/test/CodeGen/RISCV/float-br-fcmp.ll @@ -720,10 +720,8 @@ define i32 @br_fcmp_store_load_stack_slot(float %a, float %b) nounwind { ; 
RV32IF-NEXT: sw ra, 12(sp) ; RV32IF-NEXT: mv a0, zero ; RV32IF-NEXT: call dummy -; RV32IF-NEXT: lui a1, %hi(.LCPI17_0) -; RV32IF-NEXT: addi a1, a1, %lo(.LCPI17_0) -; RV32IF-NEXT: flw ft1, 0(a1) ; RV32IF-NEXT: fmv.w.x ft0, a0 +; RV32IF-NEXT: fmv.w.x ft1, zero ; RV32IF-NEXT: fsw ft1, 8(sp) ; RV32IF-NEXT: feq.s a0, ft0, ft1 ; RV32IF-NEXT: beqz a0, .LBB17_3 @@ -747,9 +745,7 @@ define i32 @br_fcmp_store_load_stack_slot(float %a, float %b) nounwind { ; RV64IF-NEXT: addi sp, sp, -32 ; RV64IF-NEXT: sd ra, 24(sp) ; RV64IF-NEXT: sd s0, 16(sp) -; RV64IF-NEXT: lui a0, %hi(.LCPI17_0) -; RV64IF-NEXT: addi a0, a0, %lo(.LCPI17_0) -; RV64IF-NEXT: flw ft0, 0(a0) +; RV64IF-NEXT: fmv.w.x ft0, zero ; RV64IF-NEXT: fsw ft0, 12(sp) ; RV64IF-NEXT: fmv.x.w s0, ft0 ; RV64IF-NEXT: mv a0, s0 diff --git a/llvm/test/CodeGen/RISCV/fp-imm.ll b/llvm/test/CodeGen/RISCV/fp-imm.ll new file mode 100644 index 0000000000000..f70eaab625f65 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/fp-imm.ll @@ -0,0 +1,116 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -target-abi ilp32f -mattr=+f < %s \ +; RUN: | FileCheck --check-prefix=RV32F %s +; RUN: llc -mtriple=riscv32 -target-abi ilp32d -mattr=+f,+d < %s \ +; RUN: | FileCheck --check-prefix=RV32D %s +; RUN: llc -mtriple=riscv64 -target-abi lp64f -mattr=+f < %s \ +; RUN: | FileCheck --check-prefix=RV64F %s +; RUN: llc -mtriple=riscv64 -target-abi lp64d -mattr=+f,+d < %s \ +; RUN: | FileCheck --check-prefix=RV64D %s + +define float @f32_positive_zero(float *%pf) nounwind { +; RV32F-LABEL: f32_positive_zero: +; RV32F: # %bb.0: +; RV32F-NEXT: fmv.w.x fa0, zero +; RV32F-NEXT: ret +; +; RV32D-LABEL: f32_positive_zero: +; RV32D: # %bb.0: +; RV32D-NEXT: fmv.w.x fa0, zero +; RV32D-NEXT: ret +; +; RV64F-LABEL: f32_positive_zero: +; RV64F: # %bb.0: +; RV64F-NEXT: fmv.w.x fa0, zero +; RV64F-NEXT: ret +; +; RV64D-LABEL: f32_positive_zero: +; RV64D: # %bb.0: +; RV64D-NEXT: fmv.w.x fa0, zero +; RV64D-NEXT: ret + ret float 0.0 +} + +define float @f32_negative_zero(float *%pf) nounwind { +; RV32F-LABEL: f32_negative_zero: +; RV32F: # %bb.0: +; RV32F-NEXT: lui a0, %hi(.LCPI1_0) +; RV32F-NEXT: addi a0, a0, %lo(.LCPI1_0) +; RV32F-NEXT: flw fa0, 0(a0) +; RV32F-NEXT: ret +; +; RV32D-LABEL: f32_negative_zero: +; RV32D: # %bb.0: +; RV32D-NEXT: lui a0, %hi(.LCPI1_0) +; RV32D-NEXT: addi a0, a0, %lo(.LCPI1_0) +; RV32D-NEXT: flw fa0, 0(a0) +; RV32D-NEXT: ret +; +; RV64F-LABEL: f32_negative_zero: +; RV64F: # %bb.0: +; RV64F-NEXT: lui a0, %hi(.LCPI1_0) +; RV64F-NEXT: addi a0, a0, %lo(.LCPI1_0) +; RV64F-NEXT: flw fa0, 0(a0) +; RV64F-NEXT: ret +; +; RV64D-LABEL: f32_negative_zero: +; RV64D: # %bb.0: +; RV64D-NEXT: lui a0, %hi(.LCPI1_0) +; RV64D-NEXT: addi a0, a0, %lo(.LCPI1_0) +; RV64D-NEXT: flw fa0, 0(a0) +; RV64D-NEXT: ret + ret float -0.0 +} + +define double @f64_positive_zero(double *%pd) nounwind { +; RV32F-LABEL: f64_positive_zero: +; RV32F: # %bb.0: +; RV32F-NEXT: mv a0, zero +; RV32F-NEXT: mv a1, zero +; RV32F-NEXT: ret +; +; RV32D-LABEL: f64_positive_zero: +; RV32D: # %bb.0: +; RV32D-NEXT: fcvt.d.w fa0, zero +; RV32D-NEXT: ret +; +; RV64F-LABEL: f64_positive_zero: +; RV64F: # %bb.0: +; RV64F-NEXT: mv a0, zero +; RV64F-NEXT: ret +; +; RV64D-LABEL: f64_positive_zero: +; RV64D: # %bb.0: +; RV64D-NEXT: fmv.d.x fa0, zero +; RV64D-NEXT: ret + ret double 0.0 +} + +define double @f64_negative_zero(double *%pd) nounwind { +; RV32F-LABEL: f64_negative_zero: +; RV32F: # %bb.0: +; RV32F-NEXT: lui a1, 524288 +; RV32F-NEXT: mv a0, zero +; RV32F-NEXT: ret +; +; 
RV32D-LABEL: f64_negative_zero: +; RV32D: # %bb.0: +; RV32D-NEXT: lui a0, %hi(.LCPI3_0) +; RV32D-NEXT: addi a0, a0, %lo(.LCPI3_0) +; RV32D-NEXT: fld fa0, 0(a0) +; RV32D-NEXT: ret +; +; RV64F-LABEL: f64_negative_zero: +; RV64F: # %bb.0: +; RV64F-NEXT: addi a0, zero, -1 +; RV64F-NEXT: slli a0, a0, 63 +; RV64F-NEXT: ret +; +; RV64D-LABEL: f64_negative_zero: +; RV64D: # %bb.0: +; RV64D-NEXT: lui a0, %hi(.LCPI3_0) +; RV64D-NEXT: addi a0, a0, %lo(.LCPI3_0) +; RV64D-NEXT: fld fa0, 0(a0) +; RV64D-NEXT: ret + ret double -0.0 +} diff --git a/llvm/test/CodeGen/SystemZ/ipra-04.ll b/llvm/test/CodeGen/SystemZ/ipra-04.ll index 55a849e186a9d..71ed6d3ff9827 100644 --- a/llvm/test/CodeGen/SystemZ/ipra-04.ll +++ b/llvm/test/CodeGen/SystemZ/ipra-04.ll @@ -6,7 +6,8 @@ ; REQUIRES: asserts ; ; DBG: fun1 function optimized for not having CSR -; DBG: Call Instruction After Register Usage Info Propagation : CallBRASL @fun1{{.*}} $r14d $r15d +; DBG: Call Instruction After Register Usage Info Propagation : +; DBG-NEXT: CallBRASL @fun1{{.*}} $r14d $r15d declare dso_local fastcc signext i32 @foo(i16*, i32 signext) unnamed_addr diff --git a/llvm/test/CodeGen/SystemZ/shorten-fused-fp-ops.mir b/llvm/test/CodeGen/SystemZ/shorten-fused-fp-ops.mir new file mode 100644 index 0000000000000..c55bb7881004e --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/shorten-fused-fp-ops.mir @@ -0,0 +1,60 @@ +# RUN: llc -mtriple=s390x-linux-gnu -mcpu=z14 -start-before=postrapseudos %s -o - \ +# RUN: | FileCheck %s +# +# Test shortening of fused fp operations. + +--- | + define double @fun0(double %f1, double %f2, double %acc) { ret double 0.0 } +... + +# CHECK-LABEL: fun0: +# CHECK: madbr %f0, %f4, %f2 +# CHECK-NEXT: wfmadb %f0, %f4, %v16, %f0 +# CHECK-NEXT: wfmadb %f0, %f4, %f0, %f2 +# CHECK-NEXT: maebr %f0, %f4, %f2 +# CHECK-NEXT: wfmasb %f0, %f4, %v16, %f0 +# CHECK-NEXT: wfmasb %f0, %f4, %f0, %f2 +# CHECK-NEXT: msdbr %f0, %f4, %f2 +# CHECK-NEXT: wfmsdb %f0, %f4, %v16, %f0 +# CHECK-NEXT: wfmsdb %f0, %f4, %f0, %f2 +# CHECK-NEXT: msebr %f0, %f4, %f2 +# CHECK-NEXT: wfmssb %f0, %f4, %v16, %f0 +# CHECK-NEXT: wfmssb %f0, %f4, %f0, %f2 +# CHECK-NEXT: br %r14 +--- +name: fun0 +alignment: 16 +tracksRegLiveness: true +liveins: + - { reg: '$f0d' } + - { reg: '$f2d' } + - { reg: '$f4d' } +frameInfo: + maxAlignment: 1 + maxCallFrameSize: 0 +fixedStack: + - { id: 0, offset: -160, size: 8, alignment: 8 } +machineFunctionInfo: {} +body: | + bb.0 (%ir-block.0): + liveins: $f0d, $f2d, $f4d, $f16d + + renamable $f0d = nofpexcept WFMADB renamable $f4d, renamable $f2d, renamable $f0d, implicit $fpc + renamable $f0d = nofpexcept WFMADB renamable $f4d, renamable $f16d, renamable $f0d, implicit $fpc + renamable $f0d = nofpexcept WFMADB renamable $f4d, renamable $f0d, renamable $f2d, implicit $fpc + + renamable $f0s = nofpexcept WFMASB renamable $f4s, renamable $f2s, renamable $f0s, implicit $fpc + renamable $f0s = nofpexcept WFMASB renamable $f4s, renamable $f16s, renamable $f0s, implicit $fpc + renamable $f0s = nofpexcept WFMASB renamable $f4s, renamable $f0s, renamable $f2s, implicit $fpc + + renamable $f0d = nofpexcept WFMSDB renamable $f4d, renamable $f2d, renamable $f0d, implicit $fpc + renamable $f0d = nofpexcept WFMSDB renamable $f4d, renamable $f16d, renamable $f0d, implicit $fpc + renamable $f0d = nofpexcept WFMSDB renamable $f4d, renamable $f0d, renamable $f2d, implicit $fpc + + renamable $f0s = nofpexcept WFMSSB renamable $f4s, renamable $f2s, renamable $f0s, implicit $fpc + renamable $f0s = nofpexcept WFMSSB renamable $f4s, renamable $f16s, renamable 
$f0s, implicit $fpc + renamable $f0s = nofpexcept WFMSSB renamable $f4s, renamable $f0s, renamable $f2s, implicit $fpc + + Return implicit $f0d + +... diff --git a/llvm/test/CodeGen/Thumb/frame-access.ll b/llvm/test/CodeGen/Thumb/frame-access.ll index ff1d57db94483..8513982da9a9b 100644 --- a/llvm/test/CodeGen/Thumb/frame-access.ll +++ b/llvm/test/CodeGen/Thumb/frame-access.ll @@ -404,8 +404,8 @@ entry: ; CHECK-NEXT: sub sp, #508 ; CHECK-NEXT: sub sp, #8 ; Argument addresses computed relative to BP -; CHECK: adds r0, r6, #7 -; CHECK-NEXT: adds r0, #13 +; CHECK: adds r4, r6, #7 +; CHECK-NEXT: adds r4, #13 ; CHECK: adds r1, r6, #7 ; CHECK-NEXT: adds r1, #9 ; CHECK: adds r5, r6, #7 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll index ca6dfe186e05d..2dc9ece2e348a 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -400,18 +400,16 @@ define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalia ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB4_1: @ %bb9 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 +; CHECK-NEXT: vldrwt.u32 q1, [r0] ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrwt.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %bb27 ; CHECK-NEXT: pop {r7, pc} @@ -464,13 +462,12 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i ; CHECK-NEXT: bic r12, r12, #3 ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: add.w lr, lr, r12, lsr #2 -; CHECK-NEXT: mov r12, r0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_1: @ %bb12 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r12], #16 +; CHECK-NEXT: vldrwt.u32 q0, [r0] ; CHECK-NEXT: vpttt.i32 ne, q0, zr ; CHECK-NEXT: vcmpt.s32 le, q0, r2 ; CHECK-NEXT: vctpt.32 r3 @@ -478,8 +475,7 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i ; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0] -; CHECK-NEXT: mov r0, r12 +; CHECK-NEXT: vstrwt.32 q0, [r0], #16 ; CHECK-NEXT: le lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %bb32 ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir new file mode 100644 index 0000000000000..38d7567505025 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/ctlz-non-zeros.mir @@ -0,0 +1,330 @@ +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +# CHECK-NOT: LETP + +--- | + define arm_aapcs_vfpcc void @test_ctlz_i8(<8 x i16>* %a, <8 x i16>* %b, <8 x i16>* %c, i32 %elts, i32 %iters) #0 { + entry: + %cmp = icmp slt i32 %elts, 1 + br i1 %cmp, label %exit, label %loop.ph + + loop.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %iters) + br label %loop.body + + loop.body: ; preds = %loop.body, %loop.ph + %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] + 
%count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] + %addr.a = phi <8 x i16>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ] + %addr.b = phi <8 x i16>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ] + %addr.c = phi <8 x i16>* [ %c, %loop.ph ], [ %addr.c.next, %loop.body ] + %pred = call <8 x i1> @llvm.arm.mve.vctp16(i32 %count) + %elts.rem = sub i32 %count, 8 + %masked.load.a = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %addr.a, i32 2, <8 x i1> %pred, <8 x i16> undef) + %masked.load.b = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %addr.b, i32 2, <8 x i1> %pred, <8 x i16> undef) + %bitcast.a = bitcast <8 x i16> %masked.load.a to <16 x i8> + %ctlz = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %bitcast.a, i1 false) + %shrn = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %ctlz, <8 x i16> %masked.load.b, i32 1, i32 1, i32 0, i32 1, i32 0, i32 1) + %bitcast = bitcast <16 x i8> %shrn to <8 x i16> + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %bitcast, <8 x i16>* %addr.c, i32 2, <8 x i1> %pred) + %addr.a.next = getelementptr <8 x i16>, <8 x i16>* %addr.b, i32 1 + %addr.b.next = getelementptr <8 x i16>, <8 x i16>* %addr.b, i32 1 + %addr.c.next = getelementptr <8 x i16>, <8 x i16>* %addr.c, i32 1 + %loop.dec = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv, i32 1) + %end = icmp ne i32 %loop.dec, 0 + %lsr.iv.next = add i32 %lsr.iv, -1 + br i1 %end, label %loop.body, label %exit + + exit: ; preds = %loop.body, %entry + ret void + } + + define arm_aapcs_vfpcc void @test_ctlz_i16(<4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, i32 %elts, i32 %iters) #0 { + entry: + %cmp = icmp slt i32 %elts, 1 + br i1 %cmp, label %exit, label %loop.ph + + loop.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %iters) + br label %loop.body + + loop.body: ; preds = %loop.body, %loop.ph + %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] + %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] + %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ] + %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ] + %addr.c = phi <4 x i32>* [ %c, %loop.ph ], [ %addr.c.next, %loop.body ] + %pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %count) + %elts.rem = sub i32 %count, 4 + %masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.a, i32 4, <4 x i1> %pred, <4 x i32> undef) + %masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.b, i32 4, <4 x i1> %pred, <4 x i32> undef) + %bitcast.a = bitcast <4 x i32> %masked.load.a to <8 x i16> + %ctlz = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %bitcast.a, i1 false) + %shrn = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %ctlz, <4 x i32> %masked.load.b, i32 3, i32 1, i32 0, i32 1, i32 0, i32 1) + %bitcast = bitcast <8 x i16> %shrn to <4 x i32> + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %bitcast, <4 x i32>* %addr.c, i32 4, <4 x i1> %pred) + %addr.a.next = getelementptr <4 x i32>, <4 x i32>* %addr.a, i32 1 + %addr.b.next = getelementptr <4 x i32>, <4 x i32>* %addr.b, i32 1 + %addr.c.next = getelementptr <4 x i32>, <4 x i32>* %addr.c, i32 1 + %loop.dec = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv, i32 1) + %end = icmp ne i32 %loop.dec, 0 + %lsr.iv.next = add i32 %lsr.iv, -1 + br i1 %end, label %loop.body, label %exit + + exit: ; preds = %loop.body, %entry + ret void + } + + define arm_aapcs_vfpcc void @test_ctlz_i32(<4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, i32 
%elts, i32 %iters) #0 { + entry: + %cmp = icmp slt i32 %elts, 1 + br i1 %cmp, label %exit, label %loop.ph + + loop.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %iters) + br label %loop.body + + loop.body: ; preds = %loop.body, %loop.ph + %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] + %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] + %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ] + %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ] + %addr.c = phi <4 x i32>* [ %c, %loop.ph ], [ %addr.c.next, %loop.body ] + %pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %count) + %elts.rem = sub i32 %count, 4 + %masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.a, i32 4, <4 x i1> %pred, <4 x i32> undef) + %masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.b, i32 4, <4 x i1> %pred, <4 x i32> undef) + %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %masked.load.b, i1 false) + %bitcast.a = bitcast <4 x i32> %masked.load.a to <8 x i16> + %shrn = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %bitcast.a, <4 x i32> %ctlz, i32 3, i32 1, i32 0, i32 1, i32 0, i32 1) + %bitcast = bitcast <8 x i16> %shrn to <4 x i32> + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %bitcast, <4 x i32>* %addr.c, i32 4, <4 x i1> %pred) + %addr.a.next = getelementptr <4 x i32>, <4 x i32>* %addr.a, i32 1 + %addr.b.next = getelementptr <4 x i32>, <4 x i32>* %addr.b, i32 1 + %addr.c.next = getelementptr <4 x i32>, <4 x i32>* %addr.c, i32 1 + %loop.dec = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv, i32 1) + %end = icmp ne i32 %loop.dec, 0 + %lsr.iv.next = add i32 %lsr.iv, -1 + br i1 %end, label %loop.body, label %exit + + exit: ; preds = %loop.body, %entry + ret void + } + + declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1 immarg) + declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>, i1 immarg) + declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>, i1 immarg) + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) + declare <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16>, <4 x i32>, i32, i32, i32, i32, i32, i32) + declare <8 x i1> @llvm.arm.mve.vctp16(i32) + declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) + declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) + declare <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8>, <8 x i16>, i32, i32, i32, i32, i32, i32) + +... 
+--- +name: test_ctlz_i8 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r4, -8 + tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + t2DoLoopStart renamable $r12 + $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg + + bb.1.loop.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $r0, $r1, $r2, $r3, $r4 + + renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 2) + renamable $q1 = MVE_VLDRHU16 killed renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 2) + $lr = tMOVr $r4, 14 /* CC::al */, $noreg + renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14 /* CC::al */, $noreg + renamable $q1 = MVE_VCLZs8 killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $lr = t2LoopDec killed renamable $lr, 1 + $r0 = tMOVr $r1, 14 /* CC::al */, $noreg + renamable $q1 = MVE_VQSHRUNs16th killed renamable $q1, killed renamable $q0, 1, 0, $noreg + MVE_VPST 8, implicit $vpr + renamable $r2 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 2) + t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2.exit: + tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + +... 
+--- +name: test_ctlz_i16 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r4, -8 + tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + t2DoLoopStart renamable $r4 + $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg + + bb.1.loop.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $r0, $r1, $r2, $r3, $r12 + + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + $lr = tMOVr $r12, 14 /* CC::al */, $noreg + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4) + renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4) + renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + renamable $q1 = MVE_VCLZs16 killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg + MVE_VPST 8, implicit $vpr + renamable $r2 = MVE_VSTRWU32_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) + t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2.exit: + tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + +... 
+--- +name: test_ctlz_i32 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r4, -8 + tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + t2DoLoopStart renamable $r4 + $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg + + bb.1.loop.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $r0, $r1, $r2, $r3, $r12 + + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + $lr = tMOVr $r12, 14 /* CC::al */, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4) + renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4) + renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + renamable $q1 = MVE_VCLZs32 killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q0 = MVE_VQSHRUNs32th killed renamable $q0, killed renamable $q1, 3, 0, $noreg + MVE_VPST 8, implicit $vpr + renamable $r2 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) + t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2.exit: + tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + +... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll index 4512abb9584ff..b4846cd824e7f 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/extending-loads.ll @@ -8,14 +8,13 @@ define dso_local arm_aapcs_vfpcc void @sext_i8(i16* noalias nocapture %a, i8* no ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.16 lr, r2 -; CHECK: .LBB0_1: @ %vector.body -; CHECK: vldrb.s16 q0, [r1], #8 -; CHECK-NEXT: vldrh.u16 q1, [r3], #16 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.s16 q0, [r1], #8 +; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vstrh.16 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -63,14 +62,13 @@ define dso_local arm_aapcs_vfpcc void @zext_i8(i16* noalias nocapture %a, i8* no ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.16 lr, r2 -; CHECK: .LBB1_1: @ %vector.body -; CHECK: vldrb.u16 q0, [r1], #8 -; CHECK-NEXT: vldrh.u16 q1, [r3], #16 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u16 q0, [r1], #8 +; CHECK-NEXT: vldrh.u16 q1, [r0] ; CHECK-NEXT: vadd.i16 q0, q1, q0 -; CHECK-NEXT: vstrh.16 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrh.16 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -118,14 +116,13 @@ define dso_local arm_aapcs_vfpcc void @sext_i16(i32* noalias nocapture %a, i16* ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK: .LBB2_1: @ %vector.body -; CHECK: vldrh.s32 q0, [r1], #8 -; CHECK-NEXT: vldrw.u32 q1, [r3], #16 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.s32 q0, [r1], #8 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -173,14 +170,13 @@ define dso_local arm_aapcs_vfpcc void @zext_i16(i32* noalias nocapture %a, i16* ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: dlstp.32 lr, r2 -; CHECK: .LBB3_1: @ %vector.body -; CHECK: vldrh.u32 q0, [r1], #8 -; CHECK-NEXT: vldrw.u32 q1, [r3], #16 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u32 q0, [r1], #8 +; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: vadd.i32 q0, q1, q0 -; CHECK-NEXT: vstrw.32 q0, [r0] -; CHECK-NEXT: mov r0, r3 +; CHECK-NEXT: vstrw.32 q0, [r0], #16 ; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir new file mode 100644 index 0000000000000..ee07d0c1f871d --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/safe-retaining.mir @@ -0,0 +1,273 @@ +# NOTE: Assertions have 
been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +--- | + define arm_aapcs_vfpcc void @test_vqrshruntq_n_s32(<4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, i32 %elts, i32 %iters) { + entry: + %cmp = icmp slt i32 %elts, 1 + br i1 %cmp, label %exit, label %loop.ph + + loop.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %iters) + br label %loop.body + + loop.body: ; preds = %loop.body, %loop.ph + %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] + %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] + %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ] + %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ] + %addr.c = phi <4 x i32>* [ %c, %loop.ph ], [ %addr.c.next, %loop.body ] + %pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %count) + %elts.rem = sub i32 %count, 4 + %masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.a, i32 4, <4 x i1> %pred, <4 x i32> undef) + %masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.b, i32 4, <4 x i1> %pred, <4 x i32> undef) + %bitcast.a = bitcast <4 x i32> %masked.load.a to <8 x i16> + %shrn = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %bitcast.a, <4 x i32> %masked.load.b, i32 3, i32 1, i32 0, i32 1, i32 0, i32 1) + %bitcast = bitcast <8 x i16> %shrn to <4 x i32> + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %bitcast, <4 x i32>* %addr.c, i32 4, <4 x i1> %pred) + %addr.a.next = getelementptr <4 x i32>, <4 x i32>* %addr.a, i32 1 + %addr.b.next = getelementptr <4 x i32>, <4 x i32>* %addr.b, i32 1 + %addr.c.next = getelementptr <4 x i32>, <4 x i32>* %addr.c, i32 1 + %loop.dec = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv, i32 1) + %end = icmp ne i32 %loop.dec, 0 + %lsr.iv.next = add i32 %lsr.iv, -1 + br i1 %end, label %loop.body, label %exit + + exit: ; preds = %loop.body, %entry + ret void + } + + define arm_aapcs_vfpcc void @test_vqrshruntq_n_s16(<8 x i16>* %a, <8 x i16>* %b, <8 x i16>* %c, i32 %elts, i32 %iters) { + entry: + %cmp = icmp slt i32 %elts, 1 + br i1 %cmp, label %exit, label %loop.ph + + loop.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %iters) + br label %loop.body + + loop.body: ; preds = %loop.body, %loop.ph + %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] + %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] + %addr.a = phi <8 x i16>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ] + %addr.b = phi <8 x i16>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ] + %addr.c = phi <8 x i16>* [ %c, %loop.ph ], [ %addr.c.next, %loop.body ] + %pred = call <8 x i1> @llvm.arm.mve.vctp16(i32 %count) + %elts.rem = sub i32 %count, 8 + %masked.load.a = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %addr.a, i32 2, <8 x i1> %pred, <8 x i16> undef) + %masked.load.b = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %addr.b, i32 2, <8 x i1> %pred, <8 x i16> undef) + %bitcast.a = bitcast <8 x i16> %masked.load.a to <16 x i8> + %shrn = call <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8> %bitcast.a, <8 x i16> %masked.load.b, i32 1, i32 1, i32 0, i32 1, i32 0, i32 1) + %bitcast = bitcast <16 x i8> %shrn to <8 x i16> + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %bitcast, <8 x i16>* %addr.c, i32 2, <8 x i1> %pred) + %addr.a.next = getelementptr <8 x i16>, <8 x i16>* %addr.b, i32 1 + %addr.b.next = 
getelementptr <8 x i16>, <8 x i16>* %addr.b, i32 1 + %addr.c.next = getelementptr <8 x i16>, <8 x i16>* %addr.c, i32 1 + %loop.dec = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv, i32 1) + %end = icmp ne i32 %loop.dec, 0 + %lsr.iv.next = add i32 %lsr.iv, -1 + br i1 %end, label %loop.body, label %exit + + exit: ; preds = %loop.body, %entry + ret void + } + + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) + declare <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16>, <4 x i32>, i32, i32, i32, i32, i32, i32) + declare <8 x i1> @llvm.arm.mve.vctp16(i32) + declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) + declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) + declare <16 x i8> @llvm.arm.mve.vshrn.v16i8.v8i16(<16 x i8>, <8 x i16>, i32, i32, i32, i32, i32, i32) + +... +--- +name: test_vqrshruntq_n_s32 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + restorePoint: '' +fixedStack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: test_vqrshruntq_n_s32 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + ; CHECK: dead $lr = MVE_DLSTP_32 killed renamable $r3 + ; CHECK: $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg + ; CHECK: bb.1.loop.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $r0, $r1, $r2, $r12 + ; CHECK: $lr = tMOVr $r12, 14 /* CC::al */, $noreg + ; CHECK: renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 0, $noreg :: (load 16 
from %ir.addr.b, align 4) + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.addr.a, align 4) + ; CHECK: renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg + ; CHECK: renamable $r2 = MVE_VSTRWU32_post killed renamable $q1, killed renamable $r2, 16, 0, killed $noreg :: (store 16 into %ir.addr.c, align 4) + ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: bb.2.exit: + ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r4, -8 + tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + t2DoLoopStart renamable $r4 + $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg + + bb.1.loop.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $r0, $r1, $r2, $r3, $r12 + + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + $lr = tMOVr $r12, 14 /* CC::al */, $noreg + renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4) + renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4) + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg + MVE_VPST 8, implicit $vpr + renamable $r2 = MVE_VSTRWU32_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) + t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2.exit: + tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + +... 
+--- +name: test_vqrshruntq_n_s16 +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: test_vqrshruntq_n_s16 + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + ; CHECK: dead $lr = MVE_DLSTP_16 killed renamable $r3 + ; CHECK: $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg + ; CHECK: bb.1.loop.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $r0, $r1, $r2, $r4 + ; CHECK: $lr = tMOVr $r4, 14 /* CC::al */, $noreg + ; CHECK: renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 0, $noreg :: (load 16 from %ir.addr.b, align 2) + ; CHECK: renamable $q1 = MVE_VLDRHU16 killed renamable $r0, 0, 0, $noreg :: (load 16 from %ir.addr.a, align 2) + ; CHECK: $r0 = tMOVr $r1, 14 /* CC::al */, $noreg + ; CHECK: renamable $q1 = MVE_VQSHRUNs16th killed renamable $q1, killed renamable $q0, 1, 0, $noreg + ; CHECK: renamable $r2 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r2, 16, 0, killed $noreg :: (store 16 into %ir.addr.c, align 2) + ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: bb.2.exit: + ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r4, -8 + tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + renamable $r12 = t2LDRi12 $sp, 8, 14 /* CC::al */, $noreg :: (load 4 from 
%fixed-stack.0, align 8) + t2DoLoopStart renamable $r12 + $r4 = tMOVr killed $r12, 14 /* CC::al */, $noreg + + bb.1.loop.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $r0, $r1, $r2, $r3, $r4 + + renamable $vpr = MVE_VCTP16 renamable $r3, 0, $noreg + $lr = tMOVr $r4, 14 /* CC::al */, $noreg + renamable $r4, dead $cpsr = tSUBi8 killed $r4, 1, 14 /* CC::al */, $noreg + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14 /* CC::al */, $noreg + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRHU16_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 2) + renamable $q1 = MVE_VLDRHU16 killed renamable $r0, 0, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 2) + renamable $lr = t2LoopDec killed renamable $lr, 1 + $r0 = tMOVr $r1, 14 /* CC::al */, $noreg + renamable $q1 = MVE_VQSHRUNs16th killed renamable $q1, killed renamable $q0, 1, 0, $noreg + MVE_VPST 8, implicit $vpr + renamable $r2 = MVE_VSTRHU16_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 2) + t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2.exit: + tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + +... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir new file mode 100644 index 0000000000000..b175a7ca7e392 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-retaining.mir @@ -0,0 +1,281 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +--- | + define arm_aapcs_vfpcc void @test_vmvn(<4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, i32 %elts, i32 %iters) #0 { + entry: + %cmp = icmp slt i32 %elts, 1 + br i1 %cmp, label %exit, label %loop.ph + + loop.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %iters) + br label %loop.body + + loop.body: ; preds = %loop.body, %loop.ph + %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] + %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] + %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ] + %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ] + %addr.c = phi <4 x i32>* [ %c, %loop.ph ], [ %addr.c.next, %loop.body ] + %pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %count) + %elts.rem = sub i32 %count, 4 + %masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.a, i32 4, <4 x i1> %pred, <4 x i32> undef) + %masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.b, i32 4, <4 x i1> %pred, <4 x i32> undef) + %not = xor <4 x i32> %masked.load.b, <i32 -1, i32 -1, i32 -1, i32 -1> + %bitcast.a = bitcast <4 x i32> %masked.load.a to <8 x i16> + %shrn = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %bitcast.a, <4 x i32> %not, i32 15, i32 1, i32 0, i32 0, i32 0, i32 0) + %bitcast = bitcast <8 x i16> %shrn to <4 x i32> + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %bitcast, <4 x i32>* %addr.c, i32 4, <4 x i1> %pred) + %addr.a.next = getelementptr <4 x i32>, <4 x i32>* %addr.a, i32 1 + %addr.b.next = getelementptr <4 x i32>, <4 x i32>* %addr.b, i32 1 + %addr.c.next = getelementptr <4 x i32>, <4 x i32>* %addr.c, i32 1 + %loop.dec = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv, i32 1) + %end = icmp ne i32 %loop.dec, 0 + %lsr.iv.next
= add i32 %lsr.iv, -1 + br i1 %end, label %loop.body, label %exit + + exit: ; preds = %loop.body, %entry + ret void + } + + define arm_aapcs_vfpcc void @test_vorn(<4 x i32>* %a, <4 x i32>* %b, <4 x i32>* %c, i32 %elts, i32 %iters) #0 { + entry: + %cmp = icmp slt i32 %elts, 1 + br i1 %cmp, label %exit, label %loop.ph + + loop.ph: ; preds = %entry + call void @llvm.set.loop.iterations.i32(i32 %iters) + br label %loop.body + + loop.body: ; preds = %loop.body, %loop.ph + %lsr.iv = phi i32 [ %lsr.iv.next, %loop.body ], [ %iters, %loop.ph ] + %count = phi i32 [ %elts, %loop.ph ], [ %elts.rem, %loop.body ] + %addr.a = phi <4 x i32>* [ %a, %loop.ph ], [ %addr.a.next, %loop.body ] + %addr.b = phi <4 x i32>* [ %b, %loop.ph ], [ %addr.b.next, %loop.body ] + %addr.c = phi <4 x i32>* [ %c, %loop.ph ], [ %addr.c.next, %loop.body ] + %pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %count) + %elts.rem = sub i32 %count, 4 + %masked.load.a = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.a, i32 4, <4 x i1> %pred, <4 x i32> undef) + %masked.load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr.b, i32 4, <4 x i1> %pred, <4 x i32> undef) + %not = xor <4 x i32> %masked.load.b, <i32 -1, i32 -1, i32 -1, i32 -1> + %or = or <4 x i32> %not, %masked.load.a + %bitcast.a = bitcast <4 x i32> %masked.load.a to <8 x i16> + %shrn = call <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16> %bitcast.a, <4 x i32> %or, i32 3, i32 1, i32 0, i32 1, i32 0, i32 1) + %bitcast = bitcast <8 x i16> %shrn to <4 x i32> + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %bitcast, <4 x i32>* %addr.c, i32 4, <4 x i1> %pred) + %addr.a.next = getelementptr <4 x i32>, <4 x i32>* %addr.a, i32 1 + %addr.b.next = getelementptr <4 x i32>, <4 x i32>* %addr.b, i32 1 + %addr.c.next = getelementptr <4 x i32>, <4 x i32>* %addr.c, i32 1 + %loop.dec = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %lsr.iv, i32 1) + %end = icmp ne i32 %loop.dec, 0 + %lsr.iv.next = add i32 %lsr.iv, -1 + br i1 %end, label %loop.body, label %exit + + exit: ; preds = %loop.body, %entry + ret void + } + + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) + declare <8 x i16> @llvm.arm.mve.vshrn.v8i16.v4i32(<8 x i16>, <4 x i32>, i32, i32, i32, i32, i32, i32) + +...
+--- +name: test_vmvn +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: test_vmvn + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + ; CHECK: dead $lr = t2DLS renamable $r4 + ; CHECK: $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg + ; CHECK: bb.1.loop.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r12 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + ; CHECK: $lr = tMOVr $r12, 14 /* CC::al */, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4) + ; CHECK: renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4) + ; CHECK: renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $q1 = MVE_VMVN killed renamable $q1, 0, $noreg, undef renamable $q1 + ; CHECK: renamable $q0 = MVE_VQSHRNbhs32 killed renamable $q0, killed renamable $q1, 15, 0, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r2 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.exit: + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + 
frame-setup CFI_INSTRUCTION offset $r4, -8 + tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + t2DoLoopStart renamable $r4 + $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg + + bb.1.loop.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $r0, $r1, $r2, $r3, $r12 + + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + $lr = tMOVr $r12, 14 /* CC::al */, $noreg + MVE_VPST 4, implicit $vpr + renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4) + renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4) + renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + renamable $q1 = MVE_VMVN killed renamable $q1, 0, $noreg, undef renamable $q1 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q0 = MVE_VQSHRNbhs32 killed renamable $q0, killed renamable $q1, 15, 0, $noreg + MVE_VPST 8, implicit $vpr + renamable $r2 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) + t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2.exit: + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + +... +--- +name: test_vorn +alignment: 2 +tracksRegLiveness: true +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } + - { reg: '$r3', virtual-reg: '' } +frameInfo: + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 +fixedStack: + - { id: 0, type: default, offset: 0, size: 4, alignment: 8, stack-id: default, + isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: test_vorn + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r4 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -8 + ; CHECK: tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + ; CHECK: renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + 
; CHECK: dead $lr = t2DLS renamable $r4 + ; CHECK: $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg + ; CHECK: bb.1.loop.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r12 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4) + ; CHECK: renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4) + ; CHECK: $lr = tMOVr $r12, 14 /* CC::al */, $noreg + ; CHECK: renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + ; CHECK: renamable $q0 = MVE_VORN renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r2 = MVE_VSTRWU32_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 + ; CHECK: bb.2.exit: + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r3, $r4, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r4, -8 + tCMPi8 renamable $r3, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r4, def $pc, implicit killed $itstate + renamable $r4 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg :: (load 4 from %fixed-stack.0, align 8) + t2DoLoopStart renamable $r4 + $r12 = tMOVr killed $r4, 14 /* CC::al */, $noreg + + bb.1.loop.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $r0, $r1, $r2, $r3, $r12 + + renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + MVE_VPST 4, implicit $vpr + renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.addr.b, align 4) + renamable $r0, renamable $q1 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.addr.a, align 4) + $lr = tMOVr $r12, 14 /* CC::al */, $noreg + renamable $r12 = t2SUBri killed $r12, 1, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + renamable $q0 = MVE_VORN renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 + renamable $lr = t2LoopDec killed renamable $lr, 1 + renamable $q1 = MVE_VQSHRUNs32th killed renamable $q1, killed renamable $q0, 3, 0, $noreg + MVE_VPST 8, implicit $vpr + renamable $r2 = MVE_VSTRWU32_post killed renamable $q1, killed renamable $r2, 16, 1, killed renamable $vpr :: (store 16 into %ir.addr.c, align 4) + t2LoopEnd killed renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2.exit: + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $pc + +... 
diff --git a/llvm/test/CodeGen/Thumb2/cde-gpr.ll b/llvm/test/CodeGen/Thumb2/cde-gpr.ll new file mode 100644 index 0000000000000..19052125c14be --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/cde-gpr.ll @@ -0,0 +1,189 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -mtriple=thumbebv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -verify-machineinstrs -o - %s | FileCheck %s + +declare i32 @llvm.arm.cde.cx1(i32 immarg, i32 immarg) +declare i32 @llvm.arm.cde.cx1a(i32 immarg, i32, i32 immarg) +declare { i32, i32 } @llvm.arm.cde.cx1d(i32 immarg, i32 immarg) +declare { i32, i32 } @llvm.arm.cde.cx1da(i32 immarg, i32, i32, i32 immarg) + +declare i32 @llvm.arm.cde.cx2(i32 immarg, i32, i32 immarg) +declare i32 @llvm.arm.cde.cx2a(i32 immarg, i32, i32, i32 immarg) +declare { i32, i32 } @llvm.arm.cde.cx2d(i32 immarg, i32, i32 immarg) +declare { i32, i32 } @llvm.arm.cde.cx2da(i32 immarg, i32, i32, i32, i32 immarg) + +declare i32 @llvm.arm.cde.cx3(i32 immarg, i32, i32, i32 immarg) +declare i32 @llvm.arm.cde.cx3a(i32 immarg, i32, i32, i32, i32 immarg) +declare { i32, i32 } @llvm.arm.cde.cx3d(i32 immarg, i32, i32, i32 immarg) +declare { i32, i32 } @llvm.arm.cde.cx3da(i32 immarg, i32, i32, i32, i32, i32 immarg) + +define arm_aapcs_vfpcc i32 @test_cx1() { +; CHECK-LABEL: test_cx1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx1 p0, r0, #123 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.cde.cx1(i32 0, i32 123) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_cx1a(i32 %acc) { +; CHECK-LABEL: test_cx1a: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx1a p0, r0, #345 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.cde.cx1a(i32 0, i32 %acc, i32 345) + ret i32 %0 +} + +define arm_aapcs_vfpcc i64 @test_cx1d() { +; CHECK-LABEL: test_cx1d: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx1d p1, r0, r1, #567 +; CHECK-NEXT: bx lr +entry: + %0 = call { i32, i32 } @llvm.arm.cde.cx1d(i32 1, i32 567) + %1 = extractvalue { i32, i32 } %0, 1 + %2 = zext i32 %1 to i64 + %3 = shl i64 %2, 32 + %4 = extractvalue { i32, i32 } %0, 0 + %5 = zext i32 %4 to i64 + %6 = or i64 %3, %5 + ret i64 %6 +} + +define arm_aapcs_vfpcc i64 @test_cx1da(i64 %acc) { +; CHECK-LABEL: test_cx1da: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1 +; CHECK-NEXT: @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1 +; CHECK-NEXT: cx1da p0, r0, r1, #789 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %acc, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %acc to i32 + %3 = call { i32, i32 } @llvm.arm.cde.cx1da(i32 0, i32 %2, i32 %1, i32 789) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i32 @test_cx2(i32 %n) { +; CHECK-LABEL: test_cx2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx2 p0, r0, r0, #11 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.cde.cx2(i32 0, i32 %n, i32 11) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_cx2a(i32 %acc, i32 %n) { +; CHECK-LABEL: test_cx2a: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx2a p1, r0, r1, #22 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.cde.cx2a(i32 1, i32 %acc, i32 %n, i32 22) + ret i32 %0 +} + +define arm_aapcs_vfpcc i64 @test_cx2d(i32 %n) #0 { +; CHECK-LABEL: test_cx2d: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx2d p1, r0, r1, r0, 
#33 +; CHECK-NEXT: bx lr +entry: + %0 = call { i32, i32 } @llvm.arm.cde.cx2d(i32 1, i32 %n, i32 33) + %1 = extractvalue { i32, i32 } %0, 1 + %2 = zext i32 %1 to i64 + %3 = shl i64 %2, 32 + %4 = extractvalue { i32, i32 } %0, 0 + %5 = zext i32 %4 to i64 + %6 = or i64 %3, %5 + ret i64 %6 +} + +define arm_aapcs_vfpcc i64 @test_cx2da(i64 %acc, i32 %n) { +; CHECK-LABEL: test_cx2da: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1 +; CHECK-NEXT: @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1 +; CHECK-NEXT: cx2da p0, r0, r1, r2, #44 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %acc, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %acc to i32 + %3 = call { i32, i32 } @llvm.arm.cde.cx2da(i32 0, i32 %2, i32 %1, i32 %n, i32 44) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} + +define arm_aapcs_vfpcc i32 @test_cx3(i32 %n, i32 %m) { +; CHECK-LABEL: test_cx3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx3 p0, r0, r0, r1, #1 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.cde.cx3(i32 0, i32 %n, i32 %m, i32 1) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_cx3a(i32 %acc, i32 %n, i32 %m) { +; CHECK-LABEL: test_cx3a: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx3a p1, r0, r1, r2, #2 +; CHECK-NEXT: bx lr +entry: + %0 = call i32 @llvm.arm.cde.cx3a(i32 1, i32 %acc, i32 %n, i32 %m, i32 2) + ret i32 %0 +} + +define arm_aapcs_vfpcc i64 @test_cx3d(i32 %n, i32 %m) { +; CHECK-LABEL: test_cx3d: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cx3d p1, r0, r1, r0, r1, #3 +; CHECK-NEXT: bx lr +entry: + %0 = call { i32, i32 } @llvm.arm.cde.cx3d(i32 1, i32 %n, i32 %m, i32 3) + %1 = extractvalue { i32, i32 } %0, 1 + %2 = zext i32 %1 to i64 + %3 = shl i64 %2, 32 + %4 = extractvalue { i32, i32 } %0, 0 + %5 = zext i32 %4 to i64 + %6 = or i64 %3, %5 + ret i64 %6 +} + +define arm_aapcs_vfpcc i64 @test_cx3da(i64 %acc, i32 %n, i32 %m) { +; CHECK-LABEL: test_cx3da: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: @ kill: def $r1 killed $r1 killed $r0_r1 def $r0_r1 +; CHECK-NEXT: @ kill: def $r0 killed $r0 killed $r0_r1 def $r0_r1 +; CHECK-NEXT: cx3da p0, r0, r1, r2, r3, #4 +; CHECK-NEXT: bx lr +entry: + %0 = lshr i64 %acc, 32 + %1 = trunc i64 %0 to i32 + %2 = trunc i64 %acc to i32 + %3 = call { i32, i32 } @llvm.arm.cde.cx3da(i32 0, i32 %2, i32 %1, i32 %n, i32 %m, i32 4) + %4 = extractvalue { i32, i32 } %3, 1 + %5 = zext i32 %4 to i64 + %6 = shl i64 %5, 32 + %7 = extractvalue { i32, i32 } %3, 0 + %8 = zext i32 %7 to i64 + %9 = or i64 %6, %8 + ret i64 %9 +} diff --git a/llvm/test/CodeGen/Thumb2/cde-vec.ll b/llvm/test/CodeGen/Thumb2/cde-vec.ll new file mode 100644 index 0000000000000..5dfd11180e7f6 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/cde-vec.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s + +declare <16 x i8> @llvm.arm.cde.vcx1q(i32 immarg, i32 immarg) +declare <16 x i8> @llvm.arm.cde.vcx1qa(i32 immarg, <16 x i8>, i32 immarg) +declare <16 x i8> @llvm.arm.cde.vcx2q(i32 immarg, <16 x i8>, i32 immarg) +declare <16 x i8> @llvm.arm.cde.vcx2qa(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg) +declare <16 x i8> @llvm.arm.cde.vcx3q(i32 immarg, <16 x i8>, <16 x i8>, i32 immarg) +declare <16 x i8> @llvm.arm.cde.vcx3qa(i32 immarg, <16 x i8>, <16 x i8>, <16 x i8>, i32 immarg) + 
+define arm_aapcs_vfpcc <16 x i8> @test_vcx1q_u8() { +; CHECK-LABEL: test_vcx1q_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx1 p0, q0, #1111 +; CHECK-NEXT: bx lr +entry: + %0 = call <16 x i8> @llvm.arm.cde.vcx1q(i32 0, i32 1111) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vcx1qa_1(<16 x i8> %acc) { +; CHECK-LABEL: test_vcx1qa_1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx1a p1, q0, #1112 +; CHECK-NEXT: bx lr +entry: + %0 = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 1, <16 x i8> %acc, i32 1112) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vcx1qa_2(<4 x i32> %acc) { +; CHECK-LABEL: test_vcx1qa_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx1a p0, q0, #1113 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x i32> %acc to <16 x i8> + %1 = call <16 x i8> @llvm.arm.cde.vcx1qa(i32 0, <16 x i8> %0, i32 1113) + %2 = bitcast <16 x i8> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vcx2q_u8(<8 x half> %n) { +; CHECK-LABEL: test_vcx2q_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx2 p1, q0, q0, #111 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <8 x half> %n to <16 x i8> + %1 = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> %0, i32 111) + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <4 x float> @test_vcx2q(<4 x float> %n) { +; CHECK-LABEL: test_vcx2q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx2 p1, q0, q0, #112 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %n to <16 x i8> + %1 = call <16 x i8> @llvm.arm.cde.vcx2q(i32 1, <16 x i8> %0, i32 112) + %2 = bitcast <16 x i8> %1 to <4 x float> + ret <4 x float> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vcx2qa(<4 x float> %acc, <2 x i64> %n) { +; CHECK-LABEL: test_vcx2qa: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx2a p0, q0, q1, #113 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %acc to <16 x i8> + %1 = bitcast <2 x i64> %n to <16 x i8> + %2 = call <16 x i8> @llvm.arm.cde.vcx2qa(i32 0, <16 x i8> %0, <16 x i8> %1, i32 113) + %3 = bitcast <16 x i8> %2 to <4 x float> + ret <4 x float> %3 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vcx3q_u8(<8 x i16> %n, <4 x i32> %m) { +; CHECK-LABEL: test_vcx3q_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx3 p0, q0, q0, q1, #11 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <8 x i16> %n to <16 x i8> + %1 = bitcast <4 x i32> %m to <16 x i8> + %2 = call <16 x i8> @llvm.arm.cde.vcx3q(i32 0, <16 x i8> %0, <16 x i8> %1, i32 11) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vcx3q(<2 x i64> %n, <4 x float> %m) { +; CHECK-LABEL: test_vcx3q: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx3 p1, q0, q0, q1, #12 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <2 x i64> %n to <16 x i8> + %1 = bitcast <4 x float> %m to <16 x i8> + %2 = call <16 x i8> @llvm.arm.cde.vcx3q(i32 1, <16 x i8> %0, <16 x i8> %1, i32 12) + %3 = bitcast <16 x i8> %2 to <2 x i64> + ret <2 x i64> %3 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vcx3qa(<16 x i8> %acc, <8 x i16> %n, <4 x float> %m) { +; CHECK-LABEL: test_vcx3qa: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx3a p1, q0, q1, q2, #13 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <8 x i16> %n to <16 x i8> + %1 = bitcast <4 x float> %m to <16 x i8> + %2 = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> %acc, <16 x i8> %0, <16 x i8> %1, i32 13) + ret <16 x i8> %2 +} diff --git a/llvm/test/CodeGen/Thumb2/cde-vfp.ll b/llvm/test/CodeGen/Thumb2/cde-vfp.ll new file mode 100644 index 0000000000000..54ee1d5166612 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/cde-vfp.ll @@ -0,0 +1,198 @@ +; 
RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -mtriple=thumbv8m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+fp-armv8d16sp -verify-machineinstrs -o - %s | FileCheck %s + +declare float @llvm.arm.cde.vcx1.f32(i32 immarg, i32 immarg) +declare float @llvm.arm.cde.vcx1a.f32(i32 immarg, float, i32 immarg) +declare float @llvm.arm.cde.vcx2.f32(i32 immarg, float, i32 immarg) +declare float @llvm.arm.cde.vcx2a.f32(i32 immarg, float, float, i32 immarg) +declare float @llvm.arm.cde.vcx3.f32(i32 immarg, float, float, i32 immarg) +declare float @llvm.arm.cde.vcx3a.f32(i32 immarg, float, float, float, i32 immarg) + +declare double @llvm.arm.cde.vcx1.f64(i32 immarg, i32 immarg) +declare double @llvm.arm.cde.vcx1a.f64(i32 immarg, double, i32 immarg) +declare double @llvm.arm.cde.vcx2.f64(i32 immarg, double, i32 immarg) +declare double @llvm.arm.cde.vcx2a.f64(i32 immarg, double, double, i32 immarg) +declare double @llvm.arm.cde.vcx3.f64(i32 immarg, double, double, i32 immarg) +declare double @llvm.arm.cde.vcx3a.f64(i32 immarg, double, double, double, i32 immarg) + +define arm_aapcs_vfpcc i32 @test_vcx1_u32() { +; CHECK-LABEL: test_vcx1_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx1 p0, s0, #11 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +entry: + %0 = call float @llvm.arm.cde.vcx1.f32(i32 0, i32 11) + %1 = bitcast float %0 to i32 + ret i32 %1 +} + +define arm_aapcs_vfpcc i32 @test_vcx1a_u32(i32 %acc) { +; CHECK-LABEL: test_vcx1a_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vcx1a p1, s0, #12 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32 %acc to float + %1 = call float @llvm.arm.cde.vcx1a.f32(i32 1, float %0, i32 12) + %2 = bitcast float %1 to i32 + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vcx2_u32(i32 %n) { +; CHECK-LABEL: test_vcx2_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vcx2 p0, s0, s0, #21 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32 %n to float + %1 = call float @llvm.arm.cde.vcx2.f32(i32 0, float %0, i32 21) + %2 = bitcast float %1 to i32 + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vcx2a_u32(i32 %acc, i32 %n) { +; CHECK-LABEL: test_vcx2a_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vcx2a p0, s2, s0, #22 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32 %acc to float + %1 = bitcast i32 %n to float + %2 = call float @llvm.arm.cde.vcx2a.f32(i32 0, float %0, float %1, i32 22) + %3 = bitcast float %2 to i32 + ret i32 %3 +} + +define arm_aapcs_vfpcc i32 @test_vcx3_u32(i32 %n, i32 %m) { +; CHECK-LABEL: test_vcx3_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vmov s2, r0 +; CHECK-NEXT: vcx3 p1, s0, s2, s0, #3 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32 %n to float + %1 = bitcast i32 %m to float + %2 = call float @llvm.arm.cde.vcx3.f32(i32 1, float %0, float %1, i32 3) + %3 = bitcast float %2 to i32 + ret i32 %3 +} + +define arm_aapcs_vfpcc i32 @test_vcx3a_u32(i32 %acc, i32 %n, i32 %m) { +; CHECK-LABEL: test_vcx3a_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov s0, r2 +; CHECK-NEXT: vmov s2, r1 +; CHECK-NEXT: vmov s4, r0 +; CHECK-NEXT: vcx3a p0, s4, s2, s0, #5 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32 %acc to float + %1 = bitcast i32 %n to float + %2 = bitcast i32 %m to float + %3 = call float 
@llvm.arm.cde.vcx3a.f32(i32 0, float %0, float %1, float %2, i32 5) + %4 = bitcast float %3 to i32 + ret i32 %4 +} + +define arm_aapcs_vfpcc i64 @test_vcx1d_u64() { +; CHECK-LABEL: test_vcx1d_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcx1 p0, d0, #11 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: bx lr +entry: + %0 = call double @llvm.arm.cde.vcx1.f64(i32 0, i32 11) + %1 = bitcast double %0 to i64 + ret i64 %1 +} + +define arm_aapcs_vfpcc i64 @test_vcx1da_u64(i64 %acc) { +; CHECK-LABEL: test_vcx1da_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcx1a p1, d0, #12 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i64 %acc to double + %1 = call double @llvm.arm.cde.vcx1a.f64(i32 1, double %0, i32 12) + %2 = bitcast double %1 to i64 + ret i64 %2 +} + +define arm_aapcs_vfpcc i64 @test_vcx2d_u64(i64 %n) { +; CHECK-LABEL: test_vcx2d_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r0, r1 +; CHECK-NEXT: vcx2 p0, d0, d0, #21 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i64 %n to double + %1 = call double @llvm.arm.cde.vcx2.f64(i32 0, double %0, i32 21) + %2 = bitcast double %1 to i64 + ret i64 %2 +} + +define arm_aapcs_vfpcc i64 @test_vcx2da_u64(i64 %acc, i64 %n) { +; CHECK-LABEL: test_vcx2da_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov d1, r0, r1 +; CHECK-NEXT: vcx2a p0, d1, d0, #22 +; CHECK-NEXT: vmov r0, r1, d1 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i64 %acc to double + %1 = bitcast i64 %n to double + %2 = call double @llvm.arm.cde.vcx2a.f64(i32 0, double %0, double %1, i32 22) + %3 = bitcast double %2 to i64 + ret i64 %3 +} + +define arm_aapcs_vfpcc i64 @test_vcx3d_u64(i64 %n, i64 %m) { +; CHECK-LABEL: test_vcx3d_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vmov d1, r0, r1 +; CHECK-NEXT: vcx3 p1, d0, d1, d0, #3 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i64 %n to double + %1 = bitcast i64 %m to double + %2 = call double @llvm.arm.cde.vcx3.f64(i32 1, double %0, double %1, i32 3) + %3 = bitcast double %2 to i64 + ret i64 %3 +} + +define arm_aapcs_vfpcc i64 @test_vcx3da_u64(i64 %acc, i64 %n, i64 %m) { +; CHECK-LABEL: test_vcx3da_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: ldrd lr, r12, [sp, #8] +; CHECK-DAG: vmov [[D0:d.*]], r0, r1 +; CHECK-DAG: vmov [[D1:d.*]], r2, r3 +; CHECK-DAG: vmov [[D2:d.*]], lr, r12 +; CHECK-NEXT: vcx3a p0, [[D0]], [[D1]], [[D2]], #5 +; CHECK-NEXT: vmov r0, r1, [[D0]] +; CHECK-NEXT: pop {r7, pc} +entry: + %0 = bitcast i64 %acc to double + %1 = bitcast i64 %n to double + %2 = bitcast i64 %m to double + %3 = call double @llvm.arm.cde.vcx3a.f64(i32 0, double %0, double %1, double %2, i32 5) + %4 = bitcast double %3 to i64 + ret i64 %4 +} diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll new file mode 100644 index 0000000000000..8161b1326b6a9 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -0,0 +1,1470 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s + +define arm_aapcs_vfpcc void @test_fadd(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fadd: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: 
vldr.16 s0, [r1] +; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vadd.f16 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fadd fast <8 x half> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fadd_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fadd_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vadd.f16 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB1_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fadd fast <8 x half> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmul(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fmul: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; 
CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vmul.f16 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB2_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmul_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fmul_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vmul.f16 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB3_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fsub(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fsub: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp 
r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vsub.f16 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB4_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fsub fast <8 x half> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fsub_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fsub_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r1] +; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: vdup.16 q0, r1 +; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vsub.f16 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB5_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %B = load half, half* %BB + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0 + %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fsub fast <8 x half> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds half, half* %C, i32 %index + %5 = bitcast half* %4 to <8 x half>* + store <8 x half> %3, <8 x half>* %5, align 4 + %index.next = add i32 %index, 8 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define arm_aapcs_vfpcc void @test_fmas(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; 
CHECK-LABEL: test_fmas: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: bne .LBB6_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = getelementptr inbounds half, half* %B, i32 %index + %4 = bitcast half* %3 to <8 x half>* + %wide.load12 = load <8 x half>, <8 x half>* %4, align 4 + %5 = fmul fast <8 x half> %wide.load12, %wide.load + %6 = fadd fast <8 x half> %5, %broadcast.splat14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmas_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmas_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB7_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: bne .LBB7_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = getelementptr inbounds half, 
half* %B, i32 %index + %4 = bitcast half* %3 to <8 x half>* + %wide.load12 = load <8 x half>, <8 x half>* %4, align 4 + %5 = fmul fast <8 x half> %wide.load12, %wide.load + %6 = fadd fast <8 x half> %broadcast.splat14, %5 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fma(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fma: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q2, q1, q0 +; CHECK-NEXT: vstrb.8 q2, [r3], #16 +; CHECK-NEXT: bne .LBB8_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %wide.load, %broadcast.splat13 + %4 = getelementptr inbounds half, half* %B, i32 %index + %5 = bitcast half* %4 to <8 x half>* + %wide.load14 = load <8 x half>, <8 x half>* %5, align 4 + %6 = fadd fast <8 x half> %3, %wide.load14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fma_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fma_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB9_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q2, q0, q1 +; CHECK-NEXT: vstrb.8 q2, [r3], #16 +; CHECK-NEXT: bne .LBB9_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void 
@llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %broadcast.splat13, %wide.load + %4 = getelementptr inbounds half, half* %B, i32 %index + %5 = bitcast half* %4 to <8 x half>* + %wide.load14 = load <8 x half>, <8 x half>* %5, align 4 + %6 = fadd fast <8 x half> %3, %wide.load14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define arm_aapcs_vfpcc void @test_fmss(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmss: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: vneg.f16 q0, q0 +; CHECK-NEXT: .LBB10_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfma.f16 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: bne .LBB10_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = getelementptr inbounds half, half* %B, i32 %index + %4 = bitcast half* %3 to <8 x half>* + %wide.load12 = load <8 x half>, <8 x half>* %4, align 4 + %5 = fmul fast <8 x half> %wide.load12, %wide.load + %6 = fsub fast <8 x half> %5, %broadcast.splat14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmss_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, 
i32 %n) { +; CHECK-LABEL: test_fmss_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB11_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vfms.f16 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: bne .LBB11_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = getelementptr inbounds half, half* %B, i32 %index + %4 = bitcast half* %3 to <8 x half>* + %wide.load12 = load <8 x half>, <8 x half>* %4, align 4 + %5 = fmul fast <8 x half> %wide.load12, %wide.load + %6 = fsub fast <8 x half> %broadcast.splat14, %5 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fms(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fms: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB12_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vneg.f16 q1, q1 +; CHECK-NEXT: vfma.f16 q1, q2, q0 +; CHECK-NEXT: vstrb.8 q1, [r3], #16 +; CHECK-NEXT: bne .LBB12_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x 
half> %wide.load, %broadcast.splat13 + %4 = getelementptr inbounds half, half* %B, i32 %index + %5 = bitcast half* %4 to <8 x half>* + %wide.load14 = load <8 x half>, <8 x half>* %5, align 4 + %6 = fsub fast <8 x half> %3, %wide.load14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fms_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fms_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldr.w r12, [sp] +; CHECK-NEXT: cmp.w r12, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vldr.16 s0, [r2] +; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: vdup.16 q0, r2 +; CHECK-NEXT: .LBB13_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: subs.w r12, r12, #8 +; CHECK-NEXT: vneg.f16 q1, q1 +; CHECK-NEXT: vfma.f16 q1, q0, q2 +; CHECK-NEXT: vstrb.8 q1, [r3], #16 +; CHECK-NEXT: bne .LBB13_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %C = load half, half* %CC + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0 + %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds half, half* %A, i32 %index + %2 = bitcast half* %1 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %broadcast.splat13, %wide.load + %4 = getelementptr inbounds half, half* %B, i32 %index + %5 = bitcast half* %4 to <8 x half>* + %wide.load14 = load <8 x half>, <8 x half>* %5, align 4 + %6 = fsub fast <8 x half> %3, %wide.load14 + %7 = getelementptr inbounds half, half* %D, i32 %index + %8 = bitcast half* %7 to <8 x half>* + store <8 x half> %6, <8 x half>* %8, align 4 + %index.next = add i32 %index, 8 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define dso_local void @test_nested(half* noalias nocapture %pInT1, half* noalias nocapture readonly %pOutT1, half* noalias nocapture readonly %pPRT_in, half* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, half *%ina) local_unnamed_addr #0 { +; CHECK-LABEL: test_nested: +; CHECK: @ %bb.0: @ %for.body.us.preheader +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: ldrd lr, r12, [sp, #20] +; CHECK-NEXT: lsl.w r3, r12, #1 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB14_1: @ %for.body.us +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB14_2 Depth 2 +; CHECK-NEXT: ldrh r4, [r1] +; CHECK-NEXT: mov r5, r12 +; CHECK-NEXT: vdup.16 q0, r4 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: .LBB14_2: @ %vector.body +; CHECK-NEXT: @ Parent Loop 
BB14_1 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: adds r6, r0, r4 +; CHECK-NEXT: adds r7, r2, r4 +; CHECK-NEXT: vldrw.u32 q1, [r7] +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: adds r4, #16 +; CHECK-NEXT: subs r5, #8 +; CHECK-NEXT: vfms.f16 q2, q1, q0 +; CHECK-NEXT: vstrw.32 q2, [r6] +; CHECK-NEXT: bne .LBB14_2 +; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us +; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1 +; CHECK-NEXT: add r0, r3 +; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: adds r1, #2 +; CHECK-NEXT: le lr, .LBB14_1 +; CHECK-NEXT: @ %bb.4: @ %for.end14 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +for.body.us.preheader: + %in = load half, half* %ina + %cmp = icmp sgt i32 %numRows, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp1 = icmp sgt i32 %numCols, 0 + tail call void @llvm.assume(i1 %cmp1) + %rem = and i32 %numCols, 7 + %cmp2 = icmp eq i32 %rem, 0 + tail call void @llvm.assume(i1 %cmp2) + %cmp3 = icmp slt i32 %l, %numCols + tail call void @llvm.assume(i1 %cmp3) + br label %for.body.us + +for.body.us: ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader + %pInT1.addr.038.us = phi half* [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ] + %i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ] + %pOutT1.addr.036.us = phi half* [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ] + %pPRT_in.addr.035.us = phi half* [ %scevgep, %for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ] + %scevgep = getelementptr half, half* %pPRT_in.addr.035.us, i32 %numCols + %0 = load half, half* %pOutT1.addr.036.us, align 4 + %broadcast.splatinsert47 = insertelement <8 x half> undef, half %0, i32 0 + %broadcast.splat48 = shufflevector <8 x half> %broadcast.splatinsert47, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %for.body.us + %index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ] + %next.gep = getelementptr half, half* %pInT1.addr.038.us, i32 %index + %next.gep45 = getelementptr half, half* %pPRT_in.addr.035.us, i32 %index + %1 = bitcast half* %next.gep to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %1, align 4 + %2 = bitcast half* %next.gep45 to <8 x half>* + %wide.load46 = load <8 x half>, <8 x half>* %2, align 4 + %3 = fmul fast <8 x half> %wide.load46, %broadcast.splat48 + %4 = fsub fast <8 x half> %wide.load, %3 + store <8 x half> %4, <8 x half>* %1, align 4 + %index.next = add i32 %index, 8 + %5 = icmp eq i32 %index.next, %numCols + br i1 %5, label %for.cond6.for.end_crit_edge.us, label %vector.body + +for.cond6.for.end_crit_edge.us: ; preds = %vector.body + %incdec.ptr.us = getelementptr inbounds half, half* %pOutT1.addr.036.us, i32 1 + %scevgep40 = getelementptr half, half* %pInT1.addr.038.us, i32 %numCols + %inc13.us = add nuw nsw i32 %i.037.us, 1 + %exitcond41 = icmp eq i32 %inc13.us, %numRows + br i1 %exitcond41, label %for.end14, label %for.body.us + +for.end14: ; preds = %for.cond6.for.end_crit_edge.us + ret void +} + +%struct.arm_fir_instance_f32 = type { i16, half*, half* } +define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonly %S, half* nocapture readonly %pSrc, half* %pDst, i32 %blockSize) { +; CHECK-LABEL: arm_fir_f32_1_4_mve: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .vsave {d8, d9, 
d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: ldrh.w r10, [r0] +; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: sub.w r7, r10, #1 +; CHECK-NEXT: cmp r7, #3 +; CHECK-NEXT: bhi .LBB15_6 +; CHECK-NEXT: @ %bb.1: @ %if.then +; CHECK-NEXT: ldr r6, [r0, #8] +; CHECK-NEXT: lsr.w lr, r3, #2 +; CHECK-NEXT: ldrh r4, [r6, #6] +; CHECK-NEXT: vdup.16 q0, r4 +; CHECK-NEXT: ldrh r4, [r6, #4] +; CHECK-NEXT: vdup.16 q1, r4 +; CHECK-NEXT: ldrh r4, [r6, #2] +; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: vdup.16 q2, r4 +; CHECK-NEXT: add.w r4, r12, r7, lsl #1 +; CHECK-NEXT: vdup.16 q3, r6 +; CHECK-NEXT: wls lr, lr, .LBB15_5 +; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph +; CHECK-NEXT: bic r9, r3, #3 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: add.w r8, r2, r9, lsl #1 +; CHECK-NEXT: .LBB15_3: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r7, r1, r6 +; CHECK-NEXT: vldrw.u32 q4, [r7] +; CHECK-NEXT: adds r7, r4, r6 +; CHECK-NEXT: vstrw.32 q4, [r7] +; CHECK-NEXT: add.w r7, r12, r6 +; CHECK-NEXT: vldrw.u32 q4, [r7] +; CHECK-NEXT: adds r5, r7, #2 +; CHECK-NEXT: vldrw.u32 q5, [r5] +; CHECK-NEXT: adds r5, r7, #6 +; CHECK-NEXT: vmul.f16 q4, q4, q3 +; CHECK-NEXT: vfma.f16 q4, q5, q2 +; CHECK-NEXT: vldrw.u32 q5, [r7, #4] +; CHECK-NEXT: vfma.f16 q4, q5, q1 +; CHECK-NEXT: vldrw.u32 q5, [r5] +; CHECK-NEXT: adds r5, r2, r6 +; CHECK-NEXT: adds r6, #8 +; CHECK-NEXT: vfma.f16 q4, q5, q0 +; CHECK-NEXT: vstrw.32 q4, [r5] +; CHECK-NEXT: le lr, .LBB15_3 +; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit +; CHECK-NEXT: add r4, r6 +; CHECK-NEXT: add.w r12, r12, r9, lsl #1 +; CHECK-NEXT: add.w r1, r1, r9, lsl #1 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: .LBB15_5: @ %while.end +; CHECK-NEXT: and r7, r3, #3 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vctp.16 r7 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q4, [r4] +; CHECK-NEXT: vldrw.u32 q4, [r12] +; CHECK-NEXT: add.w r1, r12, #2 +; CHECK-NEXT: vmul.f16 q3, q4, q3 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: add.w r1, r12, #6 +; CHECK-NEXT: vfma.f16 q3, q4, q2 +; CHECK-NEXT: vldrw.u32 q2, [r12, #4] +; CHECK-NEXT: vfma.f16 q3, q2, q1 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vfma.f16 q3, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q3, [r2] +; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: .LBB15_6: @ %if.end +; CHECK-NEXT: add.w r0, r12, r3, lsl #1 +; CHECK-NEXT: lsr.w lr, r10, #2 +; CHECK-NEXT: wls lr, lr, .LBB15_10 +; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader +; CHECK-NEXT: bic r2, r10, #3 +; CHECK-NEXT: adds r1, r2, r3 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: add.w r1, r12, r1, lsl #1 +; CHECK-NEXT: .LBB15_8: @ %while.body51 +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0], #8 +; CHECK-NEXT: vstrb.8 q0, [r3], #8 +; CHECK-NEXT: le lr, .LBB15_8 +; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit +; CHECK-NEXT: add.w r12, r12, r2, lsl #1 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: .LBB15_10: @ %while.end55 +; CHECK-NEXT: ands r1, r10, #3 +; CHECK-NEXT: beq .LBB15_12 +; CHECK-NEXT: @ %bb.11: @ %if.then59 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vctp.16 r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q0, [r12] +; CHECK-NEXT: .LBB15_12: @ %if.end61 +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +entry: + %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 + %0 = load half*, half** %pState1, align 4 + %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, 
%struct.arm_fir_instance_f32* %S, i32 0, i32 2 + %1 = load half*, half** %pCoeffs2, align 4 + %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0 + %2 = load i16, i16* %numTaps3, align 4 + %conv = zext i16 %2 to i32 + %sub = add nsw i32 %conv, -1 + %cmp = icmp ult i32 %sub, 4 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds half, half* %0, i32 %sub + %incdec.ptr = getelementptr inbounds half, half* %1, i32 1 + %3 = load half, half* %1, align 4 + %incdec.ptr6 = getelementptr inbounds half, half* %1, i32 2 + %4 = load half, half* %incdec.ptr, align 4 + %incdec.ptr7 = getelementptr inbounds half, half* %1, i32 3 + %5 = load half, half* %incdec.ptr6, align 4 + %6 = load half, half* %incdec.ptr7, align 4 + %shr = lshr i32 %blockSize, 2 + %cmp9146 = icmp eq i32 %shr, 0 + %.pre161 = insertelement <8 x half> undef, half %3, i32 0 + %.pre162 = shufflevector <8 x half> %.pre161, <8 x half> undef, <8 x i32> zeroinitializer + %.pre163 = insertelement <8 x half> undef, half %4, i32 0 + %.pre164 = shufflevector <8 x half> %.pre163, <8 x half> undef, <8 x i32> zeroinitializer + %.pre165 = insertelement <8 x half> undef, half %5, i32 0 + %.pre166 = shufflevector <8 x half> %.pre165, <8 x half> undef, <8 x i32> zeroinitializer + %.pre167 = insertelement <8 x half> undef, half %6, i32 0 + %.pre168 = shufflevector <8 x half> %.pre167, <8 x half> undef, <8 x i32> zeroinitializer + br i1 %cmp9146, label %while.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %if.then + %7 = and i32 %blockSize, -4 + %scevgep158 = getelementptr half, half* %pDst, i32 %7 + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %pStateCur.0151 = phi half* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %pSamples.0150 = phi half* [ %0, %while.body.lr.ph ], [ %add.ptr24, %while.body ] + %pOutput.0149 = phi half* [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ] + %pTempSrc.0148 = phi half* [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ] + %blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ] + %8 = bitcast half* %pTempSrc.0148 to <8 x half>* + %9 = load <8 x half>, <8 x half>* %8, align 4 + %10 = bitcast half* %pStateCur.0151 to <8 x half>* + store <8 x half> %9, <8 x half>* %10, align 4 + %add.ptr = getelementptr inbounds half, half* %pStateCur.0151, i32 4 + %add.ptr11 = getelementptr inbounds half, half* %pTempSrc.0148, i32 4 + %11 = bitcast half* %pSamples.0150 to <8 x half>* + %12 = load <8 x half>, <8 x half>* %11, align 4 + %13 = fmul fast <8 x half> %12, %.pre162 + %arrayidx12 = getelementptr inbounds half, half* %pSamples.0150, i32 1 + %14 = bitcast half* %arrayidx12 to <8 x half>* + %15 = load <8 x half>, <8 x half>* %14, align 4 + %mul = fmul fast <8 x half> %15, %.pre164 + %add = fadd fast <8 x half> %mul, %13 + %arrayidx13 = getelementptr inbounds half, half* %pSamples.0150, i32 2 + %16 = bitcast half* %arrayidx13 to <8 x half>* + %17 = load <8 x half>, <8 x half>* %16, align 4 + %mul16 = fmul fast <8 x half> %17, %.pre166 + %add17 = fadd fast <8 x half> %add, %mul16 + %arrayidx18 = getelementptr inbounds half, half* %pSamples.0150, i32 3 + %18 = bitcast half* %arrayidx18 to <8 x half>* + %19 = load <8 x half>, <8 x half>* %18, align 4 + %mul21 = fmul fast <8 x half> %19, %.pre168 + %add22 = fadd fast <8 x half> %add17, %mul21 + %20 = bitcast half* %pOutput.0149 to <8 x half>* + store <8 x half> %add22, <8 x half>* 
%20, align 4 + %add.ptr23 = getelementptr inbounds half, half* %pOutput.0149, i32 4 + %add.ptr24 = getelementptr inbounds half, half* %pSamples.0150, i32 4 + %dec = add nsw i32 %blkCnt.0147, -1 + %cmp9 = icmp eq i32 %dec, 0 + br i1 %cmp9, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + %scevgep157 = getelementptr half, half* %pSrc, i32 %7 + %scevgep159 = getelementptr half, half* %0, i32 %7 + br label %while.end + +while.end: ; preds = %if.then, %while.end.loopexit + %pTempSrc.0.lcssa = phi half* [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ] + %pOutput.0.lcssa = phi half* [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ] + %pSamples.0.lcssa = phi half* [ %scevgep159, %while.end.loopexit ], [ %0, %if.then ] + %pStateCur.0.lcssa = phi half* [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ] + %and = and i32 %blockSize, 3 + %21 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and) + %22 = bitcast half* %pTempSrc.0.lcssa to <8 x half>* + %23 = load <8 x half>, <8 x half>* %22, align 4 + %24 = bitcast half* %pStateCur.0.lcssa to <8 x half>* + tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %23, <8 x half>* %24, i32 4, <8 x i1> %21) + %25 = bitcast half* %pSamples.0.lcssa to <8 x half>* + %26 = load <8 x half>, <8 x half>* %25, align 4 + %27 = fmul fast <8 x half> %26, %.pre162 + %arrayidx29 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 1 + %28 = bitcast half* %arrayidx29 to <8 x half>* + %29 = load <8 x half>, <8 x half>* %28, align 4 + %mul32 = fmul fast <8 x half> %29, %.pre164 + %add33 = fadd fast <8 x half> %mul32, %27 + %arrayidx34 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 2 + %30 = bitcast half* %arrayidx34 to <8 x half>* + %31 = load <8 x half>, <8 x half>* %30, align 4 + %mul37 = fmul fast <8 x half> %31, %.pre166 + %add38 = fadd fast <8 x half> %add33, %mul37 + %arrayidx39 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 3 + %32 = bitcast half* %arrayidx39 to <8 x half>* + %33 = load <8 x half>, <8 x half>* %32, align 4 + %mul42 = fmul fast <8 x half> %33, %.pre168 + %add43 = fadd fast <8 x half> %add38, %mul42 + %34 = bitcast half* %pOutput.0.lcssa to <8 x half>* + tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %add43, <8 x half>* %34, i32 4, <8 x i1> %21) + %.pre = load half*, half** %pState1, align 4 + br label %if.end + +if.end: ; preds = %while.end, %entry + %35 = phi half* [ %.pre, %while.end ], [ %0, %entry ] + %arrayidx45 = getelementptr inbounds half, half* %35, i32 %blockSize + %shr47 = lshr i32 %conv, 2 + %cmp49141 = icmp eq i32 %shr47, 0 + br i1 %cmp49141, label %while.end55, label %while.body51.preheader + +while.body51.preheader: ; preds = %if.end + %36 = and i32 %conv, 65532 + %37 = add i32 %36, %blockSize + %scevgep = getelementptr half, half* %35, i32 %37 + br label %while.body51 + +while.body51: ; preds = %while.body51.preheader, %while.body51 + %pTempSrc.1144 = phi half* [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ] + %pTempDest.0143 = phi half* [ %add.ptr53, %while.body51 ], [ %35, %while.body51.preheader ] + %blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ] + %38 = bitcast half* %pTempSrc.1144 to <8 x half>* + %39 = load <8 x half>, <8 x half>* %38, align 4 + %40 = bitcast half* %pTempDest.0143 to <8 x half>* + store <8 x half> %39, <8 x half>* %40, align 4 + %add.ptr52 = getelementptr inbounds half, half* %pTempSrc.1144, i32 4 + %add.ptr53 = getelementptr inbounds half, 
half* %pTempDest.0143, i32 4 + %dec54 = add nsw i32 %blkCnt.1142, -1 + %cmp49 = icmp eq i32 %dec54, 0 + br i1 %cmp49, label %while.end55.loopexit, label %while.body51 + +while.end55.loopexit: ; preds = %while.body51 + %scevgep156 = getelementptr half, half* %35, i32 %36 + br label %while.end55 + +while.end55: ; preds = %while.end55.loopexit, %if.end + %pTempDest.0.lcssa = phi half* [ %35, %if.end ], [ %scevgep156, %while.end55.loopexit ] + %pTempSrc.1.lcssa = phi half* [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ] + %and56 = and i32 %conv, 3 + %cmp57 = icmp eq i32 %and56, 0 + br i1 %cmp57, label %if.end61, label %if.then59 + +if.then59: ; preds = %while.end55 + %41 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and56) + %42 = bitcast half* %pTempSrc.1.lcssa to <8 x half>* + %43 = load <8 x half>, <8 x half>* %42, align 4 + %44 = bitcast half* %pTempDest.0.lcssa to <8 x half>* + tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %43, <8 x half>* %44, i32 4, <8 x i1> %41) + br label %if.end61 + +if.end61: ; preds = %while.end55, %if.then59 + ret void +} + + +define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* nocapture readonly %pSrc, half* nocapture %pDst, i32 %blockSize) { +; CHECK-LABEL: fir: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #28 +; CHECK-NEXT: sub sp, #28 +; CHECK-NEXT: cmp r3, #8 +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: blo.w .LBB16_12 +; CHECK-NEXT: @ %bb.1: @ %if.then +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: cmp.w r7, r3, lsr #2 +; CHECK-NEXT: beq.w .LBB16_12 +; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph +; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: movs r1, #1 +; CHECK-NEXT: ldrd r5, r12, [r0, #4] +; CHECK-NEXT: lsr.w r11, r3, #2 +; CHECK-NEXT: sub.w r0, r4, #8 +; CHECK-NEXT: rsbs r3, r4, #0 +; CHECK-NEXT: add.w r7, r0, r0, lsr #29 +; CHECK-NEXT: and r0, r0, #7 +; CHECK-NEXT: asrs r6, r7, #3 +; CHECK-NEXT: cmp r6, #1 +; CHECK-NEXT: it gt +; CHECK-NEXT: asrgt r1, r7, #3 +; CHECK-NEXT: add.w r7, r5, r4, lsl #1 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: subs r1, r7, #2 +; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: add.w r3, r12, #16 +; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: adds r0, #1 +; CHECK-NEXT: str r4, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: b .LBB16_4 +; CHECK-NEXT: .LBB16_3: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: subs.w r11, r11, #1 +; CHECK-NEXT: vstrb.8 q0, [r2], #8 +; CHECK-NEXT: add.w r0, r7, r0, lsl #1 +; CHECK-NEXT: add.w r5, r0, #8 +; CHECK-NEXT: beq.w .LBB16_12 +; CHECK-NEXT: .LBB16_4: @ %while.body +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldrh.w lr, [r12, #14] +; CHECK-NEXT: vldrw.u32 q0, [r0], #8 +; CHECK-NEXT: ldrh.w r10, [r12, #12] +; CHECK-NEXT: ldrh.w r7, [r12, #10] +; CHECK-NEXT: ldrh.w r4, [r12, #8] +; CHECK-NEXT: ldrh.w r3, [r12, #6] +; CHECK-NEXT: ldrh.w r6, [r12, #4] +; CHECK-NEXT: ldrh.w r8, [r12, #2] +; CHECK-NEXT: ldrh.w r9, [r12] +; CHECK-NEXT: vstrb.8 q0, [r1], #8 +; CHECK-NEXT: vldrw.u32 q0, [r5] +; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: adds r0, r5, #2 
+; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmul.f16 q0, q0, r9 +; CHECK-NEXT: adds r0, r5, #6 +; CHECK-NEXT: vfma.f16 q0, q1, r8 +; CHECK-NEXT: vldrw.u32 q1, [r5, #4] +; CHECK-NEXT: vfma.f16 q0, q1, r6 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: add.w r0, r5, #10 +; CHECK-NEXT: vfma.f16 q0, q1, r3 +; CHECK-NEXT: vldrw.u32 q1, [r5, #8] +; CHECK-NEXT: vfma.f16 q0, q1, r4 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: add.w r0, r5, #14 +; CHECK-NEXT: vfma.f16 q0, q1, r7 +; CHECK-NEXT: vldrw.u32 q1, [r5, #12] +; CHECK-NEXT: add.w r7, r5, #16 +; CHECK-NEXT: vfma.f16 q0, q1, r10 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: vfma.f16 q0, q1, lr +; CHECK-NEXT: cmp r0, #16 +; CHECK-NEXT: blo .LBB16_7 +; CHECK-NEXT: @ %bb.5: @ %for.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr.w lr, [sp] @ 4-byte Reload +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: .LBB16_6: @ %for.body +; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: ldrh r0, [r6] +; CHECK-NEXT: vldrw.u32 q1, [r7] +; CHECK-NEXT: adds r3, r7, #2 +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: ldrh r0, [r6, #2] +; CHECK-NEXT: adds r3, r7, #6 +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: ldrh r0, [r6, #4] +; CHECK-NEXT: vldrw.u32 q1, [r7, #4] +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: ldrh r0, [r6, #6] +; CHECK-NEXT: add.w r3, r7, #10 +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: ldrh r0, [r6, #8] +; CHECK-NEXT: vldrw.u32 q1, [r7, #8] +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: ldrh r0, [r6, #10] +; CHECK-NEXT: ldrh r3, [r6, #14] +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: ldrh r0, [r6, #12] +; CHECK-NEXT: vldrw.u32 q1, [r7, #12] +; CHECK-NEXT: adds r6, #16 +; CHECK-NEXT: vfma.f16 q0, q1, r0 +; CHECK-NEXT: add.w r0, r7, #14 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: adds r7, #16 +; CHECK-NEXT: vfma.f16 q0, q1, r3 +; CHECK-NEXT: le lr, .LBB16_6 +; CHECK-NEXT: b .LBB16_8 +; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: .LBB16_8: @ %for.end +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: beq.w .LBB16_3 +; CHECK-NEXT: @ %bb.9: @ %while.body76.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: mov r5, r7 +; CHECK-NEXT: .LBB16_10: @ %while.body76 +; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: ldrh r3, [r6], #2 +; CHECK-NEXT: vldrh.u16 q1, [r5], #2 +; CHECK-NEXT: subs r0, #1 +; CHECK-NEXT: vfma.f16 q0, q1, r3 +; CHECK-NEXT: cmp r0, #1 +; CHECK-NEXT: bgt .LBB16_10 +; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: add.w r7, r7, r0, lsl #1 +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_12: @ %if.end +; CHECK-NEXT: add sp, #28 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +entry: + %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 + %0 = load half*, half** %pState1, align 4 + %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, 
%struct.arm_fir_instance_f32* %S, i32 0, i32 2 + %1 = load half*, half** %pCoeffs2, align 4 + %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0 + %2 = load i16, i16* %numTaps3, align 4 + %conv = zext i16 %2 to i32 + %cmp = icmp ugt i32 %blockSize, 7 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %shr = lshr i32 %blockSize, 2 + %cmp5217 = icmp eq i32 %shr, 0 + br i1 %cmp5217, label %if.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %if.then + %sub = add nsw i32 %conv, -1 + %arrayidx = getelementptr inbounds half, half* %0, i32 %sub + %incdec.ptr = getelementptr inbounds half, half* %1, i32 1 + %incdec.ptr7 = getelementptr inbounds half, half* %1, i32 2 + %incdec.ptr8 = getelementptr inbounds half, half* %1, i32 3 + %incdec.ptr9 = getelementptr inbounds half, half* %1, i32 4 + %incdec.ptr10 = getelementptr inbounds half, half* %1, i32 5 + %incdec.ptr11 = getelementptr inbounds half, half* %1, i32 6 + %incdec.ptr12 = getelementptr inbounds half, half* %1, i32 7 + %sub37 = add nsw i32 %conv, -8 + %div = sdiv i32 %sub37, 8 + %pCoeffsCur.0199 = getelementptr inbounds half, half* %1, i32 8 + %cmp38201 = icmp ugt i16 %2, 15 + %and = and i32 %sub37, 7 + %cmp74210 = icmp eq i32 %and, 0 + %idx.neg = sub nsw i32 0, %conv + %3 = icmp sgt i32 %div, 1 + %smax = select i1 %3, i32 %div, i32 1 + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.end + %blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ] + %pStateCur.0221 = phi half* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ] + %pSamples.0220 = phi half* [ %0, %while.body.lr.ph ], [ %add.ptr83, %while.end ] + %pTempSrc.0219 = phi half* [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ] + %pOutput.0218 = phi half* [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ] + %4 = load half, half* %1, align 4 + %5 = load half, half* %incdec.ptr, align 4 + %6 = load half, half* %incdec.ptr7, align 4 + %7 = load half, half* %incdec.ptr8, align 4 + %8 = load half, half* %incdec.ptr9, align 4 + %9 = load half, half* %incdec.ptr10, align 4 + %10 = load half, half* %incdec.ptr11, align 4 + %11 = load half, half* %incdec.ptr12, align 4 + %12 = bitcast half* %pTempSrc.0219 to <8 x half>* + %13 = load <8 x half>, <8 x half>* %12, align 4 + %14 = bitcast half* %pStateCur.0221 to <8 x half>* + store <8 x half> %13, <8 x half>* %14, align 4 + %add.ptr = getelementptr inbounds half, half* %pStateCur.0221, i32 4 + %add.ptr14 = getelementptr inbounds half, half* %pTempSrc.0219, i32 4 + %15 = bitcast half* %pSamples.0220 to <8 x half>* + %16 = load <8 x half>, <8 x half>* %15, align 4 + %.splatinsert = insertelement <8 x half> undef, half %4, i32 0 + %.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + %17 = fmul fast <8 x half> %16, %.splat + %arrayidx15 = getelementptr inbounds half, half* %pSamples.0220, i32 1 + %18 = bitcast half* %arrayidx15 to <8 x half>* + %19 = load <8 x half>, <8 x half>* %18, align 4 + %.splatinsert16 = insertelement <8 x half> undef, half %5, i32 0 + %.splat17 = shufflevector <8 x half> %.splatinsert16, <8 x half> undef, <8 x i32> zeroinitializer + %20 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %19, <8 x half> %.splat17, <8 x half> %17) + %arrayidx18 = getelementptr inbounds half, half* %pSamples.0220, i32 2 + %21 = bitcast half* %arrayidx18 to <8 x half>* + %22 = load <8 x half>, <8 x half>* %21, align 4 + %.splatinsert19 = 
insertelement <8 x half> undef, half %6, i32 0 + %.splat20 = shufflevector <8 x half> %.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer + %23 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %22, <8 x half> %.splat20, <8 x half> %20) + %arrayidx21 = getelementptr inbounds half, half* %pSamples.0220, i32 3 + %24 = bitcast half* %arrayidx21 to <8 x half>* + %25 = load <8 x half>, <8 x half>* %24, align 4 + %.splatinsert22 = insertelement <8 x half> undef, half %7, i32 0 + %.splat23 = shufflevector <8 x half> %.splatinsert22, <8 x half> undef, <8 x i32> zeroinitializer + %26 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %25, <8 x half> %.splat23, <8 x half> %23) + %arrayidx24 = getelementptr inbounds half, half* %pSamples.0220, i32 4 + %27 = bitcast half* %arrayidx24 to <8 x half>* + %28 = load <8 x half>, <8 x half>* %27, align 4 + %.splatinsert25 = insertelement <8 x half> undef, half %8, i32 0 + %.splat26 = shufflevector <8 x half> %.splatinsert25, <8 x half> undef, <8 x i32> zeroinitializer + %29 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %28, <8 x half> %.splat26, <8 x half> %26) + %arrayidx27 = getelementptr inbounds half, half* %pSamples.0220, i32 5 + %30 = bitcast half* %arrayidx27 to <8 x half>* + %31 = load <8 x half>, <8 x half>* %30, align 4 + %.splatinsert28 = insertelement <8 x half> undef, half %9, i32 0 + %.splat29 = shufflevector <8 x half> %.splatinsert28, <8 x half> undef, <8 x i32> zeroinitializer + %32 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %31, <8 x half> %.splat29, <8 x half> %29) + %arrayidx30 = getelementptr inbounds half, half* %pSamples.0220, i32 6 + %33 = bitcast half* %arrayidx30 to <8 x half>* + %34 = load <8 x half>, <8 x half>* %33, align 4 + %.splatinsert31 = insertelement <8 x half> undef, half %10, i32 0 + %.splat32 = shufflevector <8 x half> %.splatinsert31, <8 x half> undef, <8 x i32> zeroinitializer + %35 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %34, <8 x half> %.splat32, <8 x half> %32) + %arrayidx33 = getelementptr inbounds half, half* %pSamples.0220, i32 7 + %36 = bitcast half* %arrayidx33 to <8 x half>* + %37 = load <8 x half>, <8 x half>* %36, align 4 + %.splatinsert34 = insertelement <8 x half> undef, half %11, i32 0 + %.splat35 = shufflevector <8 x half> %.splatinsert34, <8 x half> undef, <8 x i32> zeroinitializer + %38 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %37, <8 x half> %.splat35, <8 x half> %35) + %pSamples.1200 = getelementptr inbounds half, half* %pSamples.0220, i32 8 + br i1 %cmp38201, label %for.body, label %for.end + +for.body: ; preds = %while.body, %for.body + %pSamples.1207 = phi half* [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ] + %pCoeffsCur.0206 = phi half* [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ] + %.pn205 = phi half* [ %pCoeffsCur.0206, %for.body ], [ %1, %while.body ] + %i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ] + %vecAcc0.0203 = phi <8 x half> [ %70, %for.body ], [ %38, %while.body ] + %pSamples.0.pn202 = phi half* [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ] + %incdec.ptr40 = getelementptr inbounds half, half* %.pn205, i32 9 + %39 = load half, half* %pCoeffsCur.0206, align 4 + %incdec.ptr41 = getelementptr inbounds half, half* %.pn205, i32 10 + %40 = load half, half* %incdec.ptr40, align 4 + %incdec.ptr42 = getelementptr inbounds half, half* %.pn205, i32 11 + %41 = load half, half* %incdec.ptr41, align 4 + %incdec.ptr43 = getelementptr inbounds half, half* %.pn205, i32 12 + 
%42 = load half, half* %incdec.ptr42, align 4 + %incdec.ptr44 = getelementptr inbounds half, half* %.pn205, i32 13 + %43 = load half, half* %incdec.ptr43, align 4 + %incdec.ptr45 = getelementptr inbounds half, half* %.pn205, i32 14 + %44 = load half, half* %incdec.ptr44, align 4 + %incdec.ptr46 = getelementptr inbounds half, half* %.pn205, i32 15 + %45 = load half, half* %incdec.ptr45, align 4 + %46 = load half, half* %incdec.ptr46, align 4 + %47 = bitcast half* %pSamples.1207 to <8 x half>* + %48 = load <8 x half>, <8 x half>* %47, align 4 + %.splatinsert48 = insertelement <8 x half> undef, half %39, i32 0 + %.splat49 = shufflevector <8 x half> %.splatinsert48, <8 x half> undef, <8 x i32> zeroinitializer + %49 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %48, <8 x half> %.splat49, <8 x half> %vecAcc0.0203) + %arrayidx50 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 9 + %50 = bitcast half* %arrayidx50 to <8 x half>* + %51 = load <8 x half>, <8 x half>* %50, align 4 + %.splatinsert51 = insertelement <8 x half> undef, half %40, i32 0 + %.splat52 = shufflevector <8 x half> %.splatinsert51, <8 x half> undef, <8 x i32> zeroinitializer + %52 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %51, <8 x half> %.splat52, <8 x half> %49) + %arrayidx53 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 10 + %53 = bitcast half* %arrayidx53 to <8 x half>* + %54 = load <8 x half>, <8 x half>* %53, align 4 + %.splatinsert54 = insertelement <8 x half> undef, half %41, i32 0 + %.splat55 = shufflevector <8 x half> %.splatinsert54, <8 x half> undef, <8 x i32> zeroinitializer + %55 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %54, <8 x half> %.splat55, <8 x half> %52) + %arrayidx56 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 11 + %56 = bitcast half* %arrayidx56 to <8 x half>* + %57 = load <8 x half>, <8 x half>* %56, align 4 + %.splatinsert57 = insertelement <8 x half> undef, half %42, i32 0 + %.splat58 = shufflevector <8 x half> %.splatinsert57, <8 x half> undef, <8 x i32> zeroinitializer + %58 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %57, <8 x half> %.splat58, <8 x half> %55) + %arrayidx59 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 12 + %59 = bitcast half* %arrayidx59 to <8 x half>* + %60 = load <8 x half>, <8 x half>* %59, align 4 + %.splatinsert60 = insertelement <8 x half> undef, half %43, i32 0 + %.splat61 = shufflevector <8 x half> %.splatinsert60, <8 x half> undef, <8 x i32> zeroinitializer + %61 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %60, <8 x half> %.splat61, <8 x half> %58) + %arrayidx62 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 13 + %62 = bitcast half* %arrayidx62 to <8 x half>* + %63 = load <8 x half>, <8 x half>* %62, align 4 + %.splatinsert63 = insertelement <8 x half> undef, half %44, i32 0 + %.splat64 = shufflevector <8 x half> %.splatinsert63, <8 x half> undef, <8 x i32> zeroinitializer + %64 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %63, <8 x half> %.splat64, <8 x half> %61) + %arrayidx65 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 14 + %65 = bitcast half* %arrayidx65 to <8 x half>* + %66 = load <8 x half>, <8 x half>* %65, align 4 + %.splatinsert66 = insertelement <8 x half> undef, half %45, i32 0 + %.splat67 = shufflevector <8 x half> %.splatinsert66, <8 x half> undef, <8 x i32> zeroinitializer + %67 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %66, <8 x half> %.splat67, <8 x half> %64) + %arrayidx68 = getelementptr inbounds 
half, half* %pSamples.0.pn202, i32 15 + %68 = bitcast half* %arrayidx68 to <8 x half>* + %69 = load <8 x half>, <8 x half>* %68, align 4 + %.splatinsert69 = insertelement <8 x half> undef, half %46, i32 0 + %.splat70 = shufflevector <8 x half> %.splatinsert69, <8 x half> undef, <8 x i32> zeroinitializer + %70 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %69, <8 x half> %.splat70, <8 x half> %67) + %inc = add nuw nsw i32 %i.0204, 1 + %pCoeffsCur.0 = getelementptr inbounds half, half* %pCoeffsCur.0206, i32 8 + %pSamples.1 = getelementptr inbounds half, half* %pSamples.1207, i32 8 + %exitcond = icmp eq i32 %inc, %smax + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %while.body + %vecAcc0.0.lcssa = phi <8 x half> [ %38, %while.body ], [ %70, %for.body ] + %pCoeffsCur.0.lcssa = phi half* [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ] + %pSamples.1.lcssa = phi half* [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ] + br i1 %cmp74210, label %while.end, label %while.body76 + +while.body76: ; preds = %for.end, %while.body76 + %pCoeffsCur.1214 = phi half* [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ] + %vecAcc0.1213 = phi <8 x half> [ %74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ] + %numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ] + %pSamples.2211 = phi half* [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ] + %incdec.ptr77 = getelementptr inbounds half, half* %pCoeffsCur.1214, i32 1 + %71 = load half, half* %pCoeffsCur.1214, align 4 + %72 = bitcast half* %pSamples.2211 to <8 x half>* + %73 = load <8 x half>, <8 x half>* %72, align 4 + %.splatinsert78 = insertelement <8 x half> undef, half %71, i32 0 + %.splat79 = shufflevector <8 x half> %.splatinsert78, <8 x half> undef, <8 x i32> zeroinitializer + %74 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %73, <8 x half> %.splat79, <8 x half> %vecAcc0.1213) + %incdec.ptr80 = getelementptr inbounds half, half* %pSamples.2211, i32 1 + %dec = add nsw i32 %numCnt.0212, -1 + %cmp74 = icmp sgt i32 %numCnt.0212, 1 + br i1 %cmp74, label %while.body76, label %while.end.loopexit + +while.end.loopexit: ; preds = %while.body76 + %scevgep = getelementptr half, half* %pSamples.1.lcssa, i32 %and + br label %while.end + +while.end: ; preds = %while.end.loopexit, %for.end + %pSamples.2.lcssa = phi half* [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ] + %vecAcc0.1.lcssa = phi <8 x half> [ %vecAcc0.0.lcssa, %for.end ], [ %74, %while.end.loopexit ] + %75 = bitcast half* %pOutput.0218 to <8 x half>* + store <8 x half> %vecAcc0.1.lcssa, <8 x half>* %75, align 4 + %add.ptr81 = getelementptr inbounds half, half* %pOutput.0218, i32 4 + %add.ptr82 = getelementptr inbounds half, half* %pSamples.2.lcssa, i32 4 + %add.ptr83 = getelementptr inbounds half, half* %add.ptr82, i32 %idx.neg + %dec84 = add nsw i32 %blkCnt.0222, -1 + %cmp5 = icmp eq i32 %dec84, 0 + br i1 %cmp5, label %if.end, label %while.body + +if.end: ; preds = %while.end, %if.then, %entry + ret void +} + +declare void @llvm.assume(i1) +declare <8 x i1> @llvm.arm.mve.vctp16(i32) +declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>) +declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32 immarg, <8 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll new file mode 100644 index 0000000000000..417c32d646238 --- /dev/null +++ 
b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -0,0 +1,1418 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s + +define arm_aapcs_vfpcc void @test_fadd(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fadd: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.f32 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fadd fast <4 x float> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fadd_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fadd_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vadd.f32 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB1_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fadd fast <4 x float> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, 
align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmul(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fmul: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.f32 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB2_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmul_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fmul_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.f32 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB3_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 
%index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fsub(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fsub: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vsub.f32 q1, q1, q0 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB4_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fsub fast <4 x float> %wide.load, %broadcast.splat11 + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fsub_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) { +; CHECK-LABEL: test_fsub_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vdup.32 q0, r3 +; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vsub.f32 q1, q0, q1 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB5_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp18 = icmp sgt i32 %n, 0 + br i1 %cmp18, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0 + %broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fsub fast <4 x float> %broadcast.splat11, %wide.load + %4 = getelementptr inbounds float, float* %C, i32 %index + %5 = bitcast float* %4 to <4 x float>* + store <4 x float> %3, <4 x float>* %5, align 4 + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.cond.cleanup, label 
%vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define arm_aapcs_vfpcc void @test_fmas(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmas: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: bne .LBB6_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = getelementptr inbounds float, float* %B, i32 %index + %4 = bitcast float* %3 to <4 x float>* + %wide.load12 = load <4 x float>, <4 x float>* %4, align 4 + %5 = fmul fast <4 x float> %wide.load12, %wide.load + %6 = fadd fast <4 x float> %5, %broadcast.splat14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmas_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmas_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB7_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: bne .LBB7_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = 
load <4 x float>, <4 x float>* %2, align 4 + %3 = getelementptr inbounds float, float* %B, i32 %index + %4 = bitcast float* %3 to <4 x float>* + %wide.load12 = load <4 x float>, <4 x float>* %4, align 4 + %5 = fmul fast <4 x float> %wide.load12, %wide.load + %6 = fadd fast <4 x float> %broadcast.splat14, %5 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fma(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fma: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q2, q1, q0 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: bne .LBB8_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %wide.load, %broadcast.splat13 + %4 = getelementptr inbounds float, float* %B, i32 %index + %5 = bitcast float* %4 to <4 x float>* + %wide.load14 = load <4 x float>, <4 x float>* %5, align 4 + %6 = fadd fast <4 x float> %3, %wide.load14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fma_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fma_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB9_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q2, q0, q1 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 +; CHECK-NEXT: bne .LBB9_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, 
label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %broadcast.splat13, %wide.load + %4 = getelementptr inbounds float, float* %B, i32 %index + %5 = bitcast float* %4 to <4 x float>* + %wide.load14 = load <4 x float>, <4 x float>* %5, align 4 + %6 = fadd fast <4 x float> %3, %wide.load14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define arm_aapcs_vfpcc void @test_fmss(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmss: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: vneg.f32 q0, q0 +; CHECK-NEXT: .LBB10_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfma.f32 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: bne .LBB10_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = getelementptr inbounds float, float* %B, i32 %index + %4 = bitcast float* %3 to <4 x float>* + %wide.load12 = load <4 x float>, <4 x float>* %4, align 4 + %5 = fmul fast <4 x float> %wide.load12, %wide.load + %6 = fsub fast <4 x float> %5, %broadcast.splat14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fmss_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fmss_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; 
CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB11_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vldrw.u32 q2, [r1], #16 +; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vfms.f32 q3, q2, q1 +; CHECK-NEXT: vstrb.8 q3, [r2], #16 +; CHECK-NEXT: bne .LBB11_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = getelementptr inbounds float, float* %B, i32 %index + %4 = bitcast float* %3 to <4 x float>* + %wide.load12 = load <4 x float>, <4 x float>* %4, align 4 + %5 = fmul fast <4 x float> %wide.load12, %wide.load + %6 = fsub fast <4 x float> %broadcast.splat14, %5 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fms(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fms: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB12_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vneg.f32 q1, q1 +; CHECK-NEXT: vfma.f32 q1, q2, q0 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB12_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %wide.load, %broadcast.splat13 + %4 = getelementptr inbounds float, float* %B, i32 %index + %5 = bitcast float* %4 to <4 x float>* + %wide.load14 = load <4 x float>, <4 x float>* %5, align 4 + %6 = fsub fast <4 x float> %3, %wide.load14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = 
bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +define arm_aapcs_vfpcc void @test_fms_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) { +; CHECK-LABEL: test_fms_r: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: vmov r12, s0 +; CHECK-NEXT: vdup.32 q0, r12 +; CHECK-NEXT: .LBB13_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vldrw.u32 q2, [r0], #16 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vneg.f32 q1, q1 +; CHECK-NEXT: vfma.f32 q1, q0, q2 +; CHECK-NEXT: vstrb.8 q1, [r2], #16 +; CHECK-NEXT: bne .LBB13_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: bx lr +entry: + %0 = and i32 %n, 7 + %cmp = icmp eq i32 %0, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %entry + %broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0 + %broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds float, float* %A, i32 %index + %2 = bitcast float* %1 to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %broadcast.splat13, %wide.load + %4 = getelementptr inbounds float, float* %B, i32 %index + %5 = bitcast float* %4 to <4 x float>* + %wide.load14 = load <4 x float>, <4 x float>* %5, align 4 + %6 = fsub fast <4 x float> %3, %wide.load14 + %7 = getelementptr inbounds float, float* %D, i32 %index + %8 = bitcast float* %7 to <4 x float>* + store <4 x float> %6, <4 x float>* %8, align 4 + %index.next = add i32 %index, 4 + %9 = icmp eq i32 %index.next, %n + br i1 %9, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + + +define dso_local void @test_nested(float* noalias nocapture %pInT1, float* noalias nocapture readonly %pOutT1, float* noalias nocapture readonly %pPRT_in, float* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, float %in) local_unnamed_addr #0 { +; CHECK-LABEL: test_nested: +; CHECK: @ %bb.0: @ %for.body.us.preheader +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: ldrd lr, r12, [sp, #20] +; CHECK-NEXT: lsl.w r3, r12, #2 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB14_1: @ %for.body.us +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB14_2 Depth 2 +; CHECK-NEXT: ldr r4, [r1] +; CHECK-NEXT: mov r5, r12 +; CHECK-NEXT: vdup.32 q0, r4 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: .LBB14_2: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: adds r6, r0, r4 +; CHECK-NEXT: adds r7, r2, r4 +; CHECK-NEXT: vldrw.u32 q1, [r7] +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: adds r4, #16 +; CHECK-NEXT: subs r5, #4 +; CHECK-NEXT: vfms.f32 q2, q1, q0 +; CHECK-NEXT: vstrw.32 q2, [r6] +; CHECK-NEXT: bne .LBB14_2 +; CHECK-NEXT: @ %bb.3: @ 
%for.cond6.for.end_crit_edge.us +; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1 +; CHECK-NEXT: add r0, r3 +; CHECK-NEXT: add r2, r3 +; CHECK-NEXT: adds r1, #4 +; CHECK-NEXT: le lr, .LBB14_1 +; CHECK-NEXT: @ %bb.4: @ %for.end14 +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +for.body.us.preheader: + %cmp = icmp sgt i32 %numRows, 0 + tail call void @llvm.assume(i1 %cmp) + %cmp1 = icmp sgt i32 %numCols, 0 + tail call void @llvm.assume(i1 %cmp1) + %rem = and i32 %numCols, 7 + %cmp2 = icmp eq i32 %rem, 0 + tail call void @llvm.assume(i1 %cmp2) + %cmp3 = icmp slt i32 %l, %numCols + tail call void @llvm.assume(i1 %cmp3) + br label %for.body.us + +for.body.us: ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader + %pInT1.addr.038.us = phi float* [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ] + %i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ] + %pOutT1.addr.036.us = phi float* [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ] + %pPRT_in.addr.035.us = phi float* [ %scevgep, %for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ] + %scevgep = getelementptr float, float* %pPRT_in.addr.035.us, i32 %numCols + %0 = load float, float* %pOutT1.addr.036.us, align 4 + %broadcast.splatinsert47 = insertelement <4 x float> undef, float %0, i32 0 + %broadcast.splat48 = shufflevector <4 x float> %broadcast.splatinsert47, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %for.body.us + %index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ] + %next.gep = getelementptr float, float* %pInT1.addr.038.us, i32 %index + %next.gep45 = getelementptr float, float* %pPRT_in.addr.035.us, i32 %index + %1 = bitcast float* %next.gep to <4 x float>* + %wide.load = load <4 x float>, <4 x float>* %1, align 4 + %2 = bitcast float* %next.gep45 to <4 x float>* + %wide.load46 = load <4 x float>, <4 x float>* %2, align 4 + %3 = fmul fast <4 x float> %wide.load46, %broadcast.splat48 + %4 = fsub fast <4 x float> %wide.load, %3 + store <4 x float> %4, <4 x float>* %1, align 4 + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, %numCols + br i1 %5, label %for.cond6.for.end_crit_edge.us, label %vector.body + +for.cond6.for.end_crit_edge.us: ; preds = %vector.body + %incdec.ptr.us = getelementptr inbounds float, float* %pOutT1.addr.036.us, i32 1 + %scevgep40 = getelementptr float, float* %pInT1.addr.038.us, i32 %numCols + %inc13.us = add nuw nsw i32 %i.037.us, 1 + %exitcond41 = icmp eq i32 %inc13.us, %numRows + br i1 %exitcond41, label %for.end14, label %for.body.us + +for.end14: ; preds = %for.cond6.for.end_crit_edge.us + ret void +} + +%struct.arm_fir_instance_f32 = type { i16, float*, float* } +define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonly %S, float* nocapture readonly %pSrc, float* %pDst, i32 %blockSize) { +; CHECK-LABEL: arm_fir_f32_1_4_mve: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: ldrh.w r9, [r0] +; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: sub.w r7, r9, #1 +; CHECK-NEXT: cmp r7, #3 +; CHECK-NEXT: bhi .LBB15_6 +; CHECK-NEXT: @ %bb.1: @ %if.then +; CHECK-NEXT: ldr r6, [r0, #8] +; CHECK-NEXT: add.w r4, r12, r7, lsl #2 +; CHECK-NEXT: ldrd lr, r8, [r6] +; 
CHECK-NEXT: ldrd r5, r6, [r6, #8] +; CHECK-NEXT: vdup.32 q3, lr +; CHECK-NEXT: vdup.32 q2, r8 +; CHECK-NEXT: vdup.32 q0, r6 +; CHECK-NEXT: vdup.32 q1, r5 +; CHECK-NEXT: lsr.w lr, r3, #2 +; CHECK-NEXT: wls lr, lr, .LBB15_5 +; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph +; CHECK-NEXT: bic r10, r3, #3 +; CHECK-NEXT: movs r6, #0 +; CHECK-NEXT: add.w r8, r2, r10, lsl #2 +; CHECK-NEXT: .LBB15_3: @ %while.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: adds r7, r1, r6 +; CHECK-NEXT: adds r5, r2, r6 +; CHECK-NEXT: vldrw.u32 q4, [r7] +; CHECK-NEXT: adds r7, r4, r6 +; CHECK-NEXT: vstrw.32 q4, [r7] +; CHECK-NEXT: add.w r7, r12, r6 +; CHECK-NEXT: vldrw.u32 q4, [r7] +; CHECK-NEXT: vldrw.u32 q5, [r7, #4] +; CHECK-NEXT: vldrw.u32 q6, [r7, #12] +; CHECK-NEXT: adds r6, #16 +; CHECK-NEXT: vmul.f32 q4, q4, q3 +; CHECK-NEXT: vfma.f32 q4, q5, q2 +; CHECK-NEXT: vldrw.u32 q5, [r7, #8] +; CHECK-NEXT: vfma.f32 q4, q5, q1 +; CHECK-NEXT: vfma.f32 q4, q6, q0 +; CHECK-NEXT: vstrw.32 q4, [r5] +; CHECK-NEXT: le lr, .LBB15_3 +; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit +; CHECK-NEXT: add r4, r6 +; CHECK-NEXT: add.w r12, r12, r10, lsl #2 +; CHECK-NEXT: add.w r1, r1, r10, lsl #2 +; CHECK-NEXT: mov r2, r8 +; CHECK-NEXT: .LBB15_5: @ %while.end +; CHECK-NEXT: and r7, r3, #3 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vctp.32 r7 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q4, [r4] +; CHECK-NEXT: vldrw.u32 q4, [r12] +; CHECK-NEXT: vmul.f32 q3, q4, q3 +; CHECK-NEXT: vldrw.u32 q4, [r12, #4] +; CHECK-NEXT: vfma.f32 q3, q4, q2 +; CHECK-NEXT: vldrw.u32 q2, [r12, #8] +; CHECK-NEXT: vfma.f32 q3, q2, q1 +; CHECK-NEXT: vldrw.u32 q1, [r12, #12] +; CHECK-NEXT: vfma.f32 q3, q1, q0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q3, [r2] +; CHECK-NEXT: ldr.w r12, [r0, #4] +; CHECK-NEXT: .LBB15_6: @ %if.end +; CHECK-NEXT: add.w r0, r12, r3, lsl #2 +; CHECK-NEXT: lsr.w lr, r9, #2 +; CHECK-NEXT: wls lr, lr, .LBB15_10 +; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader +; CHECK-NEXT: bic r2, r9, #3 +; CHECK-NEXT: adds r1, r2, r3 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: add.w r1, r12, r1, lsl #2 +; CHECK-NEXT: .LBB15_8: @ %while.body51 +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: vstrb.8 q0, [r3], #16 +; CHECK-NEXT: le lr, .LBB15_8 +; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit +; CHECK-NEXT: add.w r12, r12, r2, lsl #2 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: .LBB15_10: @ %while.end55 +; CHECK-NEXT: ands r1, r9, #3 +; CHECK-NEXT: beq .LBB15_12 +; CHECK-NEXT: @ %bb.11: @ %if.then59 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vctp.32 r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q0, [r12] +; CHECK-NEXT: .LBB15_12: @ %if.end61 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +entry: + %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 + %0 = load float*, float** %pState1, align 4 + %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2 + %1 = load float*, float** %pCoeffs2, align 4 + %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0 + %2 = load i16, i16* %numTaps3, align 4 + %conv = zext i16 %2 to i32 + %sub = add nsw i32 %conv, -1 + %cmp = icmp ult i32 %sub, 4 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %arrayidx = getelementptr inbounds float, float* %0, i32 %sub + %incdec.ptr = getelementptr 
inbounds float, float* %1, i32 1 + %3 = load float, float* %1, align 4 + %incdec.ptr6 = getelementptr inbounds float, float* %1, i32 2 + %4 = load float, float* %incdec.ptr, align 4 + %incdec.ptr7 = getelementptr inbounds float, float* %1, i32 3 + %5 = load float, float* %incdec.ptr6, align 4 + %6 = load float, float* %incdec.ptr7, align 4 + %shr = lshr i32 %blockSize, 2 + %cmp9146 = icmp eq i32 %shr, 0 + %.pre161 = insertelement <4 x float> undef, float %3, i32 0 + %.pre162 = shufflevector <4 x float> %.pre161, <4 x float> undef, <4 x i32> zeroinitializer + %.pre163 = insertelement <4 x float> undef, float %4, i32 0 + %.pre164 = shufflevector <4 x float> %.pre163, <4 x float> undef, <4 x i32> zeroinitializer + %.pre165 = insertelement <4 x float> undef, float %5, i32 0 + %.pre166 = shufflevector <4 x float> %.pre165, <4 x float> undef, <4 x i32> zeroinitializer + %.pre167 = insertelement <4 x float> undef, float %6, i32 0 + %.pre168 = shufflevector <4 x float> %.pre167, <4 x float> undef, <4 x i32> zeroinitializer + br i1 %cmp9146, label %while.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %if.then + %7 = and i32 %blockSize, -4 + %scevgep158 = getelementptr float, float* %pDst, i32 %7 + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.body + %pStateCur.0151 = phi float* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ] + %pSamples.0150 = phi float* [ %0, %while.body.lr.ph ], [ %add.ptr24, %while.body ] + %pOutput.0149 = phi float* [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ] + %pTempSrc.0148 = phi float* [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ] + %blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ] + %8 = bitcast float* %pTempSrc.0148 to <4 x float>* + %9 = load <4 x float>, <4 x float>* %8, align 4 + %10 = bitcast float* %pStateCur.0151 to <4 x float>* + store <4 x float> %9, <4 x float>* %10, align 4 + %add.ptr = getelementptr inbounds float, float* %pStateCur.0151, i32 4 + %add.ptr11 = getelementptr inbounds float, float* %pTempSrc.0148, i32 4 + %11 = bitcast float* %pSamples.0150 to <4 x float>* + %12 = load <4 x float>, <4 x float>* %11, align 4 + %13 = fmul fast <4 x float> %12, %.pre162 + %arrayidx12 = getelementptr inbounds float, float* %pSamples.0150, i32 1 + %14 = bitcast float* %arrayidx12 to <4 x float>* + %15 = load <4 x float>, <4 x float>* %14, align 4 + %mul = fmul fast <4 x float> %15, %.pre164 + %add = fadd fast <4 x float> %mul, %13 + %arrayidx13 = getelementptr inbounds float, float* %pSamples.0150, i32 2 + %16 = bitcast float* %arrayidx13 to <4 x float>* + %17 = load <4 x float>, <4 x float>* %16, align 4 + %mul16 = fmul fast <4 x float> %17, %.pre166 + %add17 = fadd fast <4 x float> %add, %mul16 + %arrayidx18 = getelementptr inbounds float, float* %pSamples.0150, i32 3 + %18 = bitcast float* %arrayidx18 to <4 x float>* + %19 = load <4 x float>, <4 x float>* %18, align 4 + %mul21 = fmul fast <4 x float> %19, %.pre168 + %add22 = fadd fast <4 x float> %add17, %mul21 + %20 = bitcast float* %pOutput.0149 to <4 x float>* + store <4 x float> %add22, <4 x float>* %20, align 4 + %add.ptr23 = getelementptr inbounds float, float* %pOutput.0149, i32 4 + %add.ptr24 = getelementptr inbounds float, float* %pSamples.0150, i32 4 + %dec = add nsw i32 %blkCnt.0147, -1 + %cmp9 = icmp eq i32 %dec, 0 + br i1 %cmp9, label %while.end.loopexit, label %while.body + +while.end.loopexit: ; preds = %while.body + %scevgep157 = getelementptr float, float* %pSrc, i32 %7 + %scevgep159 = getelementptr 
float, float* %0, i32 %7 + br label %while.end + +while.end: ; preds = %if.then, %while.end.loopexit + %pTempSrc.0.lcssa = phi float* [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ] + %pOutput.0.lcssa = phi float* [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ] + %pSamples.0.lcssa = phi float* [ %scevgep159, %while.end.loopexit ], [ %0, %if.then ] + %pStateCur.0.lcssa = phi float* [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ] + %and = and i32 %blockSize, 3 + %21 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %and) + %22 = bitcast float* %pTempSrc.0.lcssa to <4 x float>* + %23 = load <4 x float>, <4 x float>* %22, align 4 + %24 = bitcast float* %pStateCur.0.lcssa to <4 x float>* + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %23, <4 x float>* %24, i32 4, <4 x i1> %21) + %25 = bitcast float* %pSamples.0.lcssa to <4 x float>* + %26 = load <4 x float>, <4 x float>* %25, align 4 + %27 = fmul fast <4 x float> %26, %.pre162 + %arrayidx29 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 1 + %28 = bitcast float* %arrayidx29 to <4 x float>* + %29 = load <4 x float>, <4 x float>* %28, align 4 + %mul32 = fmul fast <4 x float> %29, %.pre164 + %add33 = fadd fast <4 x float> %mul32, %27 + %arrayidx34 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 2 + %30 = bitcast float* %arrayidx34 to <4 x float>* + %31 = load <4 x float>, <4 x float>* %30, align 4 + %mul37 = fmul fast <4 x float> %31, %.pre166 + %add38 = fadd fast <4 x float> %add33, %mul37 + %arrayidx39 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 3 + %32 = bitcast float* %arrayidx39 to <4 x float>* + %33 = load <4 x float>, <4 x float>* %32, align 4 + %mul42 = fmul fast <4 x float> %33, %.pre168 + %add43 = fadd fast <4 x float> %add38, %mul42 + %34 = bitcast float* %pOutput.0.lcssa to <4 x float>* + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %add43, <4 x float>* %34, i32 4, <4 x i1> %21) + %.pre = load float*, float** %pState1, align 4 + br label %if.end + +if.end: ; preds = %while.end, %entry + %35 = phi float* [ %.pre, %while.end ], [ %0, %entry ] + %arrayidx45 = getelementptr inbounds float, float* %35, i32 %blockSize + %shr47 = lshr i32 %conv, 2 + %cmp49141 = icmp eq i32 %shr47, 0 + br i1 %cmp49141, label %while.end55, label %while.body51.preheader + +while.body51.preheader: ; preds = %if.end + %36 = and i32 %conv, 65532 + %37 = add i32 %36, %blockSize + %scevgep = getelementptr float, float* %35, i32 %37 + br label %while.body51 + +while.body51: ; preds = %while.body51.preheader, %while.body51 + %pTempSrc.1144 = phi float* [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ] + %pTempDest.0143 = phi float* [ %add.ptr53, %while.body51 ], [ %35, %while.body51.preheader ] + %blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ] + %38 = bitcast float* %pTempSrc.1144 to <4 x float>* + %39 = load <4 x float>, <4 x float>* %38, align 4 + %40 = bitcast float* %pTempDest.0143 to <4 x float>* + store <4 x float> %39, <4 x float>* %40, align 4 + %add.ptr52 = getelementptr inbounds float, float* %pTempSrc.1144, i32 4 + %add.ptr53 = getelementptr inbounds float, float* %pTempDest.0143, i32 4 + %dec54 = add nsw i32 %blkCnt.1142, -1 + %cmp49 = icmp eq i32 %dec54, 0 + br i1 %cmp49, label %while.end55.loopexit, label %while.body51 + +while.end55.loopexit: ; preds = %while.body51 + %scevgep156 = getelementptr float, float* %35, i32 %36 + br label %while.end55 + +while.end55: ; preds = %while.end55.loopexit, 
%if.end + %pTempDest.0.lcssa = phi float* [ %35, %if.end ], [ %scevgep156, %while.end55.loopexit ] + %pTempSrc.1.lcssa = phi float* [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ] + %and56 = and i32 %conv, 3 + %cmp57 = icmp eq i32 %and56, 0 + br i1 %cmp57, label %if.end61, label %if.then59 + +if.then59: ; preds = %while.end55 + %41 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %and56) + %42 = bitcast float* %pTempSrc.1.lcssa to <4 x float>* + %43 = load <4 x float>, <4 x float>* %42, align 4 + %44 = bitcast float* %pTempDest.0.lcssa to <4 x float>* + tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %43, <4 x float>* %44, i32 4, <4 x i1> %41) + br label %if.end61 + +if.end61: ; preds = %while.end55, %if.then59 + ret void +} + + +define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* nocapture readonly %pSrc, float* nocapture %pDst, i32 %blockSize) { +; CHECK-LABEL: fir: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: .pad #40 +; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: cmp r3, #8 +; CHECK-NEXT: blo.w .LBB16_12 +; CHECK-NEXT: @ %bb.1: @ %if.then +; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: cmp.w r7, r3, lsr #2 +; CHECK-NEXT: beq.w .LBB16_12 +; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph +; CHECK-NEXT: ldrh r4, [r0] +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: ldrd r6, r12, [r0, #4] +; CHECK-NEXT: lsrs r3, r3, #2 +; CHECK-NEXT: sub.w r0, r4, #8 +; CHECK-NEXT: add.w r7, r0, r0, lsr #29 +; CHECK-NEXT: and r0, r0, #7 +; CHECK-NEXT: asr.w lr, r7, #3 +; CHECK-NEXT: cmp.w lr, #1 +; CHECK-NEXT: it gt +; CHECK-NEXT: asrgt r5, r7, #3 +; CHECK-NEXT: add.w r7, r6, r4, lsl #2 +; CHECK-NEXT: sub.w r11, r7, #4 +; CHECK-NEXT: rsbs r7, r4, #0 +; CHECK-NEXT: str r7, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: add.w r7, r12, #32 +; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: adds r0, #1 +; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r4, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r7, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: b .LBB16_4 +; CHECK-NEXT: .LBB16_3: @ %while.end +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldrd r11, r3, [sp, #28] @ 8-byte Folded Reload +; CHECK-NEXT: vstrb.8 q0, [r2], #16 +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: subs r3, #1 +; CHECK-NEXT: add.w r0, r8, r0, lsl #2 +; CHECK-NEXT: add.w r6, r0, #16 +; CHECK-NEXT: beq .LBB16_12 +; CHECK-NEXT: .LBB16_4: @ %while.body +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB16_6 Depth 2 +; CHECK-NEXT: @ Child Loop BB16_10 Depth 2 +; CHECK-NEXT: add.w lr, r12, #12 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: ldm.w r12, {r0, r5, r7} +; CHECK-NEXT: ldm.w lr, {r4, r9, lr} +; CHECK-NEXT: ldrd r8, r10, [r12, #24] +; CHECK-NEXT: vstrb.8 q0, [r11], #16 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: vldrw.u32 q1, [r6, #4] +; CHECK-NEXT: vldrw.u32 q6, [r6, #8] +; CHECK-NEXT: vldrw.u32 q4, [r6, #12] +; CHECK-NEXT: vmul.f32 q0, q0, r0 +; CHECK-NEXT: vldrw.u32 q5, [r6, #16] +; CHECK-NEXT: vfma.f32 q0, q1, r5 +; CHECK-NEXT: vldrw.u32 q2, [r6, #20] +; CHECK-NEXT: vfma.f32 q0, q6, r7 +; CHECK-NEXT: vldrw.u32 q3, [r6, #24] +; CHECK-NEXT: vfma.f32 q0, q4, r4 +; 
CHECK-NEXT: vldrw.u32 q1, [r6, #28] +; CHECK-NEXT: vfma.f32 q0, q5, r9 +; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: vfma.f32 q0, q2, lr +; CHECK-NEXT: add.w r5, r6, #32 +; CHECK-NEXT: vfma.f32 q0, q3, r8 +; CHECK-NEXT: cmp r0, #16 +; CHECK-NEXT: vfma.f32 q0, q1, r10 +; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: strd r11, r3, [sp, #28] @ 8-byte Folded Spill +; CHECK-NEXT: blo .LBB16_7 +; CHECK-NEXT: @ %bb.5: @ %for.body.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr.w lr, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: .LBB16_6: @ %for.body +; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: ldm.w r6, {r0, r3, r4, r7, r10, r11} +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: vldrw.u32 q6, [r5, #8] +; CHECK-NEXT: vldrw.u32 q4, [r5, #12] +; CHECK-NEXT: vldrw.u32 q5, [r5, #16] +; CHECK-NEXT: vldrw.u32 q2, [r5, #20] +; CHECK-NEXT: vfma.f32 q0, q1, r0 +; CHECK-NEXT: vldrw.u32 q1, [r5, #4] +; CHECK-NEXT: ldrd r1, r9, [r6, #24] +; CHECK-NEXT: vldrw.u32 q3, [r5, #24] +; CHECK-NEXT: vfma.f32 q0, q1, r3 +; CHECK-NEXT: vldrw.u32 q1, [r5, #28] +; CHECK-NEXT: vfma.f32 q0, q6, r4 +; CHECK-NEXT: add.w r8, r5, #32 +; CHECK-NEXT: vfma.f32 q0, q4, r7 +; CHECK-NEXT: adds r6, #32 +; CHECK-NEXT: vfma.f32 q0, q5, r10 +; CHECK-NEXT: vfma.f32 q0, q2, r11 +; CHECK-NEXT: mov r5, r8 +; CHECK-NEXT: vfma.f32 q0, q3, r1 +; CHECK-NEXT: vfma.f32 q0, q1, r9 +; CHECK-NEXT: le lr, .LBB16_6 +; CHECK-NEXT: b .LBB16_8 +; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: mov r8, r5 +; CHECK-NEXT: .LBB16_8: @ %for.end +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: beq .LBB16_3 +; CHECK-NEXT: @ %bb.9: @ %while.body76.preheader +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: .LBB16_10: @ %while.body76 +; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: ldr r1, [r6], #4 +; CHECK-NEXT: vldrw.u32 q1, [r0], #4 +; CHECK-NEXT: subs r5, #1 +; CHECK-NEXT: vfma.f32 q0, q1, r1 +; CHECK-NEXT: cmp r5, #1 +; CHECK-NEXT: bgt .LBB16_10 +; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit +; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1 +; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: add.w r8, r8, r0, lsl #2 +; CHECK-NEXT: b .LBB16_3 +; CHECK-NEXT: .LBB16_12: @ %if.end +; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +entry: + %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1 + %0 = load float*, float** %pState1, align 4 + %pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2 + %1 = load float*, float** %pCoeffs2, align 4 + %numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0 + %2 = load i16, i16* %numTaps3, align 4 + %conv = zext i16 %2 to i32 + %cmp = icmp ugt i32 %blockSize, 7 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + %shr = lshr i32 %blockSize, 2 + %cmp5217 = icmp eq i32 %shr, 0 + br i1 %cmp5217, label %if.end, label 
%while.body.lr.ph + +while.body.lr.ph: ; preds = %if.then + %sub = add nsw i32 %conv, -1 + %arrayidx = getelementptr inbounds float, float* %0, i32 %sub + %incdec.ptr = getelementptr inbounds float, float* %1, i32 1 + %incdec.ptr7 = getelementptr inbounds float, float* %1, i32 2 + %incdec.ptr8 = getelementptr inbounds float, float* %1, i32 3 + %incdec.ptr9 = getelementptr inbounds float, float* %1, i32 4 + %incdec.ptr10 = getelementptr inbounds float, float* %1, i32 5 + %incdec.ptr11 = getelementptr inbounds float, float* %1, i32 6 + %incdec.ptr12 = getelementptr inbounds float, float* %1, i32 7 + %sub37 = add nsw i32 %conv, -8 + %div = sdiv i32 %sub37, 8 + %pCoeffsCur.0199 = getelementptr inbounds float, float* %1, i32 8 + %cmp38201 = icmp ugt i16 %2, 15 + %and = and i32 %sub37, 7 + %cmp74210 = icmp eq i32 %and, 0 + %idx.neg = sub nsw i32 0, %conv + %3 = icmp sgt i32 %div, 1 + %smax = select i1 %3, i32 %div, i32 1 + br label %while.body + +while.body: ; preds = %while.body.lr.ph, %while.end + %blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ] + %pStateCur.0221 = phi float* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ] + %pSamples.0220 = phi float* [ %0, %while.body.lr.ph ], [ %add.ptr83, %while.end ] + %pTempSrc.0219 = phi float* [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ] + %pOutput.0218 = phi float* [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ] + %4 = load float, float* %1, align 4 + %5 = load float, float* %incdec.ptr, align 4 + %6 = load float, float* %incdec.ptr7, align 4 + %7 = load float, float* %incdec.ptr8, align 4 + %8 = load float, float* %incdec.ptr9, align 4 + %9 = load float, float* %incdec.ptr10, align 4 + %10 = load float, float* %incdec.ptr11, align 4 + %11 = load float, float* %incdec.ptr12, align 4 + %12 = bitcast float* %pTempSrc.0219 to <4 x float>* + %13 = load <4 x float>, <4 x float>* %12, align 4 + %14 = bitcast float* %pStateCur.0221 to <4 x float>* + store <4 x float> %13, <4 x float>* %14, align 4 + %add.ptr = getelementptr inbounds float, float* %pStateCur.0221, i32 4 + %add.ptr14 = getelementptr inbounds float, float* %pTempSrc.0219, i32 4 + %15 = bitcast float* %pSamples.0220 to <4 x float>* + %16 = load <4 x float>, <4 x float>* %15, align 4 + %.splatinsert = insertelement <4 x float> undef, float %4, i32 0 + %.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + %17 = fmul fast <4 x float> %16, %.splat + %arrayidx15 = getelementptr inbounds float, float* %pSamples.0220, i32 1 + %18 = bitcast float* %arrayidx15 to <4 x float>* + %19 = load <4 x float>, <4 x float>* %18, align 4 + %.splatinsert16 = insertelement <4 x float> undef, float %5, i32 0 + %.splat17 = shufflevector <4 x float> %.splatinsert16, <4 x float> undef, <4 x i32> zeroinitializer + %20 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %19, <4 x float> %.splat17, <4 x float> %17) + %arrayidx18 = getelementptr inbounds float, float* %pSamples.0220, i32 2 + %21 = bitcast float* %arrayidx18 to <4 x float>* + %22 = load <4 x float>, <4 x float>* %21, align 4 + %.splatinsert19 = insertelement <4 x float> undef, float %6, i32 0 + %.splat20 = shufflevector <4 x float> %.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer + %23 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %22, <4 x float> %.splat20, <4 x float> %20) + %arrayidx21 = getelementptr inbounds float, float* %pSamples.0220, i32 3 + %24 = bitcast float* %arrayidx21 to <4 x float>* + %25 = load <4 x float>, <4 x 
float>* %24, align 4 + %.splatinsert22 = insertelement <4 x float> undef, float %7, i32 0 + %.splat23 = shufflevector <4 x float> %.splatinsert22, <4 x float> undef, <4 x i32> zeroinitializer + %26 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %25, <4 x float> %.splat23, <4 x float> %23) + %arrayidx24 = getelementptr inbounds float, float* %pSamples.0220, i32 4 + %27 = bitcast float* %arrayidx24 to <4 x float>* + %28 = load <4 x float>, <4 x float>* %27, align 4 + %.splatinsert25 = insertelement <4 x float> undef, float %8, i32 0 + %.splat26 = shufflevector <4 x float> %.splatinsert25, <4 x float> undef, <4 x i32> zeroinitializer + %29 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %28, <4 x float> %.splat26, <4 x float> %26) + %arrayidx27 = getelementptr inbounds float, float* %pSamples.0220, i32 5 + %30 = bitcast float* %arrayidx27 to <4 x float>* + %31 = load <4 x float>, <4 x float>* %30, align 4 + %.splatinsert28 = insertelement <4 x float> undef, float %9, i32 0 + %.splat29 = shufflevector <4 x float> %.splatinsert28, <4 x float> undef, <4 x i32> zeroinitializer + %32 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %31, <4 x float> %.splat29, <4 x float> %29) + %arrayidx30 = getelementptr inbounds float, float* %pSamples.0220, i32 6 + %33 = bitcast float* %arrayidx30 to <4 x float>* + %34 = load <4 x float>, <4 x float>* %33, align 4 + %.splatinsert31 = insertelement <4 x float> undef, float %10, i32 0 + %.splat32 = shufflevector <4 x float> %.splatinsert31, <4 x float> undef, <4 x i32> zeroinitializer + %35 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %34, <4 x float> %.splat32, <4 x float> %32) + %arrayidx33 = getelementptr inbounds float, float* %pSamples.0220, i32 7 + %36 = bitcast float* %arrayidx33 to <4 x float>* + %37 = load <4 x float>, <4 x float>* %36, align 4 + %.splatinsert34 = insertelement <4 x float> undef, float %11, i32 0 + %.splat35 = shufflevector <4 x float> %.splatinsert34, <4 x float> undef, <4 x i32> zeroinitializer + %38 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %37, <4 x float> %.splat35, <4 x float> %35) + %pSamples.1200 = getelementptr inbounds float, float* %pSamples.0220, i32 8 + br i1 %cmp38201, label %for.body, label %for.end + +for.body: ; preds = %while.body, %for.body + %pSamples.1207 = phi float* [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ] + %pCoeffsCur.0206 = phi float* [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ] + %.pn205 = phi float* [ %pCoeffsCur.0206, %for.body ], [ %1, %while.body ] + %i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ] + %vecAcc0.0203 = phi <4 x float> [ %70, %for.body ], [ %38, %while.body ] + %pSamples.0.pn202 = phi float* [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ] + %incdec.ptr40 = getelementptr inbounds float, float* %.pn205, i32 9 + %39 = load float, float* %pCoeffsCur.0206, align 4 + %incdec.ptr41 = getelementptr inbounds float, float* %.pn205, i32 10 + %40 = load float, float* %incdec.ptr40, align 4 + %incdec.ptr42 = getelementptr inbounds float, float* %.pn205, i32 11 + %41 = load float, float* %incdec.ptr41, align 4 + %incdec.ptr43 = getelementptr inbounds float, float* %.pn205, i32 12 + %42 = load float, float* %incdec.ptr42, align 4 + %incdec.ptr44 = getelementptr inbounds float, float* %.pn205, i32 13 + %43 = load float, float* %incdec.ptr43, align 4 + %incdec.ptr45 = getelementptr inbounds float, float* %.pn205, i32 14 + %44 = load float, float* %incdec.ptr44, align 4 + %incdec.ptr46 = 
getelementptr inbounds float, float* %.pn205, i32 15 + %45 = load float, float* %incdec.ptr45, align 4 + %46 = load float, float* %incdec.ptr46, align 4 + %47 = bitcast float* %pSamples.1207 to <4 x float>* + %48 = load <4 x float>, <4 x float>* %47, align 4 + %.splatinsert48 = insertelement <4 x float> undef, float %39, i32 0 + %.splat49 = shufflevector <4 x float> %.splatinsert48, <4 x float> undef, <4 x i32> zeroinitializer + %49 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %48, <4 x float> %.splat49, <4 x float> %vecAcc0.0203) + %arrayidx50 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 9 + %50 = bitcast float* %arrayidx50 to <4 x float>* + %51 = load <4 x float>, <4 x float>* %50, align 4 + %.splatinsert51 = insertelement <4 x float> undef, float %40, i32 0 + %.splat52 = shufflevector <4 x float> %.splatinsert51, <4 x float> undef, <4 x i32> zeroinitializer + %52 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %51, <4 x float> %.splat52, <4 x float> %49) + %arrayidx53 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 10 + %53 = bitcast float* %arrayidx53 to <4 x float>* + %54 = load <4 x float>, <4 x float>* %53, align 4 + %.splatinsert54 = insertelement <4 x float> undef, float %41, i32 0 + %.splat55 = shufflevector <4 x float> %.splatinsert54, <4 x float> undef, <4 x i32> zeroinitializer + %55 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %54, <4 x float> %.splat55, <4 x float> %52) + %arrayidx56 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 11 + %56 = bitcast float* %arrayidx56 to <4 x float>* + %57 = load <4 x float>, <4 x float>* %56, align 4 + %.splatinsert57 = insertelement <4 x float> undef, float %42, i32 0 + %.splat58 = shufflevector <4 x float> %.splatinsert57, <4 x float> undef, <4 x i32> zeroinitializer + %58 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %57, <4 x float> %.splat58, <4 x float> %55) + %arrayidx59 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 12 + %59 = bitcast float* %arrayidx59 to <4 x float>* + %60 = load <4 x float>, <4 x float>* %59, align 4 + %.splatinsert60 = insertelement <4 x float> undef, float %43, i32 0 + %.splat61 = shufflevector <4 x float> %.splatinsert60, <4 x float> undef, <4 x i32> zeroinitializer + %61 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %60, <4 x float> %.splat61, <4 x float> %58) + %arrayidx62 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 13 + %62 = bitcast float* %arrayidx62 to <4 x float>* + %63 = load <4 x float>, <4 x float>* %62, align 4 + %.splatinsert63 = insertelement <4 x float> undef, float %44, i32 0 + %.splat64 = shufflevector <4 x float> %.splatinsert63, <4 x float> undef, <4 x i32> zeroinitializer + %64 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %63, <4 x float> %.splat64, <4 x float> %61) + %arrayidx65 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 14 + %65 = bitcast float* %arrayidx65 to <4 x float>* + %66 = load <4 x float>, <4 x float>* %65, align 4 + %.splatinsert66 = insertelement <4 x float> undef, float %45, i32 0 + %.splat67 = shufflevector <4 x float> %.splatinsert66, <4 x float> undef, <4 x i32> zeroinitializer + %67 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %66, <4 x float> %.splat67, <4 x float> %64) + %arrayidx68 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 15 + %68 = bitcast float* %arrayidx68 to <4 x float>* + %69 = load <4 x float>, <4 x float>* %68, align 4 + %.splatinsert69 = insertelement <4 x float> undef, 
float %46, i32 0 + %.splat70 = shufflevector <4 x float> %.splatinsert69, <4 x float> undef, <4 x i32> zeroinitializer + %70 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %69, <4 x float> %.splat70, <4 x float> %67) + %inc = add nuw nsw i32 %i.0204, 1 + %pCoeffsCur.0 = getelementptr inbounds float, float* %pCoeffsCur.0206, i32 8 + %pSamples.1 = getelementptr inbounds float, float* %pSamples.1207, i32 8 + %exitcond = icmp eq i32 %inc, %smax + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %while.body + %vecAcc0.0.lcssa = phi <4 x float> [ %38, %while.body ], [ %70, %for.body ] + %pCoeffsCur.0.lcssa = phi float* [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ] + %pSamples.1.lcssa = phi float* [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ] + br i1 %cmp74210, label %while.end, label %while.body76 + +while.body76: ; preds = %for.end, %while.body76 + %pCoeffsCur.1214 = phi float* [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ] + %vecAcc0.1213 = phi <4 x float> [ %74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ] + %numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ] + %pSamples.2211 = phi float* [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ] + %incdec.ptr77 = getelementptr inbounds float, float* %pCoeffsCur.1214, i32 1 + %71 = load float, float* %pCoeffsCur.1214, align 4 + %72 = bitcast float* %pSamples.2211 to <4 x float>* + %73 = load <4 x float>, <4 x float>* %72, align 4 + %.splatinsert78 = insertelement <4 x float> undef, float %71, i32 0 + %.splat79 = shufflevector <4 x float> %.splatinsert78, <4 x float> undef, <4 x i32> zeroinitializer + %74 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %73, <4 x float> %.splat79, <4 x float> %vecAcc0.1213) + %incdec.ptr80 = getelementptr inbounds float, float* %pSamples.2211, i32 1 + %dec = add nsw i32 %numCnt.0212, -1 + %cmp74 = icmp sgt i32 %numCnt.0212, 1 + br i1 %cmp74, label %while.body76, label %while.end.loopexit + +while.end.loopexit: ; preds = %while.body76 + %scevgep = getelementptr float, float* %pSamples.1.lcssa, i32 %and + br label %while.end + +while.end: ; preds = %while.end.loopexit, %for.end + %pSamples.2.lcssa = phi float* [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ] + %vecAcc0.1.lcssa = phi <4 x float> [ %vecAcc0.0.lcssa, %for.end ], [ %74, %while.end.loopexit ] + %75 = bitcast float* %pOutput.0218 to <4 x float>* + store <4 x float> %vecAcc0.1.lcssa, <4 x float>* %75, align 4 + %add.ptr81 = getelementptr inbounds float, float* %pOutput.0218, i32 4 + %add.ptr82 = getelementptr inbounds float, float* %pSamples.2.lcssa, i32 4 + %add.ptr83 = getelementptr inbounds float, float* %add.ptr82, i32 %idx.neg + %dec84 = add nsw i32 %blkCnt.0222, -1 + %cmp5 = icmp eq i32 %dec84, 0 + br i1 %cmp5, label %if.end, label %while.body + +if.end: ; preds = %while.end, %if.then, %entry + ret void +} + +declare void @llvm.assume(i1) +declare <4 x i1> @llvm.arm.mve.vctp32(i32) +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) +declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-fmas.ll b/llvm/test/CodeGen/Thumb2/mve-fmas.ll index a65b663b3311f..789e06a011462 100644 --- a/llvm/test/CodeGen/Thumb2/mve-fmas.ll +++ b/llvm/test/CodeGen/Thumb2/mve-fmas.ll @@ -199,7 +199,7 @@ define arm_aapcs_vfpcc <8 x half> @vfmar16(<8 x half> %src1, <8 x half> %src2, f ; CHECK-MVE-FP-LABEL: vfmar16: ; 
CHECK-MVE-FP: @ %bb.0: @ %entry ; CHECK-MVE-FP-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-MVE-FP-NEXT: vmov r0, s8 +; CHECK-MVE-FP-NEXT: vmov.f16 r0, s8 ; CHECK-MVE-FP-NEXT: vmul.f16 q1, q1, r0 ; CHECK-MVE-FP-NEXT: vadd.f16 q0, q0, q1 ; CHECK-MVE-FP-NEXT: bx lr @@ -207,7 +207,7 @@ define arm_aapcs_vfpcc <8 x half> @vfmar16(<8 x half> %src1, <8 x half> %src2, f ; CHECK-MVE-VMLA-LABEL: vfmar16: ; CHECK-MVE-VMLA: @ %bb.0: @ %entry ; CHECK-MVE-VMLA-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-MVE-VMLA-NEXT: vmov r0, s8 +; CHECK-MVE-VMLA-NEXT: vmov.f16 r0, s8 ; CHECK-MVE-VMLA-NEXT: vfma.f16 q0, q1, r0 ; CHECK-MVE-VMLA-NEXT: bx lr ; @@ -266,14 +266,14 @@ define arm_aapcs_vfpcc <8 x half> @vfma16(<8 x half> %src1, <8 x half> %src2, fl ; CHECK-MVE-FP: @ %bb.0: @ %entry ; CHECK-MVE-FP-NEXT: vcvtb.f16.f32 s8, s8 ; CHECK-MVE-FP-NEXT: vmul.f16 q0, q0, q1 -; CHECK-MVE-FP-NEXT: vmov r0, s8 +; CHECK-MVE-FP-NEXT: vmov.f16 r0, s8 ; CHECK-MVE-FP-NEXT: vadd.f16 q0, q0, r0 ; CHECK-MVE-FP-NEXT: bx lr ; ; CHECK-MVE-VMLA-LABEL: vfma16: ; CHECK-MVE-VMLA: @ %bb.0: @ %entry ; CHECK-MVE-VMLA-NEXT: vcvtb.f16.f32 s8, s8 -; CHECK-MVE-VMLA-NEXT: vmov r0, s8 +; CHECK-MVE-VMLA-NEXT: vmov.f16 r0, s8 ; CHECK-MVE-VMLA-NEXT: vfmas.f16 q0, q1, r0 ; CHECK-MVE-VMLA-NEXT: bx lr ; @@ -437,8 +437,8 @@ entry: define arm_aapcs_vfpcc <4 x float> @vfmas32(<4 x float> %src1, <4 x float> %src2, float %src3) { ; CHECK-MVE-FP-LABEL: vfmas32: ; CHECK-MVE-FP: @ %bb.0: @ %entry -; CHECK-MVE-FP-NEXT: vmul.f32 q0, q0, q1 ; CHECK-MVE-FP-NEXT: vmov r0, s8 +; CHECK-MVE-FP-NEXT: vmul.f32 q0, q0, q1 ; CHECK-MVE-FP-NEXT: vadd.f32 q0, q0, r0 ; CHECK-MVE-FP-NEXT: bx lr ; diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/dup.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/dup.ll index 94e65c4f249f2..530ef11f871a7 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/dup.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/dup.ll @@ -4,7 +4,7 @@ define arm_aapcs_vfpcc <8 x half> @test_vdupq_n_f16(float %a.coerce) { ; CHECK-LABEL: test_vdupq_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.f16 r0, s0 ; CHECK-NEXT: vdup.16 q0, r0 ; CHECK-NEXT: bx lr entry: @@ -97,7 +97,7 @@ entry: define arm_aapcs_vfpcc <8 x half> @test_vdupq_m_n_f16(<8 x half> %inactive, float %a.coerce, i16 zeroext %p) { ; CHECK-LABEL: test_vdupq_m_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.f16 r1, s4 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vdupt.16 q0, r1 @@ -117,10 +117,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @test_vdupq_m_n_f32(<4 x float> %inactive, float %a, i16 zeroext %p) { ; CHECK-LABEL: test_vdupq_m_n_f32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vpst -; CHECK-NEXT: vdupt.32 q0, r0 +; CHECK-NEXT: vdupt.32 q0, r1 ; CHECK-NEXT: bx lr entry: %0 = zext i16 %p to i32 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll index fdad64e962e6e..73c2707e7af78 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/ternary.ll @@ -24,7 +24,7 @@ entry: define arm_aapcs_vfpcc <8 x half> @test_vfmaq_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce) { ; CHECK-LABEL: test_vfmaq_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.f16 r0, s8 ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: bx lr entry: @@ -53,7 +53,7 @@ entry: define arm_aapcs_vfpcc <8 x half> @test_vfmasq_n_f16(<8 x half> %a, <8 x 
half> %b, float %c.coerce) { ; CHECK-LABEL: test_vfmasq_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.f16 r0, s8 ; CHECK-NEXT: vfmas.f16 q0, q1, r0 ; CHECK-NEXT: bx lr entry: @@ -295,6 +295,38 @@ entry: ret <4 x i32> %0 } +define arm_aapcs_vfpcc <16 x i8> @test_vqdmlashq_n_s8(<16 x i8> %m1, <16 x i8> %m2, i8 signext %add) { +; CHECK-LABEL: test_vqdmlashq_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmlash.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %add to i32 + %1 = tail call <16 x i8> @llvm.arm.mve.vqdmlash.v16i8(<16 x i8> %m1, <16 x i8> %m2, i32 %0) + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmlashq_n_s16(<8 x i16> %m1, <8 x i16> %m2, i16 signext %add) { +; CHECK-LABEL: test_vqdmlashq_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmlash.s16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %add to i32 + %1 = tail call <8 x i16> @llvm.arm.mve.vqdmlash.v8i16(<8 x i16> %m1, <8 x i16> %m2, i32 %0) + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmlashq_n_s32(<4 x i32> %m1, <4 x i32> %m2, i32 %add) { +; CHECK-LABEL: test_vqdmlashq_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vqdmlash.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x i32> @llvm.arm.mve.vqdmlash.v4i32(<4 x i32> %m1, <4 x i32> %m2, i32 %add) + ret <4 x i32> %0 +} + define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlahq_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c) { ; CHECK-LABEL: test_vqrdmlahq_n_s8: ; CHECK: @ %bb.0: @ %entry @@ -390,7 +422,7 @@ entry: define arm_aapcs_vfpcc <8 x half> @test_vfmaq_m_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce, i16 zeroext %p) { ; CHECK-LABEL: test_vfmaq_m_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov.f16 r1, s8 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f16 q0, q1, r1 @@ -410,10 +442,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @test_vfmaq_m_n_f32(<4 x float> %a, <4 x float> %b, float %c, i16 zeroext %p) { ; CHECK-LABEL: test_vfmaq_m_n_f32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vpst -; CHECK-NEXT: vfmat.f32 q0, q1, r0 +; CHECK-NEXT: vfmat.f32 q0, q1, r1 ; CHECK-NEXT: bx lr entry: %.splatinsert = insertelement <4 x float> undef, float %c, i32 0 @@ -427,7 +459,7 @@ entry: define arm_aapcs_vfpcc <8 x half> @test_vfmasq_m_n_f16(<8 x half> %a, <8 x half> %b, float %c.coerce, i16 zeroext %p) { ; CHECK-LABEL: test_vfmasq_m_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov.f16 r1, s8 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmast.f16 q0, q1, r1 @@ -447,10 +479,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @test_vfmasq_m_n_f32(<4 x float> %a, <4 x float> %b, float %c, i16 zeroext %p) { ; CHECK-LABEL: test_vfmasq_m_n_f32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vpst -; CHECK-NEXT: vfmast.f32 q0, q1, r0 +; CHECK-NEXT: vfmast.f32 q0, q1, r1 ; CHECK-NEXT: bx lr entry: %.splatinsert = insertelement <4 x float> undef, float %c, i32 0 @@ -711,6 +743,50 @@ entry: ret <4 x i32> %2 } +define arm_aapcs_vfpcc <16 x i8> @test_vqdmlashq_m_n_s8(<16 x i8> %m1, <16 x i8> %m2, i8 signext %add, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmlashq_m_n_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmlasht.s8 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %add to i32 + 
%1 = zext i16 %p to i32 + %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = tail call <16 x i8> @llvm.arm.mve.vqdmlash.predicated.v16i8.v16i1(<16 x i8> %m1, <16 x i8> %m2, i32 %0, <16 x i1> %2) + ret <16 x i8> %3 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmlashq_m_n_s16(<8 x i16> %m1, <8 x i16> %m2, i16 signext %add, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmlashq_m_n_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmlasht.s16 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %add to i32 + %1 = zext i16 %p to i32 + %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = tail call <8 x i16> @llvm.arm.mve.vqdmlash.predicated.v8i16.v8i1(<8 x i16> %m1, <8 x i16> %m2, i32 %0, <8 x i1> %2) + ret <8 x i16> %3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmlashq_m_n_s32(<4 x i32> %m1, <4 x i32> %m2, i32 %add, i16 zeroext %p) { +; CHECK-LABEL: test_vqdmlashq_m_n_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vqdmlasht.s32 q0, q1, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.vqdmlash.predicated.v4i32.v4i1(<4 x i32> %m1, <4 x i32> %m2, i32 %add, <4 x i1> %1) + ret <4 x i32> %2 +} + define arm_aapcs_vfpcc <16 x i8> @test_vqrdmlahq_m_n_s8(<16 x i8> %a, <16 x i8> %b, i8 signext %c, i16 zeroext %p) { ; CHECK-LABEL: test_vqrdmlahq_m_n_s8: ; CHECK: @ %bb.0: @ %entry @@ -816,6 +892,9 @@ declare <4 x i32> @llvm.arm.mve.vmlas.n.predicated.v4i32.v4i1(<4 x i32>, <4 x i3 declare <16 x i8> @llvm.arm.mve.vqdmlah.v16i8(<16 x i8>, <16 x i8>, i32) declare <8 x i16> @llvm.arm.mve.vqdmlah.v8i16(<8 x i16>, <8 x i16>, i32) declare <4 x i32> @llvm.arm.mve.vqdmlah.v4i32(<4 x i32>, <4 x i32>, i32) +declare <16 x i8> @llvm.arm.mve.vqdmlash.v16i8(<16 x i8>, <16 x i8>, i32) +declare <8 x i16> @llvm.arm.mve.vqdmlash.v8i16(<8 x i16>, <8 x i16>, i32) +declare <4 x i32> @llvm.arm.mve.vqdmlash.v4i32(<4 x i32>, <4 x i32>, i32) declare <16 x i8> @llvm.arm.mve.vqrdmlah.v16i8(<16 x i8>, <16 x i8>, i32) declare <8 x i16> @llvm.arm.mve.vqrdmlah.v8i16(<8 x i16>, <8 x i16>, i32) declare <4 x i32> @llvm.arm.mve.vqrdmlah.v4i32(<4 x i32>, <4 x i32>, i32) @@ -825,6 +904,9 @@ declare <4 x i32> @llvm.arm.mve.vqrdmlash.v4i32(<4 x i32>, <4 x i32>, i32) declare <16 x i8> @llvm.arm.mve.vqdmlah.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>) declare <8 x i16> @llvm.arm.mve.vqdmlah.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>) declare <4 x i32> @llvm.arm.mve.vqdmlah.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>) +declare <16 x i8> @llvm.arm.mve.vqdmlash.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>) +declare <8 x i16> @llvm.arm.mve.vqdmlash.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>) +declare <4 x i32> @llvm.arm.mve.vqdmlash.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>) declare <16 x i8> @llvm.arm.mve.vqrdmlah.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, i32, <16 x i1>) declare <8 x i16> @llvm.arm.mve.vqrdmlah.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, i32, <8 x i1>) declare <4 x i32> @llvm.arm.mve.vqrdmlah.predicated.v4i32.v4i1(<4 x i32>, <4 x i32>, i32, <4 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll index 03e64cc06641e..97bc14969a095 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddq.ll @@ -106,7 +106,7 @@ entry: define arm_aapcs_vfpcc <8 x half> @test_vaddq_n_f16(<8 x half> %a, float %b.coerce) { ; CHECK-LABEL: test_vaddq_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.f16 r0, s4 ; CHECK-NEXT: vadd.f16 q0, q0, r0 ; CHECK-NEXT: bx lr entry: @@ -138,10 +138,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @test_vaddq_m_n_f32(<4 x float> %inactive, <4 x float> %a, float %b, i16 zeroext %p) { ; CHECK-LABEL: test_vaddq_m_n_f32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vpst -; CHECK-NEXT: vaddt.f32 q0, q1, r0 +; CHECK-NEXT: vaddt.f32 q0, q1, r1 ; CHECK-NEXT: bx lr entry: %.splatinsert = insertelement <4 x float> undef, float %b, i32 0 @@ -171,7 +171,7 @@ entry: define arm_aapcs_vfpcc <8 x half> @test_vaddq_x_n_f16(<8 x half> %a, float %b.coerce, i16 zeroext %p) { ; CHECK-LABEL: test_vaddq_x_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.f16 r1, s4 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vaddt.f16 q0, q0, r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddv.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddv.ll new file mode 100644 index 0000000000000..12db8d95a3274 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vaddv.ll @@ -0,0 +1,416 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc i32 @test_vaddvq_s8(<16 x i8> %a) { +; CHECK-LABEL: test_vaddvq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddv.s8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> %a, i32 0) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_s16(<8 x i16> %a) { +; CHECK-LABEL: test_vaddvq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddv.s16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> %a, i32 0) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_s32(<4 x i32> %a) { +; CHECK-LABEL: test_vaddvq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddv.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> %a, i32 0) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_u8(<16 x i8> %a) { +; CHECK-LABEL: test_vaddvq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddv.u8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> %a, i32 1) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_u16(<8 x i16> %a) { +; CHECK-LABEL: test_vaddvq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddv.u16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> %a, i32 1) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_u32(<4 x i32> %a) { +; CHECK-LABEL: test_vaddvq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddv.u32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> %a, i32 1) + ret i32 %0 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_s8(i32 %a, <16 x i8> %b) { +; CHECK-LABEL: test_vaddvaq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddva.s8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> %b, i32 0) + %1 = add i32 %0, %a + ret i32 %1 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_s16(i32 %a, <8 x i16> %b) { 
+; CHECK-LABEL: test_vaddvaq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddva.s16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> %b, i32 0) + %1 = add i32 %0, %a + ret i32 %1 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_s32(i32 %a, <4 x i32> %b) { +; CHECK-LABEL: test_vaddvaq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddva.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> %b, i32 0) + %1 = add i32 %0, %a + ret i32 %1 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_u8(i32 %a, <16 x i8> %b) { +; CHECK-LABEL: test_vaddvaq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddva.u8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v16i8(<16 x i8> %b, i32 1) + %1 = add i32 %0, %a + ret i32 %1 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_u16(i32 %a, <8 x i16> %b) { +; CHECK-LABEL: test_vaddvaq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddva.u16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v8i16(<8 x i16> %b, i32 1) + %1 = add i32 %0, %a + ret i32 %1 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_u32(i32 %a, <4 x i32> %b) { +; CHECK-LABEL: test_vaddvaq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddva.u32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.addv.v4i32(<4 x i32> %b, i32 1) + %1 = add i32 %0, %a + ret i32 %1 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_p_s8(<16 x i8> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvt.s8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> %a, i32 0, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_p_s16(<8 x i16> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvt.s16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> %a, i32 0, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_p_s32(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvt.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %a, i32 0, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_p_u8(<16 x i8> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvq_p_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvt.u8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> %a, i32 1, <16 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_p_u16(<8 x i16> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvq_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvt.u16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> 
@llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> %a, i32 1, <8 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vaddvq_p_u32(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvt.u32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %a, i32 1, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_p_s8(i32 %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvaq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvat.s8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> %b, i32 0, <16 x i1> %1) + %3 = add i32 %2, %a + ret i32 %3 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_p_s16(i32 %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvaq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvat.s16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> %b, i32 0, <8 x i1> %1) + %3 = add i32 %2, %a + ret i32 %3 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvaq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvat.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %b, i32 0, <4 x i1> %1) + %3 = add i32 %2, %a + ret i32 %3 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_p_u8(i32 %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvaq_p_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvat.u8 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8> %b, i32 1, <16 x i1> %1) + %3 = add i32 %2, %a + ret i32 %3 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_p_u16(i32 %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvaq_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvat.u16 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16> %b, i32 1, <8 x i1> %1) + %3 = add i32 %2, %a + ret i32 %3 +} + +define arm_aapcs_vfpcc i32 @test_vaddvaq_p_u32(i32 %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddvaq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddvat.u32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32> %b, i32 1, <4 x i1> %1) + %3 = add i32 %2, %a + ret i32 %3 +} + +define arm_aapcs_vfpcc i64 
@test_vaddlvq_s32(<4 x i32> %a) { +; CHECK-LABEL: test_vaddlvq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddlv.s32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> %a, i32 0) + ret i64 %0 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvq_u32(<4 x i32> %a) { +; CHECK-LABEL: test_vaddlvq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddlv.u32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> %a, i32 1) + ret i64 %0 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvaq_s32(i64 %a, <4 x i32> %b) { +; CHECK-LABEL: test_vaddlvaq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddlva.s32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> %b, i32 0) + %1 = add i64 %0, %a + ret i64 %1 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvaq_u32(i64 %a, <4 x i32> %b) { +; CHECK-LABEL: test_vaddlvaq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vaddlva.u32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i64 @llvm.arm.mve.addlv.v4i32(<4 x i32> %b, i32 1) + %1 = add i64 %0, %a + ret i64 %1 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvq_p_s32(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddlvq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddlvt.s32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> %a, i32 0, <4 x i1> %1) + ret i64 %2 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvq_p_u32(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vaddlvq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddlvt.u32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> %a, i32 1, <4 x i1> %1) + ret i64 %2 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvaq_p_s32(i64 %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddlvaq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddlvat.s32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> %b, i32 0, <4 x i1> %1) + %3 = add i64 %2, %a + ret i64 %3 +} + +define arm_aapcs_vfpcc i64 @test_vaddlvaq_p_u32(i64 %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vaddlvaq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vpst +; CHECK-NEXT: vaddlvat.u32 r0, r1, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32> %b, i32 1, <4 x i1> %1) + %3 = add i64 %2, %a + ret i64 %3 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) + +declare i32 @llvm.arm.mve.addv.v16i8(<16 x i8>, i32) +declare i32 @llvm.arm.mve.addv.v8i16(<8 x i16>, i32) +declare i32 @llvm.arm.mve.addv.v4i32(<4 x i32>, i32) +declare i64 @llvm.arm.mve.addlv.v4i32(<4 x i32>, i32) + +declare i32 @llvm.arm.mve.addv.predicated.v16i8.v16i1(<16 x i8>, i32, <16 x i1>) +declare i32 
@llvm.arm.mve.addv.predicated.v8i16.v8i1(<8 x i16>, i32, <8 x i1>) +declare i32 @llvm.arm.mve.addv.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) +declare i64 @llvm.arm.mve.addlv.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll index a7c37802065f3..fd1daef4b9ec8 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vminvq.ll @@ -1,36 +1,865 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s +define arm_aapcs_vfpcc signext i8 @test_vminvq_s8(i8 signext %a, <16 x i8> %b) { +; CHECK-LABEL: test_vminvq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminv.s8 r0, q0 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %a to i32 + %1 = tail call i32 @llvm.arm.mve.minv.v16i8(i32 %0, <16 x i8> %b, i32 0) + %2 = trunc i32 %1 to i8 + ret i8 %2 +} + +define arm_aapcs_vfpcc signext i16 @test_vminvq_s16(i16 signext %a, <8 x i16> %b) { +; CHECK-LABEL: test_vminvq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminv.s16 r0, q0 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %a to i32 + %1 = tail call i32 @llvm.arm.mve.minv.v8i16(i32 %0, <8 x i16> %b, i32 0) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc i32 @test_vminvq_s32(i32 %a, <4 x i32> %b) { +; CHECK-LABEL: test_vminvq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminv.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.minv.v4i32(i32 %a, <4 x i32> %b, i32 0) + ret i32 %0 +} + +define arm_aapcs_vfpcc zeroext i8 @test_vminvq_u8(i8 zeroext %a, <16 x i8> %b) { +; CHECK-LABEL: test_vminvq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminv.u8 r0, q0 +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %a to i32 + %1 = tail call i32 @llvm.arm.mve.minv.v16i8(i32 %0, <16 x i8> %b, i32 1) + %2 = trunc i32 %1 to i8 + ret i8 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vminvq_u16(i16 zeroext %a, <8 x i16> %b) { +; CHECK-LABEL: test_vminvq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminv.u16 r0, q0 +; CHECK-NEXT: uxth r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %a to i32 + %1 = tail call i32 @llvm.arm.mve.minv.v8i16(i32 %0, <8 x i16> %b, i32 1) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + define arm_aapcs_vfpcc i32 @test_vminvq_u32(i32 %a, <4 x i32> %b) { ; CHECK-LABEL: test_vminvq_u32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vminv.u32 r0, q0 ; CHECK-NEXT: bx lr entry: - %0 = tail call i32 @llvm.arm.mve.minv.u.v4i32(i32 %a, <4 x i32> %b) + %0 = tail call i32 @llvm.arm.mve.minv.v4i32(i32 %a, <4 x i32> %b, i32 1) + ret i32 %0 +} + +define arm_aapcs_vfpcc signext i8 @test_vmaxvq_s8(i8 signext %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmaxvq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxv.s8 r0, q0 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %a to i32 + %1 = tail call i32 @llvm.arm.mve.maxv.v16i8(i32 %0, <16 x i8> %b, i32 0) + %2 = trunc i32 %1 to i8 + ret i8 %2 +} + +define arm_aapcs_vfpcc signext i16 @test_vmaxvq_s16(i16 signext %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmaxvq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxv.s16 r0, q0 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %a to i32 + %1 = tail call i32 @llvm.arm.mve.maxv.v8i16(i32 %0, <8 x i16> %b, i32 0) + %2 = trunc i32 %1 
to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmaxvq_s32(i32 %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmaxvq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxv.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.maxv.v4i32(i32 %a, <4 x i32> %b, i32 0) ret i32 %0 } -define arm_aapcs_vfpcc i32 @test_vmaxvq_u8(i32 %a, <16 x i8> %b) { +define arm_aapcs_vfpcc zeroext i8 @test_vmaxvq_u8(i8 zeroext %a, <16 x i8> %b) { ; CHECK-LABEL: test_vmaxvq_u8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vmaxv.u8 r0, q0 +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %a to i32 + %1 = tail call i32 @llvm.arm.mve.maxv.v16i8(i32 %0, <16 x i8> %b, i32 1) + %2 = trunc i32 %1 to i8 + ret i8 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vmaxvq_u16(i16 zeroext %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmaxvq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxv.u16 r0, q0 +; CHECK-NEXT: uxth r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %a to i32 + %1 = tail call i32 @llvm.arm.mve.maxv.v8i16(i32 %0, <8 x i16> %b, i32 1) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmaxvq_u32(i32 %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmaxvq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxv.u32 r0, q0 ; CHECK-NEXT: bx lr entry: - %0 = tail call i32 @llvm.arm.mve.maxv.u.v16i8(i32 %a, <16 x i8> %b) + %0 = tail call i32 @llvm.arm.mve.maxv.v4i32(i32 %a, <4 x i32> %b, i32 1) ret i32 %0 } -define arm_aapcs_vfpcc i32 @test_vminvq_s16(i32 %a, <8 x i16> %b) { -; CHECK-LABEL: test_vminvq_s16: +define arm_aapcs_vfpcc zeroext i8 @test_vminavq_s8(i8 zeroext %a, <16 x i8> %b) { +; CHECK-LABEL: test_vminavq_s8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vminv.s16 r0, q0 +; CHECK-NEXT: vminav.s8 r0, q0 +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %a to i32 + %1 = tail call i32 @llvm.arm.mve.minav.v16i8(i32 %0, <16 x i8> %b) + %2 = trunc i32 %1 to i8 + ret i8 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vminavq_s16(i16 zeroext %a, <8 x i16> %b) { +; CHECK-LABEL: test_vminavq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminav.s16 r0, q0 +; CHECK-NEXT: uxth r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %a to i32 + %1 = tail call i32 @llvm.arm.mve.minav.v8i16(i32 %0, <8 x i16> %b) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc i32 @test_vminavq_s32(i32 %a, <4 x i32> %b) { +; CHECK-LABEL: test_vminavq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vminav.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call i32 @llvm.arm.mve.minav.v4i32(i32 %a, <4 x i32> %b) + ret i32 %0 +} + +define arm_aapcs_vfpcc zeroext i8 @test_vmaxavq_s8(i8 zeroext %a, <16 x i8> %b) { +; CHECK-LABEL: test_vmaxavq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxav.s8 r0, q0 +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %a to i32 + %1 = tail call i32 @llvm.arm.mve.maxav.v16i8(i32 %0, <16 x i8> %b) + %2 = trunc i32 %1 to i8 + ret i8 %2 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vmaxavq_s16(i16 zeroext %a, <8 x i16> %b) { +; CHECK-LABEL: test_vmaxavq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxav.s16 r0, q0 +; CHECK-NEXT: uxth r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %a to i32 + %1 = tail call i32 @llvm.arm.mve.maxav.v8i16(i32 %0, <8 x i16> %b) + %2 = trunc i32 %1 to i16 + ret i16 %2 +} + +define arm_aapcs_vfpcc i32 @test_vmaxavq_s32(i32 %a, <4 x i32> %b) { +; CHECK-LABEL: test_vmaxavq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmaxav.s32 r0, q0 ; CHECK-NEXT: 
bx lr entry: - %0 = tail call i32 @llvm.arm.mve.minv.s.v8i16(i32 %a, <8 x i16> %b) + %0 = tail call i32 @llvm.arm.mve.maxav.v4i32(i32 %a, <4 x i32> %b) ret i32 %0 } -declare i32 @llvm.arm.mve.minv.u.v4i32(i32, <4 x i32>) -declare i32 @llvm.arm.mve.maxv.u.v16i8(i32, <16 x i8>) -declare i32 @llvm.arm.mve.minv.s.v8i16(i32, <8 x i16>) +define arm_aapcs_vfpcc float @test_vminnmvq_f16(float %a.coerce, <8 x half> %b) { +; CHECK-LABEL: test_vminnmvq_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vminnmv.f16 r0, q1 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vstr.16 s0, [sp, #2] +; CHECK-NEXT: ldrh.w r0, [sp, #2] +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = tail call half @llvm.arm.mve.minnmv.f16.v8f16(half %1, <8 x half> %b) + %3 = bitcast half %2 to i16 + %tmp2.0.insert.ext = zext i16 %3 to i32 + %4 = bitcast i32 %tmp2.0.insert.ext to float + ret float %4 +} + +define arm_aapcs_vfpcc float @test_vminnmvq_f32(float %a, <4 x float> %b) { +; CHECK-LABEL: test_vminnmvq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vminnmv.f32 r0, q1 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call float @llvm.arm.mve.minnmv.f32.v4f32(float %a, <4 x float> %b) + ret float %0 +} + +define arm_aapcs_vfpcc float @test_vminnmavq_f16(float %a.coerce, <8 x half> %b) { +; CHECK-LABEL: test_vminnmavq_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vminnmav.f16 r0, q1 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vstr.16 s0, [sp, #2] +; CHECK-NEXT: ldrh.w r0, [sp, #2] +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = tail call half @llvm.arm.mve.minnmav.f16.v8f16(half %1, <8 x half> %b) + %3 = bitcast half %2 to i16 + %tmp2.0.insert.ext = zext i16 %3 to i32 + %4 = bitcast i32 %tmp2.0.insert.ext to float + ret float %4 +} + +define arm_aapcs_vfpcc float @test_vminnmavq_f32(float %a, <4 x float> %b) { +; CHECK-LABEL: test_vminnmavq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vminnmav.f32 r0, q1 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call float @llvm.arm.mve.minnmav.f32.v4f32(float %a, <4 x float> %b) + ret float %0 +} + +define arm_aapcs_vfpcc float @test_vmaxnmvq_f16(float %a.coerce, <8 x half> %b) { +; CHECK-LABEL: test_vmaxnmvq_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmaxnmv.f16 r0, q1 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vstr.16 s0, [sp, #2] +; CHECK-NEXT: ldrh.w r0, [sp, #2] +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = tail call half @llvm.arm.mve.maxnmv.f16.v8f16(half %1, <8 x half> %b) + %3 = bitcast half %2 to i16 + %tmp2.0.insert.ext = zext i16 %3 to i32 + %4 = bitcast i32 %tmp2.0.insert.ext to float + ret float %4 +} + +define arm_aapcs_vfpcc float @test_vmaxnmvq_f32(float %a, <4 x float> %b) { +; CHECK-LABEL: test_vmaxnmvq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmaxnmv.f32 r0, q1 +; CHECK-NEXT: 
vmov s0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call float @llvm.arm.mve.maxnmv.f32.v4f32(float %a, <4 x float> %b) + ret float %0 +} + +define arm_aapcs_vfpcc float @test_vmaxnmavq_f16(float %a.coerce, <8 x half> %b) { +; CHECK-LABEL: test_vmaxnmavq_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmaxnmav.f16 r0, q1 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: vstr.16 s0, [sp, #2] +; CHECK-NEXT: ldrh.w r0, [sp, #2] +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = tail call half @llvm.arm.mve.maxnmav.f16.v8f16(half %1, <8 x half> %b) + %3 = bitcast half %2 to i16 + %tmp2.0.insert.ext = zext i16 %3 to i32 + %4 = bitcast i32 %tmp2.0.insert.ext to float + ret float %4 +} + +define arm_aapcs_vfpcc float @test_vmaxnmavq_f32(float %a, <4 x float> %b) { +; CHECK-LABEL: test_vmaxnmavq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmaxnmav.f32 r0, q1 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call float @llvm.arm.mve.maxnmav.f32.v4f32(float %a, <4 x float> %b) + ret float %0 +} + +define arm_aapcs_vfpcc signext i8 @test_vminvq_p_s8(i8 signext %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vminvq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminvt.s8 r0, q0 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %a to i32 + %1 = zext i16 %p to i32 + %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = tail call i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, i32 0, <16 x i1> %2) + %4 = trunc i32 %3 to i8 + ret i8 %4 +} + +define arm_aapcs_vfpcc signext i16 @test_vminvq_p_s16(i16 signext %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vminvq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminvt.s16 r0, q0 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %a to i32 + %1 = zext i16 %p to i32 + %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = tail call i32 @llvm.arm.mve.minv.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, i32 0, <8 x i1> %2) + %4 = trunc i32 %3 to i16 + ret i16 %4 +} + +define arm_aapcs_vfpcc i32 @test_vminvq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vminvq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminvt.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, i32 0, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc zeroext i8 @test_vminvq_p_u8(i8 zeroext %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vminvq_p_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminvt.u8 r0, q0 +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %a to i32 + %1 = zext i16 %p to i32 + %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = tail call i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, i32 1, <16 x i1> %2) + %4 = trunc i32 %3 to i8 + ret i8 %4 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vminvq_p_u16(i16 zeroext %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: 
test_vminvq_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminvt.u16 r0, q0 +; CHECK-NEXT: uxth r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %a to i32 + %1 = zext i16 %p to i32 + %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = tail call i32 @llvm.arm.mve.minv.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, i32 1, <8 x i1> %2) + %4 = trunc i32 %3 to i16 + ret i16 %4 +} + +define arm_aapcs_vfpcc i32 @test_vminvq_p_u32(i32 %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vminvq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminvt.u32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, i32 1, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc signext i8 @test_vmaxvq_p_s8(i8 signext %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmaxvq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxvt.s8 r0, q0 +; CHECK-NEXT: sxtb r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %a to i32 + %1 = zext i16 %p to i32 + %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = tail call i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, i32 0, <16 x i1> %2) + %4 = trunc i32 %3 to i8 + ret i8 %4 +} + +define arm_aapcs_vfpcc signext i16 @test_vmaxvq_p_s16(i16 signext %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmaxvq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxvt.s16 r0, q0 +; CHECK-NEXT: sxth r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %a to i32 + %1 = zext i16 %p to i32 + %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = tail call i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, i32 0, <8 x i1> %2) + %4 = trunc i32 %3 to i16 + ret i16 %4 +} + +define arm_aapcs_vfpcc i32 @test_vmaxvq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmaxvq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxvt.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, i32 0, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc zeroext i8 @test_vmaxvq_p_u8(i8 zeroext %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmaxvq_p_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxvt.u8 r0, q0 +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %a to i32 + %1 = zext i16 %p to i32 + %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = tail call i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, i32 1, <16 x i1> %2) + %4 = trunc i32 %3 to i8 + ret i8 %4 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vmaxvq_p_u16(i16 zeroext %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmaxvq_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxvt.u16 r0, q0 +; CHECK-NEXT: uxth r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %a to i32 + %1 = zext i16 %p to i32 + %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = tail call i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32 
%0, <8 x i16> %b, i32 1, <8 x i1> %2) + %4 = trunc i32 %3 to i16 + ret i16 %4 +} + +define arm_aapcs_vfpcc i32 @test_vmaxvq_p_u32(i32 %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmaxvq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxvt.u32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, i32 1, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc zeroext i8 @test_vminavq_p_s8(i8 zeroext %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vminavq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminavt.s8 r0, q0 +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %a to i32 + %1 = zext i16 %p to i32 + %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = tail call i32 @llvm.arm.mve.minav.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, <16 x i1> %2) + %4 = trunc i32 %3 to i8 + ret i8 %4 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vminavq_p_s16(i16 zeroext %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vminavq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminavt.s16 r0, q0 +; CHECK-NEXT: uxth r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %a to i32 + %1 = zext i16 %p to i32 + %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = tail call i32 @llvm.arm.mve.minav.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, <8 x i1> %2) + %4 = trunc i32 %3 to i16 + ret i16 %4 +} + +define arm_aapcs_vfpcc i32 @test_vminavq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vminavq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminavt.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.minav.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc zeroext i8 @test_vmaxavq_p_s8(i8 zeroext %a, <16 x i8> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmaxavq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxavt.s8 r0, q0 +; CHECK-NEXT: uxtb r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i8 %a to i32 + %1 = zext i16 %p to i32 + %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = tail call i32 @llvm.arm.mve.maxav.predicated.v16i8.v16i1(i32 %0, <16 x i8> %b, <16 x i1> %2) + %4 = trunc i32 %3 to i8 + ret i8 %4 +} + +define arm_aapcs_vfpcc zeroext i16 @test_vmaxavq_p_s16(i16 zeroext %a, <8 x i16> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmaxavq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxavt.s16 r0, q0 +; CHECK-NEXT: uxth r0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %a to i32 + %1 = zext i16 %p to i32 + %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = tail call i32 @llvm.arm.mve.maxav.predicated.v8i16.v8i1(i32 %0, <8 x i16> %b, <8 x i1> %2) + %4 = trunc i32 %3 to i16 + ret i16 %4 +} + +define arm_aapcs_vfpcc i32 @test_vmaxavq_p_s32(i32 %a, <4 x i32> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmaxavq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxavt.s32 r0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = 
tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call i32 @llvm.arm.mve.maxav.predicated.v4i32.v4i1(i32 %a, <4 x i32> %b, <4 x i1> %1) + ret i32 %2 +} + +define arm_aapcs_vfpcc float @test_vminnmvq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vminnmvq_p_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminnmvt.f16 r1, q1 +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vstr.16 s0, [sp, #2] +; CHECK-NEXT: ldrh.w r0, [sp, #2] +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = zext i16 %p to i32 + %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2) + %4 = tail call half @llvm.arm.mve.minnmv.predicated.f16.v8f16.v8i1(half %1, <8 x half> %b, <8 x i1> %3) + %5 = bitcast half %4 to i16 + %tmp2.0.insert.ext = zext i16 %5 to i32 + %6 = bitcast i32 %tmp2.0.insert.ext to float + ret float %6 +} + +define arm_aapcs_vfpcc float @test_vminnmvq_p_f32(float %a, <4 x float> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vminnmvq_p_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminnmvt.f32 r0, q1 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call float @llvm.arm.mve.minnmv.predicated.f32.v4f32.v4i1(float %a, <4 x float> %b, <4 x i1> %1) + ret float %2 +} + +define arm_aapcs_vfpcc float @test_vminnmavq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vminnmavq_p_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminnmavt.f16 r1, q1 +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vstr.16 s0, [sp, #2] +; CHECK-NEXT: ldrh.w r0, [sp, #2] +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = zext i16 %p to i32 + %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2) + %4 = tail call half @llvm.arm.mve.minnmav.predicated.f16.v8f16.v8i1(half %1, <8 x half> %b, <8 x i1> %3) + %5 = bitcast half %4 to i16 + %tmp2.0.insert.ext = zext i16 %5 to i32 + %6 = bitcast i32 %tmp2.0.insert.ext to float + ret float %6 +} + +define arm_aapcs_vfpcc float @test_vminnmavq_p_f32(float %a, <4 x float> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vminnmavq_p_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vminnmavt.f32 r0, q1 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call float @llvm.arm.mve.minnmav.predicated.f32.v4f32.v4i1(float %a, <4 x float> %b, <4 x i1> %1) + ret float %2 +} + +define arm_aapcs_vfpcc float @test_vmaxnmvq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmaxnmvq_p_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxnmvt.f16 r1, q1 +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vstr.16 s0, [sp, #2] +; CHECK-NEXT: ldrh.w r0, [sp, #2] +; 
CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = zext i16 %p to i32 + %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2) + %4 = tail call half @llvm.arm.mve.maxnmv.predicated.f16.v8f16.v8i1(half %1, <8 x half> %b, <8 x i1> %3) + %5 = bitcast half %4 to i16 + %tmp2.0.insert.ext = zext i16 %5 to i32 + %6 = bitcast i32 %tmp2.0.insert.ext to float + ret float %6 +} + +define arm_aapcs_vfpcc float @test_vmaxnmvq_p_f32(float %a, <4 x float> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmaxnmvq_p_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxnmvt.f32 r0, q1 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call float @llvm.arm.mve.maxnmv.predicated.f32.v4f32.v4i1(float %a, <4 x float> %b, <4 x i1> %1) + ret float %2 +} + +define arm_aapcs_vfpcc float @test_vmaxnmavq_p_f16(float %a.coerce, <8 x half> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmaxnmavq_p_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxnmavt.f16 r1, q1 +; CHECK-NEXT: vmov s0, r1 +; CHECK-NEXT: vstr.16 s0, [sp, #2] +; CHECK-NEXT: ldrh.w r0, [sp, #2] +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float %a.coerce to i32 + %tmp.0.extract.trunc = trunc i32 %0 to i16 + %1 = bitcast i16 %tmp.0.extract.trunc to half + %2 = zext i16 %p to i32 + %3 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2) + %4 = tail call half @llvm.arm.mve.maxnmav.predicated.f16.v8f16.v8i1(half %1, <8 x half> %b, <8 x i1> %3) + %5 = bitcast half %4 to i16 + %tmp2.0.insert.ext = zext i16 %5 to i32 + %6 = bitcast i32 %tmp2.0.insert.ext to float + ret float %6 +} + +define arm_aapcs_vfpcc float @test_vmaxnmavq_p_f32(float %a, <4 x float> %b, i16 zeroext %p) { +; CHECK-LABEL: test_vmaxnmavq_p_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmaxnmavt.f32 r0, q1 +; CHECK-NEXT: vmov s0, r0 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call float @llvm.arm.mve.maxnmav.predicated.f32.v4f32.v4i1(float %a, <4 x float> %b, <4 x i1> %1) + ret float %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) + +declare i32 @llvm.arm.mve.minv.v16i8(i32, <16 x i8>, i32) +declare i32 @llvm.arm.mve.minv.v8i16(i32, <8 x i16>, i32) +declare i32 @llvm.arm.mve.minv.v4i32(i32, <4 x i32>, i32) +declare i32 @llvm.arm.mve.maxv.v16i8(i32, <16 x i8>, i32) +declare i32 @llvm.arm.mve.maxv.v8i16(i32, <8 x i16>, i32) +declare i32 @llvm.arm.mve.maxv.v4i32(i32, <4 x i32>, i32) +declare i32 @llvm.arm.mve.minav.v16i8(i32, <16 x i8>) +declare i32 @llvm.arm.mve.minav.v8i16(i32, <8 x i16>) +declare i32 @llvm.arm.mve.minav.v4i32(i32, <4 x i32>) +declare i32 @llvm.arm.mve.maxav.v16i8(i32, <16 x i8>) +declare i32 @llvm.arm.mve.maxav.v8i16(i32, <8 x i16>) +declare i32 @llvm.arm.mve.maxav.v4i32(i32, <4 x i32>) +declare i32 @llvm.arm.mve.minv.predicated.v16i8.v16i1(i32, <16 x i8>, i32, <16 x i1>) +declare i32 
@llvm.arm.mve.minv.predicated.v8i16.v8i1(i32, <8 x i16>, i32, <8 x i1>) +declare i32 @llvm.arm.mve.minv.predicated.v4i32.v4i1(i32, <4 x i32>, i32, <4 x i1>) +declare i32 @llvm.arm.mve.maxv.predicated.v16i8.v16i1(i32, <16 x i8>, i32, <16 x i1>) +declare i32 @llvm.arm.mve.maxv.predicated.v8i16.v8i1(i32, <8 x i16>, i32, <8 x i1>) +declare i32 @llvm.arm.mve.maxv.predicated.v4i32.v4i1(i32, <4 x i32>, i32, <4 x i1>) +declare i32 @llvm.arm.mve.minav.predicated.v16i8.v16i1(i32, <16 x i8>, <16 x i1>) +declare i32 @llvm.arm.mve.minav.predicated.v8i16.v8i1(i32, <8 x i16>, <8 x i1>) +declare i32 @llvm.arm.mve.minav.predicated.v4i32.v4i1(i32, <4 x i32>, <4 x i1>) +declare i32 @llvm.arm.mve.maxav.predicated.v16i8.v16i1(i32, <16 x i8>, <16 x i1>) +declare i32 @llvm.arm.mve.maxav.predicated.v8i16.v8i1(i32, <8 x i16>, <8 x i1>) +declare i32 @llvm.arm.mve.maxav.predicated.v4i32.v4i1(i32, <4 x i32>, <4 x i1>) + +declare half @llvm.arm.mve.minnmv.f16.v8f16(half, <8 x half>) +declare half @llvm.arm.mve.minnmav.f16.v8f16(half, <8 x half>) +declare half @llvm.arm.mve.maxnmv.f16.v8f16(half, <8 x half>) +declare half @llvm.arm.mve.maxnmav.f16.v8f16(half, <8 x half>) +declare half @llvm.arm.mve.minnmv.predicated.f16.v8f16.v8i1(half, <8 x half>, <8 x i1>) +declare half @llvm.arm.mve.minnmav.predicated.f16.v8f16.v8i1(half, <8 x half>, <8 x i1>) +declare half @llvm.arm.mve.maxnmv.predicated.f16.v8f16.v8i1(half, <8 x half>, <8 x i1>) +declare half @llvm.arm.mve.maxnmav.predicated.f16.v8f16.v8i1(half, <8 x half>, <8 x i1>) + +declare float @llvm.arm.mve.minnmv.f32.v4f32(float, <4 x float>) +declare float @llvm.arm.mve.minnmav.f32.v4f32(float, <4 x float>) +declare float @llvm.arm.mve.maxnmv.f32.v4f32(float, <4 x float>) +declare float @llvm.arm.mve.maxnmav.f32.v4f32(float, <4 x float>) +declare float @llvm.arm.mve.minnmv.predicated.f32.v4f32.v4i1(float, <4 x float>, <4 x i1>) +declare float @llvm.arm.mve.minnmav.predicated.f32.v4f32.v4i1(float, <4 x float>, <4 x i1>) +declare float @llvm.arm.mve.maxnmv.predicated.f32.v4f32.v4i1(float, <4 x float>, <4 x i1>) +declare float @llvm.arm.mve.maxnmav.predicated.f32.v4f32.v4i1(float, <4 x float>, <4 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll index a342647e16a22..075e609479a62 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vmulq.ll @@ -269,7 +269,7 @@ entry: define arm_aapcs_vfpcc <8 x half> @test_vmulq_m_n_f16(<8 x half> %inactive, <8 x half> %a, float %b.coerce, i16 zeroext %p) { ; CHECK-LABEL: test_vmulq_m_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov.f16 r1, s8 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vmult.f16 q0, q1, r1 @@ -337,10 +337,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @test_vmulq_x_n_f32(<4 x float> %a, float %b, i16 zeroext %p) { ; CHECK-LABEL: test_vmulq_x_n_f32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s4 ; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vmov r0, s4 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmult.f32 q0, q0, r0 +; CHECK-NEXT: vmult.f32 q0, q0, r1 ; CHECK-NEXT: bx lr entry: %.splatinsert = insertelement <4 x float> undef, float %b, i32 0 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll index 243ff4070af33..e65a84a19c3ba 100644 --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vsubq.ll @@ -106,7 +106,7 @@ entry: define arm_aapcs_vfpcc <8 x half> 
@test_vsubq_n_f16(<8 x half> %a, float %b.coerce) { ; CHECK-LABEL: test_vsubq_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.f16 r0, s4 ; CHECK-NEXT: vsub.f16 q0, q0, r0 ; CHECK-NEXT: bx lr entry: @@ -138,10 +138,10 @@ entry: define arm_aapcs_vfpcc <4 x float> @test_vsubq_m_n_f32(<4 x float> %inactive, <4 x float> %a, float %b, i16 zeroext %p) { ; CHECK-LABEL: test_vsubq_m_n_f32: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r1, s8 ; CHECK-NEXT: vmsr p0, r0 -; CHECK-NEXT: vmov r0, s8 ; CHECK-NEXT: vpst -; CHECK-NEXT: vsubt.f32 q0, q1, r0 +; CHECK-NEXT: vsubt.f32 q0, q1, r1 ; CHECK-NEXT: bx lr entry: %.splatinsert = insertelement <4 x float> undef, float %b, i32 0 @@ -171,7 +171,7 @@ entry: define arm_aapcs_vfpcc <8 x half> @test_vsubq_x_n_f16(<8 x half> %a, float %b.coerce, i16 zeroext %p) { ; CHECK-LABEL: test_vsubq_x_n_f16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.f16 r1, s4 ; CHECK-NEXT: vmsr p0, r0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vsubt.f16 q0, q0, r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll index c94ff1cfd7a62..789b6604b4845 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -133,26 +133,24 @@ define void @fma_tailpred(float* noalias nocapture readonly %A, float* noalias n ; CHECK-NEXT: bic r12, r12, #3 ; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: subs r3, #1 ; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: vdup.32 q1, r3 ; CHECK-NEXT: add.w lr, lr, r12, lsr #2 -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: mov r3, r2 +; CHECK-NEXT: sub.w r12, r3, #1 +; CHECK-NEXT: movs r3, #0 +; CHECK-NEXT: vdup.32 q1, r12 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vdup.32 q2, r12 -; CHECK-NEXT: add.w r12, r12, #4 +; CHECK-NEXT: vdup.32 q2, r3 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vorr q2, q2, q0 ; CHECK-NEXT: vpttt.u32 cs, q1, q2 ; CHECK-NEXT: vldrwt.u32 q2, [r0], #16 ; CHECK-NEXT: vldrwt.u32 q3, [r1], #16 -; CHECK-NEXT: vldrwt.u32 q4, [r3], #16 +; CHECK-NEXT: vldrwt.u32 q4, [r2] ; CHECK-NEXT: vfma.f32 q4, q3, q2 ; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q4, [r2] -; CHECK-NEXT: mov r2, r3 +; CHECK-NEXT: vstrwt.32 q4, [r2], #16 ; CHECK-NEXT: le lr, .LBB1_2 ; CHECK-NEXT: .LBB1_3: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9} diff --git a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll index 3d4779ff4b23d..0fb5efb4bd99a 100644 --- a/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll +++ b/llvm/test/CodeGen/Thumb2/mve-pred-threshold.ll @@ -183,7 +183,6 @@ define arm_aapcs_vfpcc void @thresh_f32(float* %data, i16 zeroext %N, float %T) ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: vneg.f32 s4, s0 ; CHECK-NEXT: mvn r2, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: movs r2, #1 @@ -192,7 +191,7 @@ define arm_aapcs_vfpcc void @thresh_f32(float* %data, i16 zeroext %N, float %T) ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: eor r1, r1, #-2147483648 ; CHECK-NEXT: vdup.32 q1, r1 ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -250,10 +249,10 @@ define arm_aapcs_vfpcc void @thresh_f16(half* %data, i16 zeroext %N, float %T.co ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: add.w 
lr, r2, r1, lsr #3 -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.f16 r1, s0 ; CHECK-NEXT: vneg.f16 s0, s0 ; CHECK-NEXT: vdup.16 q1, r1 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.f16 r2, s0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: .LBB4_1: @ %vector.body @@ -486,7 +485,6 @@ define arm_aapcs_vfpcc void @thresh_rev_f32(float* %data, i16 zeroext %N, float ; CHECK-NEXT: cmp r1, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: vneg.f32 s4, s0 ; CHECK-NEXT: mvn r2, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: movs r2, #1 @@ -495,7 +493,7 @@ define arm_aapcs_vfpcc void @thresh_rev_f32(float* %data, i16 zeroext %N, float ; CHECK-NEXT: vmov r1, s0 ; CHECK-NEXT: vdup.32 q0, r1 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: eor r1, r1, #-2147483648 ; CHECK-NEXT: vdup.32 q1, r1 ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -553,10 +551,10 @@ define arm_aapcs_vfpcc void @thresh_rev_f16(half* %data, i16 zeroext %N, float % ; CHECK-NEXT: movs r2, #1 ; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: add.w lr, r2, r1, lsr #3 -; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.f16 r1, s0 ; CHECK-NEXT: vneg.f16 s0, s0 ; CHECK-NEXT: vdup.16 q1, r1 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.f16 r2, s0 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: .LBB9_1: @ %vector.body diff --git a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll index 79700e046f0ef..c9325c2828939 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vcmpfr.ll @@ -916,8 +916,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oeq_v8f16(<8 x half> %src, half* %src2p, ; ; CHECK-MVEFP-LABEL: vcmp_oeq_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 eq, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -1058,8 +1057,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_one_v8f16(<8 x half> %src, half* %src2p, ; ; CHECK-MVEFP-LABEL: vcmp_one_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vpt.f16 ge, q0, r0 ; CHECK-MVEFP-NEXT: vcmpt.f16 le, q0, r0 ; CHECK-MVEFP-NEXT: vpnot @@ -1186,8 +1184,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ogt_v8f16(<8 x half> %src, half* %src2p, ; ; CHECK-MVEFP-LABEL: vcmp_ogt_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 gt, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -1312,8 +1309,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_oge_v8f16(<8 x half> %src, half* %src2p, ; ; CHECK-MVEFP-LABEL: vcmp_oge_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 ge, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -1438,8 +1434,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_olt_v8f16(<8 x half> %src, half* %src2p, ; ; CHECK-MVEFP-LABEL: vcmp_olt_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 lt, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; 
CHECK-MVEFP-NEXT: bx lr @@ -1564,8 +1559,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ole_v8f16(<8 x half> %src, half* %src2p, ; ; CHECK-MVEFP-LABEL: vcmp_ole_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 le, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -1706,8 +1700,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ueq_v8f16(<8 x half> %src, half* %src2p, ; ; CHECK-MVEFP-LABEL: vcmp_ueq_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vpt.f16 ge, q0, r0 ; CHECK-MVEFP-NEXT: vcmpt.f16 le, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 @@ -1833,8 +1826,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_une_v8f16(<8 x half> %src, half* %src2p, ; ; CHECK-MVEFP-LABEL: vcmp_une_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 ne, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -1959,8 +1951,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, half* %src2p, ; ; CHECK-MVEFP-LABEL: vcmp_ugt_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 le, q0, r0 ; CHECK-MVEFP-NEXT: vpnot ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 @@ -2086,8 +2077,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, half* %src2p, ; ; CHECK-MVEFP-LABEL: vcmp_uge_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 lt, q0, r0 ; CHECK-MVEFP-NEXT: vpnot ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 @@ -2213,8 +2203,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, half* %src2p, ; ; CHECK-MVEFP-LABEL: vcmp_ult_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 ge, q0, r0 ; CHECK-MVEFP-NEXT: vpnot ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 @@ -2340,8 +2329,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, half* %src2p, ; ; CHECK-MVEFP-LABEL: vcmp_ule_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 gt, q0, r0 ; CHECK-MVEFP-NEXT: vpnot ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 @@ -2467,8 +2455,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ord_v8f16(<8 x half> %src, half* %src2p, ; ; CHECK-MVEFP-LABEL: vcmp_ord_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vpt.f16 ge, q0, r0 ; CHECK-MVEFP-NEXT: vcmpt.f16 lt, q0, r0 ; CHECK-MVEFP-NEXT: vpnot @@ -2595,8 +2582,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uno_v8f16(<8 x half> %src, half* %src2p, ; ; CHECK-MVEFP-LABEL: vcmp_uno_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vpt.f16 ge, q0, r0 ; CHECK-MVEFP-NEXT: vcmpt.f16 lt, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 @@ -3527,8 +3513,7 @@ 
define arm_aapcs_vfpcc <8 x half> @vcmp_r_oeq_v8f16(<8 x half> %src, half* %src2 ; ; CHECK-MVEFP-LABEL: vcmp_r_oeq_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 eq, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -3669,8 +3654,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_one_v8f16(<8 x half> %src, half* %src2 ; ; CHECK-MVEFP-LABEL: vcmp_r_one_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vpt.f16 le, q0, r0 ; CHECK-MVEFP-NEXT: vcmpt.f16 ge, q0, r0 ; CHECK-MVEFP-NEXT: vpnot @@ -3797,8 +3781,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ogt_v8f16(<8 x half> %src, half* %src2 ; ; CHECK-MVEFP-LABEL: vcmp_r_ogt_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 lt, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -3923,8 +3906,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_oge_v8f16(<8 x half> %src, half* %src2 ; ; CHECK-MVEFP-LABEL: vcmp_r_oge_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 le, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -4049,8 +4031,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_olt_v8f16(<8 x half> %src, half* %src2 ; ; CHECK-MVEFP-LABEL: vcmp_r_olt_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 gt, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -4175,8 +4156,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ole_v8f16(<8 x half> %src, half* %src2 ; ; CHECK-MVEFP-LABEL: vcmp_r_ole_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 ge, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -4317,8 +4297,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ueq_v8f16(<8 x half> %src, half* %src2 ; ; CHECK-MVEFP-LABEL: vcmp_r_ueq_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vpt.f16 le, q0, r0 ; CHECK-MVEFP-NEXT: vcmpt.f16 ge, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 @@ -4444,8 +4423,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_une_v8f16(<8 x half> %src, half* %src2 ; ; CHECK-MVEFP-LABEL: vcmp_r_une_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 ne, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 ; CHECK-MVEFP-NEXT: bx lr @@ -4570,8 +4548,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ugt_v8f16(<8 x half> %src, half* %src2 ; ; CHECK-MVEFP-LABEL: vcmp_r_ugt_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 ge, q0, r0 ; CHECK-MVEFP-NEXT: vpnot ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 @@ -4697,8 +4674,7 @@ define arm_aapcs_vfpcc <8 x half> 
@vcmp_r_uge_v8f16(<8 x half> %src, half* %src2 ; ; CHECK-MVEFP-LABEL: vcmp_r_uge_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 gt, q0, r0 ; CHECK-MVEFP-NEXT: vpnot ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 @@ -4824,8 +4800,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ult_v8f16(<8 x half> %src, half* %src2 ; ; CHECK-MVEFP-LABEL: vcmp_r_ult_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 le, q0, r0 ; CHECK-MVEFP-NEXT: vpnot ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 @@ -4951,8 +4926,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ule_v8f16(<8 x half> %src, half* %src2 ; ; CHECK-MVEFP-LABEL: vcmp_r_ule_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vcmp.f16 lt, q0, r0 ; CHECK-MVEFP-NEXT: vpnot ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 @@ -5078,8 +5052,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_ord_v8f16(<8 x half> %src, half* %src2 ; ; CHECK-MVEFP-LABEL: vcmp_r_ord_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vpt.f16 le, q0, r0 ; CHECK-MVEFP-NEXT: vcmpt.f16 gt, q0, r0 ; CHECK-MVEFP-NEXT: vpnot @@ -5206,8 +5179,7 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_r_uno_v8f16(<8 x half> %src, half* %src2 ; ; CHECK-MVEFP-LABEL: vcmp_r_uno_v8f16: ; CHECK-MVEFP: @ %bb.0: @ %entry -; CHECK-MVEFP-NEXT: vldr.16 s12, [r0] -; CHECK-MVEFP-NEXT: vmov r0, s12 +; CHECK-MVEFP-NEXT: ldrh r0, [r0] ; CHECK-MVEFP-NEXT: vpt.f16 le, q0, r0 ; CHECK-MVEFP-NEXT: vcmpt.f16 gt, q0, r0 ; CHECK-MVEFP-NEXT: vpsel q0, q1, q2 diff --git a/llvm/test/CodeGen/Thumb2/mve-vdup.ll b/llvm/test/CodeGen/Thumb2/mve-vdup.ll index 3cc9cfd3f442f..829319ba7c075 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vdup.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vdup.ll @@ -82,7 +82,7 @@ define arm_aapcs_vfpcc <8 x half> @vdup_f16(half* %src1, half* %src2) { ; CHECK-NEXT: vldr.16 s0, [r1] ; CHECK-NEXT: vldr.16 s2, [r0] ; CHECK-NEXT: vadd.f16 s0, s2, s0 -; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.f16 r0, s0 ; CHECK-NEXT: vdup.16 q0, r0 ; CHECK-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll index 0bb11cef1f34c..3ebf113a5c9ff 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vldst4.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vldst4.ll @@ -21,7 +21,7 @@ define void @vldst4(half* nocapture readonly %pIn, half* nocapture %pOut, i32 %n ; CHECK-NEXT: subs r2, #8 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r2, lsr #3 -; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.f16 r2, s0 ; CHECK-NEXT: vdup.16 q0, r2 ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: dls lr, lr diff --git a/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll new file mode 100644 index 0000000000000..566e79780a44e --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-vmovnstore.ll @@ -0,0 +1,461 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s + +define arm_aapcs_vfpcc void @vmovn32_trunc1(<4 x i32> %src1, <4 x i32> %src2, <8 x i16> *%dest) { +; CHECK-LABEL: 
vmovn32_trunc1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovnt.i32 q0, q1 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> + %out = trunc <8 x i32> %strided.vec to <8 x i16> + store <8 x i16> %out, <8 x i16> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn32_trunc2(<4 x i32> %src1, <4 x i32> %src2, <8 x i16> *%dest) { +; CHECK-LABEL: vmovn32_trunc2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovnt.i32 q1, q0 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %strided.vec = shufflevector <4 x i32> %src1, <4 x i32> %src2, <8 x i32> + %out = trunc <8 x i32> %strided.vec to <8 x i16> + store <8 x i16> %out, <8 x i16> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn16_trunc1(<8 x i16> %src1, <8 x i16> %src2, <16 x i8> *%dest) { +; CHECK-LABEL: vmovn16_trunc1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovnt.i16 q0, q1 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> + %out = trunc <16 x i16> %strided.vec to <16 x i8> + store <16 x i8> %out, <16 x i8> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn16_trunc2(<8 x i16> %src1, <8 x i16> %src2, <16 x i8> *%dest) { +; CHECK-LABEL: vmovn16_trunc2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovnt.i16 q1, q0 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %strided.vec = shufflevector <8 x i16> %src1, <8 x i16> %src2, <16 x i32> + %out = trunc <16 x i16> %strided.vec to <16 x i8> + store <16 x i8> %out, <16 x i8> *%dest, align 8 + ret void +} + + +define arm_aapcs_vfpcc void @vmovn64_t1(<2 x i64> %src1, <2 x i64> %src2, <2 x i64> *%dest) { +; CHECK-LABEL: vmovn64_t1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s5 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> + store <2 x i64> %out, <2 x i64> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn64_t2(<2 x i64> %src1, <2 x i64> %src2, <2 x i64> *%dest) { +; CHECK-LABEL: vmovn64_t2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s6, s0 +; CHECK-NEXT: vmov.f32 s7, s1 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> + store <2 x i64> %out, <2 x i64> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn64_b1(<2 x i64> %src1, <2 x i64> %src2, <2 x i64> *%dest) { +; CHECK-LABEL: vmovn64_b1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s2, s6 +; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> + store <2 x i64> %out, <2 x i64> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn64_b2(<2 x i64> %src1, <2 x i64> %src2, <2 x i64> *%dest) { +; CHECK-LABEL: vmovn64_b2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s8, s6 +; CHECK-NEXT: vmov.f32 s9, s7 +; CHECK-NEXT: vmov.f32 s10, s0 +; CHECK-NEXT: vmov.f32 s11, s1 +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> + store <2 x i64> %out, <2 x i64> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn64_b3(<2 x i64> %src1, <2 x i64> %src2, <2 x i64> *%dest) { +; CHECK-LABEL: vmovn64_b3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s8, s2 
+; CHECK-NEXT: vmov.f32 s9, s3 +; CHECK-NEXT: vmov.f32 s10, s4 +; CHECK-NEXT: vmov.f32 s11, s5 +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> + store <2 x i64> %out, <2 x i64> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn64_b4(<2 x i64> %src1, <2 x i64> %src2, <2 x i64> *%dest) { +; CHECK-LABEL: vmovn64_b4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s6, s2 +; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <2 x i64> %src1, <2 x i64> %src2, <2 x i32> + store <2 x i64> %out, <2 x i64> *%dest, align 8 + ret void +} + + + +define arm_aapcs_vfpcc void @vmovn32_t1(<4 x i32> %src1, <4 x i32> %src2, <4 x i32> *%dest) { +; CHECK-LABEL: vmovn32_t1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s1, s4 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> + store <4 x i32> %out, <4 x i32> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn32_t2(<4 x i32> %src1, <4 x i32> %src2, <4 x i32> *%dest) { +; CHECK-LABEL: vmovn32_t2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s5, s0 +; CHECK-NEXT: vmov.f32 s7, s2 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> + store <4 x i32> %out, <4 x i32> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn32_b1(<4 x i32> %src1, <4 x i32> %src2, <4 x i32> *%dest) { +; CHECK-LABEL: vmovn32_b1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s1, s5 +; CHECK-NEXT: vmov.f32 s3, s7 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> + store <4 x i32> %out, <4 x i32> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn32_b2(<4 x i32> %src1, <4 x i32> %src2, <4 x i32> *%dest) { +; CHECK-LABEL: vmovn32_b2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s8, s5 +; CHECK-NEXT: vmov.f32 s9, s0 +; CHECK-NEXT: vmov.f32 s10, s7 +; CHECK-NEXT: vmov.f32 s11, s2 +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> + store <4 x i32> %out, <4 x i32> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn32_b3(<4 x i32> %src1, <4 x i32> %src2, <4 x i32> *%dest) { +; CHECK-LABEL: vmovn32_b3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s8, s1 +; CHECK-NEXT: vmov.f32 s9, s4 +; CHECK-NEXT: vmov.f32 s10, s3 +; CHECK-NEXT: vmov.f32 s11, s6 +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> + store <4 x i32> %out, <4 x i32> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn32_b4(<4 x i32> %src1, <4 x i32> %src2, <4 x i32> *%dest) { +; CHECK-LABEL: vmovn32_b4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s5, s1 +; CHECK-NEXT: vmov.f32 s7, s3 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <4 x i32> %src1, <4 x i32> %src2, <4 x i32> + store <4 x i32> %out, <4 x i32> *%dest, align 8 + ret void +} + + + + +define arm_aapcs_vfpcc void @vmovn16_t1(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) { +; CHECK-LABEL: vmovn16_t1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovnt.i32 q0, q1 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector 
<8 x i16> %src1, <8 x i16> %src2, <8 x i32> + store <8 x i16> %out, <8 x i16> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn16_t2(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) { +; CHECK-LABEL: vmovn16_t2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovnt.i32 q1, q0 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> + store <8 x i16> %out, <8 x i16> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn16_b1(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) { +; CHECK-LABEL: vmovn16_b1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovnb.i32 q1, q0 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> + store <8 x i16> %out, <8 x i16> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn16_b2(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) { +; CHECK-LABEL: vmovn16_b2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.u16 r1, q1[1] +; CHECK-NEXT: vmov.16 q2[0], r1 +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vmov.16 q2[1], r1 +; CHECK-NEXT: vmov.u16 r1, q1[3] +; CHECK-NEXT: vmov.16 q2[2], r1 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.16 q2[3], r1 +; CHECK-NEXT: vmov.u16 r1, q1[5] +; CHECK-NEXT: vmov.16 q2[4], r1 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov.16 q2[5], r1 +; CHECK-NEXT: vmov.u16 r1, q1[7] +; CHECK-NEXT: vmov.16 q2[6], r1 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov.16 q2[7], r1 +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> + store <8 x i16> %out, <8 x i16> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn16_b3(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) { +; CHECK-LABEL: vmovn16_b3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vmov.16 q2[0], r1 +; CHECK-NEXT: vmov.u16 r1, q1[0] +; CHECK-NEXT: vmov.16 q2[1], r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.16 q2[2], r1 +; CHECK-NEXT: vmov.u16 r1, q1[2] +; CHECK-NEXT: vmov.16 q2[3], r1 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.16 q2[4], r1 +; CHECK-NEXT: vmov.u16 r1, q1[4] +; CHECK-NEXT: vmov.16 q2[5], r1 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.16 q2[6], r1 +; CHECK-NEXT: vmov.u16 r1, q1[6] +; CHECK-NEXT: vmov.16 q2[7], r1 +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> + store <8 x i16> %out, <8 x i16> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn16_b4(<8 x i16> %src1, <8 x i16> %src2, <8 x i16> *%dest) { +; CHECK-LABEL: vmovn16_b4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovnb.i32 q0, q1 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <8 x i16> %src1, <8 x i16> %src2, <8 x i32> + store <8 x i16> %out, <8 x i16> *%dest, align 8 + ret void +} + + +define arm_aapcs_vfpcc void @vmovn8_b1(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> *%dest) { +; CHECK-LABEL: vmovn8_b1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovnt.i16 q0, q1 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> + store <16 x i8> %out, <16 x i8> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn8_b2(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> *%dest) { +; CHECK-LABEL: vmovn8_b2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: 
vmovnt.i16 q1, q0 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> + store <16 x i8> %out, <16 x i8> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn8_t1(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> *%dest) { +; CHECK-LABEL: vmovn8_t1: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmovnb.i16 q1, q0 +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> + store <16 x i8> %out, <16 x i8> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn8_t2(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> *%dest) { +; CHECK-LABEL: vmovn8_t2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.u8 r1, q1[1] +; CHECK-NEXT: vmov.8 q2[0], r1 +; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: vmov.8 q2[1], r1 +; CHECK-NEXT: vmov.u8 r1, q1[3] +; CHECK-NEXT: vmov.8 q2[2], r1 +; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: vmov.8 q2[3], r1 +; CHECK-NEXT: vmov.u8 r1, q1[5] +; CHECK-NEXT: vmov.8 q2[4], r1 +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: vmov.8 q2[5], r1 +; CHECK-NEXT: vmov.u8 r1, q1[7] +; CHECK-NEXT: vmov.8 q2[6], r1 +; CHECK-NEXT: vmov.u8 r1, q0[6] +; CHECK-NEXT: vmov.8 q2[7], r1 +; CHECK-NEXT: vmov.u8 r1, q1[9] +; CHECK-NEXT: vmov.8 q2[8], r1 +; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: vmov.8 q2[9], r1 +; CHECK-NEXT: vmov.u8 r1, q1[11] +; CHECK-NEXT: vmov.8 q2[10], r1 +; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: vmov.8 q2[11], r1 +; CHECK-NEXT: vmov.u8 r1, q1[13] +; CHECK-NEXT: vmov.8 q2[12], r1 +; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: vmov.8 q2[13], r1 +; CHECK-NEXT: vmov.u8 r1, q1[15] +; CHECK-NEXT: vmov.8 q2[14], r1 +; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: vmov.8 q2[15], r1 +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> + store <16 x i8> %out, <16 x i8> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn8_t3(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> *%dest) { +; CHECK-LABEL: vmovn8_t3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: vmov.8 q2[0], r1 +; CHECK-NEXT: vmov.u8 r1, q1[0] +; CHECK-NEXT: vmov.8 q2[1], r1 +; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: vmov.8 q2[2], r1 +; CHECK-NEXT: vmov.u8 r1, q1[2] +; CHECK-NEXT: vmov.8 q2[3], r1 +; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: vmov.8 q2[4], r1 +; CHECK-NEXT: vmov.u8 r1, q1[4] +; CHECK-NEXT: vmov.8 q2[5], r1 +; CHECK-NEXT: vmov.u8 r1, q0[7] +; CHECK-NEXT: vmov.8 q2[6], r1 +; CHECK-NEXT: vmov.u8 r1, q1[6] +; CHECK-NEXT: vmov.8 q2[7], r1 +; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: vmov.8 q2[8], r1 +; CHECK-NEXT: vmov.u8 r1, q1[8] +; CHECK-NEXT: vmov.8 q2[9], r1 +; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: vmov.8 q2[10], r1 +; CHECK-NEXT: vmov.u8 r1, q1[10] +; CHECK-NEXT: vmov.8 q2[11], r1 +; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: vmov.8 q2[12], r1 +; CHECK-NEXT: vmov.u8 r1, q1[12] +; CHECK-NEXT: vmov.8 q2[13], r1 +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: vmov.8 q2[14], r1 +; CHECK-NEXT: vmov.u8 r1, q1[14] +; CHECK-NEXT: vmov.8 q2[15], r1 +; CHECK-NEXT: vstrw.32 q2, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> + store <16 x i8> %out, <16 x i8> *%dest, align 8 + ret void +} + +define arm_aapcs_vfpcc void @vmovn8_t4(<16 x i8> %src1, <16 x i8> %src2, <16 x i8> *%dest) { +; CHECK-LABEL: vmovn8_t4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: 
vmovnb.i16 q0, q1 +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %out = shufflevector <16 x i8> %src1, <16 x i8> %src2, <16 x i32> + store <16 x i8> %out, <16 x i8> *%dest, align 8 + ret void +} diff --git a/llvm/test/CodeGen/Thumb2/mve-vmull.ll b/llvm/test/CodeGen/Thumb2/mve-vmull.ll index b7b28068f5280..c1e720fa3ab6c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vmull.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vmull.ll @@ -1,6 +1,105 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK +define arm_aapcs_vfpcc <2 x i64> @sext_02(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: sext_02: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.32 q2[1], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r1 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> + %out1 = sext <2 x i32> %shuf1 to <2 x i64> + %shuf2 = shufflevector <4 x i32> %src2, <4 x i32> undef, <2 x i32> + %out2 = sext <2 x i32> %shuf2 to <2 x i64> + %out = mul <2 x i64> %out1, %out2 + ret <2 x i64> %out +} + +define arm_aapcs_vfpcc <2 x i64> @sext_13(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: sext_13: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrev64.32 q2, q1 +; CHECK-NEXT: vrev64.32 q1, q0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: smull r0, r1, r1, r0 +; CHECK-NEXT: vmov.32 q0[2], r0 +; CHECK-NEXT: vmov.32 q0[3], r1 +; CHECK-NEXT: bx lr +entry: + %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> + %out1 = sext <2 x i32> %shuf1 to <2 x i64> + %shuf2 = shufflevector <4 x i32> %src2, <4 x i32> undef, <2 x i32> + %out2 = sext <2 x i32> %shuf2 to <2 x i64> + %out = mul <2 x i64> %out1, %out2 + ret <2 x i64> %out +} + +define arm_aapcs_vfpcc <2 x i64> @zext_02(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: zext_02: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: umull r0, r1, r1, r0 +; CHECK-NEXT: vmov.32 q2[0], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.32 q2[1], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: umull r0, r1, r1, r0 +; CHECK-NEXT: vmov.32 q2[2], r0 +; CHECK-NEXT: vmov.32 q2[3], r1 +; CHECK-NEXT: vmov q0, q2 +; CHECK-NEXT: bx lr +entry: + %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> + %out1 = zext <2 x i32> %shuf1 to <2 x i64> + %shuf2 = shufflevector <4 x i32> %src2, <4 x i32> undef, <2 x i32> + %out2 = zext <2 x i32> %shuf2 to <2 x i64> + %out = mul <2 x i64> %out1, %out2 + ret <2 x i64> %out +} + +define arm_aapcs_vfpcc <2 x i64> @zext_13(<4 x i32> %src1, <4 x i32> %src2) { +; CHECK-LABEL: zext_13: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vrev64.32 q2, q1 +; CHECK-NEXT: vrev64.32 q1, q0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: umull r0, r1, r1, r0 +; CHECK-NEXT: vmov.32 q0[0], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: umull r0, r1, r1, r0 +; CHECK-NEXT: vmov.32 q0[2], r0 +; 
CHECK-NEXT: vmov.32 q0[3], r1 +; CHECK-NEXT: bx lr +entry: + %shuf1 = shufflevector <4 x i32> %src1, <4 x i32> undef, <2 x i32> + %out1 = zext <2 x i32> %shuf1 to <2 x i64> + %shuf2 = shufflevector <4 x i32> %src2, <4 x i32> undef, <2 x i32> + %out2 = zext <2 x i32> %shuf2 to <2 x i64> + %out = mul <2 x i64> %out1, %out2 + ret <2 x i64> %out +} + + define arm_aapcs_vfpcc <4 x i32> @sext_0246(<8 x i16> %src1, <8 x i16> %src2) { ; CHECK-LABEL: sext_0246: ; CHECK: @ %bb.0: @ %entry diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll index 4ccc6f8b613fb..78aabaf0272f6 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll @@ -118,6 +118,18 @@ define <16 x i8> @avgr_u_v16i8_wrap(<16 x i8> %x, <16 x i8> %y) { ret <16 x i8> %c } +; CHECK-LABEL: abs_v16i8: +; NO-SIMD128-NOT: i8x16 +; SIMD128-NEXT: .functype abs_v16i8 (v128) -> (v128){{$}} +; SIMD128-NEXT: i8x16.abs $push[[R:[0-9]+]]=, $0{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +define <16 x i8> @abs_v16i8(<16 x i8> %x) { + %a = sub <16 x i8> zeroinitializer, %x + %b = icmp slt <16 x i8> %x, zeroinitializer + %c = select <16 x i1> %b, <16 x i8> %a, <16 x i8> %x + ret <16 x i8> %c +} + ; CHECK-LABEL: neg_v16i8: ; NO-SIMD128-NOT: i8x16 ; SIMD128-NEXT: .functype neg_v16i8 (v128) -> (v128){{$}} @@ -431,6 +443,18 @@ define <8 x i16> @avgr_u_v8i16_wrap(<8 x i16> %x, <8 x i16> %y) { ret <8 x i16> %c } +; CHECK-LABEL: abs_v8i16: +; NO-SIMD128-NOT: i16x8 +; SIMD128-NEXT: .functype abs_v8i16 (v128) -> (v128){{$}} +; SIMD128-NEXT: i16x8.abs $push[[R:[0-9]+]]=, $0{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +define <8 x i16> @abs_v8i16(<8 x i16> %x) { + %a = sub <8 x i16> zeroinitializer, %x + %b = icmp slt <8 x i16> %x, zeroinitializer + %c = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %x + ret <8 x i16> %c +} + ; CHECK-LABEL: neg_v8i16: ; NO-SIMD128-NOT: i16x8 ; SIMD128-NEXT: .functype neg_v8i16 (v128) -> (v128){{$}} @@ -713,6 +737,18 @@ define <4 x i32> @max_u_v4i32(<4 x i32> %x, <4 x i32> %y) { ret <4 x i32> %a } +; CHECK-LABEL: abs_v4i32: +; NO-SIMD128-NOT: i32x4 +; SIMD128-NEXT: .functype abs_v4i32 (v128) -> (v128){{$}} +; SIMD128-NEXT: i32x4.abs $push[[R:[0-9]+]]=, $0{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +define <4 x i32> @abs_v4i32(<4 x i32> %x) { + %a = sub <4 x i32> zeroinitializer, %x + %b = icmp slt <4 x i32> %x, zeroinitializer + %c = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %x + ret <4 x i32> %c +} + ; CHECK-LABEL: neg_v4i32: ; NO-SIMD128-NOT: i32x4 ; SIMD128-NEXT: .functype neg_v4i32 (v128) -> (v128){{$}} diff --git a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll index b6680dd36aa79..77e677df6459a 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-intrinsics.ll @@ -95,6 +95,16 @@ define i32 @all_v16i8(<16 x i8> %x) { ret i32 %a } +; CHECK-LABEL: bitmask_v16i8: +; SIMD128-NEXT: .functype bitmask_v16i8 (v128) -> (i32){{$}} +; SIMD128-NEXT: i8x16.bitmask $push[[R:[0-9]+]]=, $0{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +declare i32 @llvm.wasm.bitmask.v16i8(<16 x i8>) +define i32 @bitmask_v16i8(<16 x i8> %x) { + %a = call i32 @llvm.wasm.bitmask.v16i8(<16 x i8> %x) + ret i32 %a +} + ; CHECK-LABEL: bitselect_v16i8: ; SIMD128-NEXT: .functype bitselect_v16i8 (v128, v128, v128) -> (v128){{$}} ; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}} @@ -208,6 +218,16 @@ define i32 @all_v8i16(<8 x i16> %x) { ret i32 %a } +; 
CHECK-LABEL: bitmask_v8i16: +; SIMD128-NEXT: .functype bitmask_v8i16 (v128) -> (i32){{$}} +; SIMD128-NEXT: i16x8.bitmask $push[[R:[0-9]+]]=, $0{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +declare i32 @llvm.wasm.bitmask.v8i16(<8 x i16>) +define i32 @bitmask_v8i16(<8 x i16> %x) { + %a = call i32 @llvm.wasm.bitmask.v8i16(<8 x i16> %x) + ret i32 %a +} + ; CHECK-LABEL: bitselect_v8i16: ; SIMD128-NEXT: .functype bitselect_v8i16 (v128, v128, v128) -> (v128){{$}} ; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}} @@ -317,6 +337,16 @@ define i32 @all_v4i32(<4 x i32> %x) { ret i32 %a } +; CHECK-LABEL: bitmask_v4i32: +; SIMD128-NEXT: .functype bitmask_v4i32 (v128) -> (i32){{$}} +; SIMD128-NEXT: i32x4.bitmask $push[[R:[0-9]+]]=, $0{{$}} +; SIMD128-NEXT: return $pop[[R]]{{$}} +declare i32 @llvm.wasm.bitmask.v4i32(<4 x i32>) +define i32 @bitmask_v4i32(<4 x i32> %x) { + %a = call i32 @llvm.wasm.bitmask.v4i32(<4 x i32> %x) + ret i32 %a +} + ; CHECK-LABEL: bitselect_v4i32: ; SIMD128-NEXT: .functype bitselect_v4i32 (v128, v128, v128) -> (v128){{$}} ; SIMD128-NEXT: v128.bitselect $push[[R:[0-9]+]]=, $0, $1, $2{{$}} diff --git a/llvm/test/CodeGen/WebAssembly/swiftcc.ll b/llvm/test/CodeGen/WebAssembly/swiftcc.ll new file mode 100644 index 0000000000000..dd704b89e2036 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/swiftcc.ll @@ -0,0 +1,46 @@ +; RUN: llc < %s -asm-verbose=false -wasm-keep-registers | FileCheck %s --check-prefix=REG +; RUN: llc < %s -asm-verbose=false | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +; Test direct and indirect function call between mismatched signatures +; CHECK-LABEL: foo: +; CHECK-NEXT: .functype foo (i32, i32, i32, i32) -> () +define swiftcc void @foo(i32, i32) { + ret void +} +@data = global i8* bitcast (void (i32, i32)* @foo to i8*) + +; CHECK-LABEL: bar: +; CHECK-NEXT: .functype bar (i32, i32) -> () +define swiftcc void @bar() { + %1 = load i8*, i8** @data +; REG: call foo, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}} + call swiftcc void @foo(i32 1, i32 2) + + %2 = bitcast i8* %1 to void (i32, i32)* +; REG: call_indirect $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}} +; CHECK: call_indirect (i32, i32, i32, i32) -> () + call swiftcc void %2(i32 1, i32 2) + + %3 = bitcast i8* %1 to void (i32, i32, i32)* +; REG: call_indirect $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}} +; CHECK: call_indirect (i32, i32, i32, i32) -> () + call swiftcc void %3(i32 1, i32 2, i32 swiftself 3) + + %err = alloca swifterror i32*, align 4 + + %4 = bitcast i8* %1 to void (i32, i32, i32**)* +; REG: call_indirect $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}} +; CHECK: call_indirect (i32, i32, i32, i32) -> () + call swiftcc void %4(i32 1, i32 2, i32** swifterror %err) + + %5 = bitcast i8* %1 to void (i32, i32, i32, i32**)* +; REG: call_indirect $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}} +; CHECK: call_indirect (i32, i32, i32, i32) -> () + call swiftcc void %5(i32 1, i32 2, i32 swiftself 3, i32** swifterror %err) + + ret void +} + diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index 1448ba0c39e25..88b08cc1273cb 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -87,6 +87,193 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) nounwind { ret void } +define void @avg_v24i8(<24 x i8>* %a, <24 x i8>* %b) nounwind { +; SSE2-LABEL: avg_v24i8: +; SSE2: # %bb.0: +; SSE2-NEXT: 
movdqa (%rdi), %xmm5 +; SSE2-NEXT: movdqa 16(%rdi), %xmm6 +; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa 16(%rsi), %xmm1 +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm7[8],xmm4[9],xmm7[9],xmm4[10],xmm7[10],xmm4[11],xmm7[11],xmm4[12],xmm7[12],xmm4[13],xmm7[13],xmm4[14],xmm7[14],xmm4[15],xmm7[15] +; SSE2-NEXT: movdqa %xmm4, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm5, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm7[4],xmm10[5],xmm7[5],xmm10[6],xmm7[6],xmm10[7],xmm7[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] +; SSE2-NEXT: movdqa %xmm2, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; SSE2-NEXT: paddd %xmm4, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7] +; SSE2-NEXT: paddd %xmm9, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3] +; SSE2-NEXT: paddd %xmm5, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7] +; SSE2-NEXT: paddd %xmm10, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3] +; SSE2-NEXT: paddd %xmm6, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm6, %xmm6 +; SSE2-NEXT: psubd %xmm6, %xmm3 +; SSE2-NEXT: psubd %xmm6, %xmm2 +; SSE2-NEXT: psubd %xmm6, %xmm4 +; SSE2-NEXT: psubd %xmm6, %xmm0 +; SSE2-NEXT: psubd %xmm6, %xmm5 +; SSE2-NEXT: psubd %xmm6, %xmm1 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm6, %xmm2 +; SSE2-NEXT: packuswb %xmm3, %xmm2 +; SSE2-NEXT: pand %xmm6, %xmm4 +; 
SSE2-NEXT: pand %xmm6, %xmm0 +; SSE2-NEXT: packuswb %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm6, %xmm5 +; SSE2-NEXT: pand %xmm6, %xmm1 +; SSE2-NEXT: packuswb %xmm5, %xmm1 +; SSE2-NEXT: packuswb %xmm0, %xmm1 +; SSE2-NEXT: movq %xmm1, (%rax) +; SSE2-NEXT: movdqu %xmm0, (%rax) +; SSE2-NEXT: retq +; +; AVX1-LABEL: avg_v24i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm8 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vmovdqa (%rsi), %xmm6 +; AVX1-NEXT: vmovdqa 16(%rsi), %xmm7 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[2,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,3,0,1] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm7[1,1,2,3] +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero +; AVX1-NEXT: vpaddd %xmm5, %xmm8, %xmm5 +; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6 +; AVX1-NEXT: vpsubd %xmm6, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm6, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm6, %xmm3, %xmm3 +; AVX1-NEXT: vpsubd %xmm6, %xmm0, %xmm0 +; AVX1-NEXT: vpsubd %xmm6, %xmm4, %xmm4 +; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm5 +; AVX1-NEXT: vpsrld $1, %xmm5, %xmm5 +; AVX1-NEXT: vpsrld $1, %xmm4, %xmm4 +; AVX1-NEXT: vpackusdw %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $1, %xmm3, %xmm3 +; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpsrld $1, %xmm2, %xmm2 +; AVX1-NEXT: vpsrld $1, %xmm1, %xmm1 +; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm2, 
%xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX1-NEXT: vmovq %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v24i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpbroadcastq 8(%rsi), %xmm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpsubd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpsubd %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm0[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm1 +; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1 +; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vmovq %xmm1, (%rax) +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: avg_v24i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovdqa (%rdi), %xmm0 +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512-NEXT: vpaddd 
%ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 +; AVX512-NEXT: vpsubd %ymm2, %ymm1, %ymm1 +; AVX512-NEXT: vpsrld $1, %ymm1, %ymm1 +; AVX512-NEXT: vpmovdb %zmm1, %xmm1 +; AVX512-NEXT: vpavgb (%rsi), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rax) +; AVX512-NEXT: vmovq %xmm1, (%rax) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %1 = load <24 x i8>, <24 x i8>* %a + %2 = load <24 x i8>, <24 x i8>* %b + %3 = zext <24 x i8> %1 to <24 x i32> + %4 = zext <24 x i8> %2 to <24 x i32> + %5 = add nuw nsw <24 x i32> %3, + %6 = add nuw nsw <24 x i32> %5, %4 + %7 = lshr <24 x i32> %6, + %8 = trunc <24 x i32> %7 to <24 x i8> + store <24 x i8> %8, <24 x i8>* undef, align 4 + ret void +} + define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v32i8: ; SSE2: # %bb.0: @@ -724,6 +911,210 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) nounwind { ret void } +define void @avg_v40i16(<40 x i16>* %a, <40 x i16>* %b) nounwind { +; SSE2-LABEL: avg_v40i16: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqa 64(%rdi), %xmm10 +; SSE2-NEXT: movdqa (%rdi), %xmm5 +; SSE2-NEXT: movdqa 16(%rdi), %xmm6 +; SSE2-NEXT: movdqa 32(%rdi), %xmm13 +; SSE2-NEXT: movdqa 48(%rdi), %xmm12 +; SSE2-NEXT: movdqa 64(%rsi), %xmm8 +; SSE2-NEXT: movdqa (%rsi), %xmm1 +; SSE2-NEXT: movdqa 16(%rsi), %xmm14 +; SSE2-NEXT: movdqa 32(%rsi), %xmm11 +; SSE2-NEXT: movdqa 48(%rsi), %xmm9 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE2-NEXT: paddd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm6, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm14, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE2-NEXT: paddd %xmm7, %xmm3 +; SSE2-NEXT: movdqa %xmm13, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3] +; SSE2-NEXT: paddd %xmm6, %xmm14 +; SSE2-NEXT: movdqa %xmm11, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; SSE2-NEXT: paddd %xmm5, %xmm7 +; SSE2-NEXT: movdqa %xmm12, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3] +; SSE2-NEXT: paddd %xmm13, %xmm11 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; SSE2-NEXT: paddd %xmm5, %xmm6 +; SSE2-NEXT: movdqa %xmm10, %xmm0 +; 
SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] +; SSE2-NEXT: paddd %xmm12, %xmm9 +; SSE2-NEXT: movdqa %xmm8, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] +; SSE2-NEXT: paddd %xmm10, %xmm8 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: psubd %xmm0, %xmm4 +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: psubd %xmm0, %xmm3 +; SSE2-NEXT: psubd %xmm0, %xmm14 +; SSE2-NEXT: psubd %xmm0, %xmm7 +; SSE2-NEXT: psubd %xmm0, %xmm11 +; SSE2-NEXT: psubd %xmm0, %xmm6 +; SSE2-NEXT: psubd %xmm0, %xmm9 +; SSE2-NEXT: psubd %xmm0, %xmm5 +; SSE2-NEXT: psubd %xmm0, %xmm8 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] +; SSE2-NEXT: psrld $1, %xmm11 +; SSE2-NEXT: psrld $1, %xmm7 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0] +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0] +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] +; SSE2-NEXT: movdqu %xmm5, (%rax) +; SSE2-NEXT: movdqu %xmm4, (%rax) +; SSE2-NEXT: movdqu %xmm3, (%rax) +; SSE2-NEXT: movdqu %xmm2, (%rax) +; SSE2-NEXT: movdqu %xmm1, (%rax) +; SSE2-NEXT: retq +; +; AVX1-LABEL: avg_v40i16: +; 
AVX1: # %bb.0: +; AVX1-NEXT: vmovdqa (%rdi), %xmm0 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX1-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX1-NEXT: vpavgw 64(%rsi), %xmm4, %xmm4 +; AVX1-NEXT: vpavgw (%rsi), %xmm0, %xmm0 +; AVX1-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 +; AVX1-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2 +; AVX1-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3 +; AVX1-NEXT: vmovdqu %xmm3, (%rax) +; AVX1-NEXT: vmovdqu %xmm2, (%rax) +; AVX1-NEXT: vmovdqu %xmm1, (%rax) +; AVX1-NEXT: vmovdqu %xmm0, (%rax) +; AVX1-NEXT: vmovdqu %xmm4, (%rax) +; AVX1-NEXT: retq +; +; AVX2-LABEL: avg_v40i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3 +; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX2-NEXT: vpavgw 64(%rsi), %xmm4, %xmm4 +; AVX2-NEXT: vpavgw (%rsi), %xmm0, %xmm0 +; AVX2-NEXT: vpavgw 16(%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vpavgw 32(%rsi), %xmm2, %xmm2 +; AVX2-NEXT: vpavgw 48(%rsi), %xmm3, %xmm3 +; AVX2-NEXT: vmovdqu %xmm3, (%rax) +; AVX2-NEXT: vmovdqu %xmm2, (%rax) +; AVX2-NEXT: vmovdqu %xmm1, (%rax) +; AVX2-NEXT: vmovdqu %xmm0, (%rax) +; AVX2-NEXT: vmovdqu %xmm4, (%rax) +; AVX2-NEXT: retq +; +; AVX512F-LABEL: avg_v40i16: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX512F-NEXT: vpsubd %ymm3, %ymm2, %ymm2 +; AVX512F-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX512F-NEXT: vpmovdw %zmm2, %ymm2 +; AVX512F-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX512F-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 +; AVX512F-NEXT: vmovdqu %ymm1, (%rax) +; AVX512F-NEXT: vmovdqu %ymm0, (%rax) +; AVX512F-NEXT: vmovdqu %xmm2, (%rax) +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: avg_v40i16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512BW-NEXT: vpaddd %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX512BW-NEXT: vpsubd %ymm3, %ymm2, %ymm2 +; AVX512BW-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX512BW-NEXT: vpavgw 32(%rsi), %ymm1, %ymm1 +; AVX512BW-NEXT: vpavgw (%rsi), %ymm0, %ymm0 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovdw %zmm2, %ymm1 +; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rax) +; AVX512BW-NEXT: vmovdqu %xmm1, (%rax) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %1 = load <40 x i16>, <40 x i16>* %a + %2 = load <40 x i16>, <40 x 
i16>* %b + %3 = zext <40 x i16> %1 to <40 x i32> + %4 = zext <40 x i16> %2 to <40 x i32> + %5 = add nuw nsw <40 x i32> %3, + %6 = add nuw nsw <40 x i32> %5, %4 + %7 = lshr <40 x i32> %6, + %8 = trunc <40 x i32> %7 to <40 x i16> + store <40 x i16> %8, <40 x i16>* undef, align 4 + ret void +} + define void @avg_v4i8_2(<4 x i8>* %a, <4 x i8>* %b) nounwind { ; SSE2-LABEL: avg_v4i8_2: ; SSE2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 888829e887522..851ce75b4632f 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -26,189 +26,293 @@ define i16 @unpckbw_test(i16 %a0, i16 %a1) { ret i16 %res } +define <16 x i32>@test_int_x86_avx512_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcastd_gpr_512: +; X86: ## %bb.0: +; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0x44,0x24,0x01] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcastd_gpr_512: +; X64: ## %bb.0: +; X64-NEXT: vpbroadcastd %edi, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x7c,0xc7] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i32> %x1, i16 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512: ; X86: ## %bb.0: -; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0x4c,0x24,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6f,0xc1] -; X86-NEXT: vmovdqa32 %zmm1, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x6f,0xd1] -; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x58,0x44,0x24,0x01] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512: ; X64: ## %bb.0: -; X64-NEXT: vpbroadcastd %edi, %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x7c,0xcf] ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpbroadcastd %edi, %zmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x7c,0xc7] -; X64-NEXT: vpbroadcastd %edi, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7c,0xd7] -; X64-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 -1) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res2, %res3 - ret <16 x i32> %res4 - } + %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> %x1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pbroadcastd_gpr_512(i32 %x0, i16 %mask) { +; X86-LABEL: 
test_int_x86_avx512_maskz_pbroadcastd_gpr_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x58,0x44,0x24,0x01] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastd_gpr_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vpbroadcastd %edi, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7c,0xc7] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32 %x0, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res +} + declare <16 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.512(i32, <16 x i32>, i16) +define <8 x i64>@test_int_x86_avx512_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcastq_gpr_512: +; X86: ## %bb.0: +; X86-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0x84,0x24,0x04,0x00,0x00,0x00] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcastq_gpr_512: +; X64: ## %bb.0: +; X64-NEXT: vpbroadcastq %rdi, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x7c,0xc7] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1) + ret <8 x i64> %res +} define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64> %x1, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512: ; X86: ## %bb.0: -; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x8c,0x24,0x04,0x00,0x00,0x00] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6f,0xc1] -; X86-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6f,0xd1] -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x59,0x84,0x24,0x04,0x00,0x00,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512: ; X64: ## %bb.0: -; X64-NEXT: vpbroadcastq %rdi, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x7c,0xcf] ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x7c,0xc7] -; X64-NEXT: vpbroadcastq %rdi, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x7c,0xd7] -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 -1) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res2, %res3 - ret <8 x i64> %res4 + %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> %x1,i8 %mask) + ret <8 x i64> %res +} + +define <8 x 
i64>@test_int_x86_avx512_maskz_pbroadcastq_gpr_512(i64 %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastq_gpr_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x59,0x84,0x24,0x04,0x00,0x00,0x00] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastq_gpr_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x7c,0xc7] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64 %x0, <8 x i64> zeroinitializer,i8 %mask) + ret <8 x i64> %res } + declare <8 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.512(i64, <8 x i64>, i8) declare <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float>, <16 x float>, i16) nounwind readonly -define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) { -; X86-LABEL: test_x86_vbroadcast_ss_ps_512: +define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1) { +; CHECK-LABEL: test_x86_vbroadcast_ss_ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + + %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> undef, i16 -1) + ret <16 x float> %res +} + +define <16 x float> @test_x86_mask_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> %a1, i16 %mask ) { +; X86-LABEL: test_x86_mask_vbroadcast_ss_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vbroadcastss %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0xd0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x18,0xc8] -; X86-NEXT: vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] -; X86-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x18,0xc0] -; X86-NEXT: vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] +; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; -; X64-LABEL: test_x86_vbroadcast_ss_ps_512: +; X64-LABEL: test_x86_mask_vbroadcast_ss_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastss %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x18,0xc8] -; X64-NEXT: vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] +; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + + %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask) + ret <16 x float> %res +} + +define <16 x float> @test_x86_maskz_vbroadcast_ss_ps_512(<4 x float> %a0, i16 %mask ) { +; X86-LABEL: test_x86_maskz_vbroadcast_ss_ps_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x18,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; 
X64-LABEL: test_x86_maskz_vbroadcast_ss_ps_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x18,0xc0] -; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 -1) - %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> %a1, i16 %mask) - %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res2, %res3 - ret <16 x float> %res4 + %res = call <16 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.512(<4 x float> %a0, <16 x float> zeroinitializer, i16 %mask) + ret <16 x float> %res } declare <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double>, <8 x double>, i8) nounwind readonly -define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) { -; X86-LABEL: test_x86_vbroadcast_sd_pd_512: +define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1) { +; CHECK-LABEL: test_x86_vbroadcast_sd_pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + + %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> undef, i8 -1) + ret <8 x double> %res +} + +define <8 x double> @test_x86_mask_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double> %a1, i8 %mask ) { +; X86-LABEL: test_x86_mask_vbroadcast_sd_pd_512: ; X86: ## %bb.0: -; X86-NEXT: vbroadcastsd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x19,0xc8] -; X86-NEXT: vaddpd %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9] -; X86-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x19,0xc0] -; X86-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] +; X86-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; -; X64-LABEL: test_x86_vbroadcast_sd_pd_512: +; X64-LABEL: test_x86_mask_vbroadcast_sd_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vbroadcastsd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x19,0xc8] -; X64-NEXT: vaddpd %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9] +; X64-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + + %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask) + ret <8 x double> %res +} + +define <8 x double> @test_x86_maskz_vbroadcast_sd_pd_512(<2 x double> %a0, i8 %mask ) { +; X86-LABEL: test_x86_maskz_vbroadcast_sd_pd_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: 
vbroadcastsd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x19,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_x86_maskz_vbroadcast_sd_pd_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x19,0xc0] -; X64-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 -1) - %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> %a1, i8 %mask) - %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res2, %res3 - ret <8 x double> %res4 + %res = call <8 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.512(<2 x double> %a0, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res } declare <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32>, <16 x i32>, i16) -define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastd_512: +define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x18,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastd_512: ; X86: ## %bb.0: -; X86-NEXT: vpbroadcastd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0xd0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x58,0xc8] -; X86-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x58,0xc0] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] -; X86-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastd_512: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastd_512: ; X64: ## %bb.0: -; X64-NEXT: vpbroadcastd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x58,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x58,0xc8] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pbroadcastd_512(<4 x i32> %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastd_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x58,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] 
+; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastd_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x58,0xc0] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] -; X64-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) - %res1 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 %mask) - %res2 = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res2, %res3 - ret <16 x i32> %res4 + %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64>, <8 x i64>, i8) -define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastq_512: +define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x19,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastq_512: ; X86: ## %bb.0: -; X86-NEXT: vpbroadcastq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x59,0xc8] -; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x59,0xc0] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastq_512: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastq_512: ; X64: ## %bb.0: -; X64-NEXT: vpbroadcastq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x59,0xc8] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pbroadcastq_512(<2 x i64> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastq_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x59,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; 
X64-LABEL: test_int_x86_avx512_maskz_pbroadcastq_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x59,0xc0] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) - %res1 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 %mask) - %res2 = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res2, %res3 - ret <8 x i64> %res4 + %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> zeroinitializer,i8 %mask) + ret <8 x i64> %res } declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16) @@ -285,113 +389,158 @@ define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 declare <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_movddup_512(<8 x double> %x0, <8 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_movddup_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vmovddup %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xff,0x48,0x12,0xc0] +; CHECK-NEXT: ## zmm0 = zmm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_movddup_512: ; X86: ## %bb.0: -; X86-NEXT: vmovddup %zmm0, %zmm2 ## encoding: [0x62,0xf1,0xff,0x48,0x12,0xd0] -; X86-NEXT: ## zmm2 = zmm0[0,0,2,2,4,4,6,6] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vmovddup %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x49,0x12,0xc8] ; X86-NEXT: ## zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6] -; X86-NEXT: vaddpd %zmm2, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xca] -; X86-NEXT: vmovddup %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xc9,0x12,0xc0] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] -; X86-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] +; X86-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_movddup_512: ; X64: ## %bb.0: -; X64-NEXT: vmovddup %zmm0, %zmm2 ## encoding: [0x62,0xf1,0xff,0x48,0x12,0xd0] -; X64-NEXT: ## zmm2 = zmm0[0,0,2,2,4,4,6,6] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vmovddup %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x49,0x12,0xc8] ; X64-NEXT: ## zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6] -; X64-NEXT: vaddpd %zmm2, %zmm1, %zmm1 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xca] +; X64-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_movddup_512(<8 x double> %x0, i8 %x2) { +; X86-LABEL: 
test_int_x86_avx512_maskz_movddup_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovddup %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xc9,0x12,0xc0] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_movddup_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vmovddup %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xc9,0x12,0xc0] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] -; X64-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2) - %res1 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res2, %res3 - ret <8 x double> %res4 + %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> zeroinitializer, i8 %x2) + ret <8 x double> %res } declare <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double>, i32, <8 x double>, i8) -define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, i32 %x1, <8 x double> %x2, i8 %x3) { +define <8 x double>@test_int_x86_avx512_perm_df_512(<8 x double> %x0, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_perm_df_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermpd $3, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x01,0xc0,0x03] +; CHECK-NEXT: ## zmm0 = zmm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_perm_df_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_perm_df_512: ; X86: ## %bb.0: -; X86-NEXT: vpermpd $3, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x01,0xd0,0x03] -; X86-NEXT: ## zmm2 = zmm0[3,0,0,0,7,4,4,4] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermpd $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x01,0xc8,0x03] ; X86-NEXT: ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4] -; X86-NEXT: vpermpd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x01,0xc0,0x03] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] -; X86-NEXT: vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0] -; X86-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2] +; X86-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_perm_df_512: ; X64: ## %bb.0: -; X64-NEXT: vpermpd $3, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x01,0xd0,0x03] -; X64-NEXT: ## zmm2 = zmm0[3,0,0,0,7,4,4,4] -; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermpd $3, %zmm0, %zmm1 {%k1} ## 
encoding: [0x62,0xf3,0xfd,0x49,0x01,0xc8,0x03] ; X64-NEXT: ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4] +; X64-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_perm_df_512(<8 x double> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_perm_df_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermpd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x01,0xc0,0x03] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_perm_df_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermpd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x01,0xc0,0x03] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] -; X64-NEXT: vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0] -; X64-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3) - %res2 = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> %x2, i8 -1) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 + %res = call <8 x double> @llvm.x86.avx512.mask.perm.df.512(<8 x double> %x0, i32 3, <8 x double> zeroinitializer, i8 %x3) + ret <8 x double> %res } declare <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64>, i32, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_perm_di_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermpd $3, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x01,0xc0,0x03] +; CHECK-NEXT: ## zmm0 = zmm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_perm_di_512(<8 x i64> %x0, i32 %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_perm_di_512: ; X86: ## %bb.0: -; X86-NEXT: vpermq $3, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x00,0xd0,0x03] -; X86-NEXT: ## zmm2 = zmm0[3,0,0,0,7,4,4,4] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermq $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x00,0xc8,0x03] ; X86-NEXT: ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4] -; X86-NEXT: vpermq $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x00,0xc0,0x03] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: 
test_int_x86_avx512_mask_perm_di_512: ; X64: ## %bb.0: -; X64-NEXT: vpermq $3, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x00,0xd0,0x03] -; X64-NEXT: ## zmm2 = zmm0[3,0,0,0,7,4,4,4] ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpermq $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x00,0xc8,0x03] ; X64-NEXT: ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_perm_di_512(<8 x i64> %x0, i32 %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_perm_di_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermq $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x00,0xc0,0x03] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_perm_di_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpermq $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x00,0xc0,0x03] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4] -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + %res = call <8 x i64> @llvm.x86.avx512.mask.perm.di.512(<8 x i64> %x0, i32 3, <8 x i64> zeroinitializer, i8 %x3) + ret <8 x i64> %res } define void @test_store1(<16 x float> %data, i8* %ptr, i8* %ptr2, i16 %mask) { @@ -830,111 +979,154 @@ define <8 x i64> @test_mask_load_aligned_q(<8 x i64> %data, i8* %ptr, i8 %mask) declare <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double>, i32, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermil_pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermilpd $22, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x05,0xc0,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[0,1,3,2,5,4,6,6] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_vpermil_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermil_pd_512: ; X86: ## %bb.0: -; X86-NEXT: vpermilpd $22, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x05,0xd0,0x16] -; X86-NEXT: ## zmm2 = zmm0[0,1,3,2,5,4,6,6] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilpd $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x05,0xc8,0x16] ; X86-NEXT: ## zmm1 {%k1} = 
zmm0[0,1,3,2,5,4,6,6] -; X86-NEXT: vpermilpd $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x05,0xc0,0x16] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6] -; X86-NEXT: vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0] -; X86-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2] +; X86-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermil_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vpermilpd $22, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x05,0xd0,0x16] -; X64-NEXT: ## zmm2 = zmm0[0,1,3,2,5,4,6,6] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x05,0xc8,0x16] ; X64-NEXT: ## zmm1 {%k1} = zmm0[0,1,3,2,5,4,6,6] +; X64-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_vpermil_pd_512(<8 x double> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermil_pd_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilpd $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x05,0xc0,0x16] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermil_pd_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x05,0xc0,0x16] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[0,1,3,2,5,4,6,6] -; X64-NEXT: vaddpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc0] -; X64-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3) - %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> %x2, i8 -1) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 + %res = call <8 x double> @llvm.x86.avx512.mask.vpermil.pd.512(<8 x double> %x0, i32 22, <8 x double> zeroinitializer, i8 %x3) + ret <8 x double> %res } declare <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float>, i32, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermilps $22, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xc0,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_vpermil_ps_512(<16 x float> %x0, <16 x float> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermil_ps_512: ; 
X86: ## %bb.0: -; X86-NEXT: vpermilps $22, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xd0,0x16] -; X86-NEXT: ## zmm2 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermilps $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x04,0xc8,0x16] ; X86-NEXT: ## zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] -; X86-NEXT: vpermilps $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x04,0xc0,0x16] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] -; X86-NEXT: vaddps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc0] -; X86-NEXT: vaddps %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc2] +; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermil_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vpermilps $22, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0x7d,0x48,0x04,0xd0,0x16] -; X64-NEXT: ## zmm2 = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps $22, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x04,0xc8,0x16] ; X64-NEXT: ## zmm1 {%k1} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] +; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_vpermil_ps_512(<16 x float> %x0, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermil_ps_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermilps $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x04,0xc0,0x16] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermil_ps_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps $22, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x04,0xc0,0x16] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[2,1,1,0,6,5,5,4,10,9,9,8,14,13,13,12] -; X64-NEXT: vaddps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc0] -; X64-NEXT: vaddps %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3) - %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> %x2, i16 -1) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res3, %res2 - ret <16 x float> %res4 + %res = call <16 x float> @llvm.x86.avx512.mask.vpermil.ps.512(<16 x float> %x0, i32 22, <16 x float> zeroinitializer, i16 %x3) + ret <16 x float> %res } declare <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32>, i32, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermilps $3, %zmm0, %zmm0 ## encoding: 
[0x62,0xf3,0x7d,0x48,0x04,0xc0,0x03] +; CHECK-NEXT: ## zmm0 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pshuf_d_512(<16 x i32> %x0, i32 %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshuf_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpshufd $3, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x7d,0x48,0x70,0xd0,0x03] -; X86-NEXT: ## zmm2 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpshufd $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x70,0xc8,0x03] ; X86-NEXT: ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] -; X86-NEXT: vpshufd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x70,0xc0,0x03] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] -; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshuf_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpshufd $3, %zmm0, %zmm2 ## encoding: [0x62,0xf1,0x7d,0x48,0x70,0xd0,0x03] -; X64-NEXT: ## zmm2 = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpshufd $3, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x70,0xc8,0x03] ; X64-NEXT: ## zmm1 {%k1} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pshuf_d_512(<16 x i32> %x0, i32 %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshuf_d_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpshufd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x70,0xc0,0x03] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshuf_d_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpshufd $3, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xc9,0x70,0xc0,0x03] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[3,0,0,0,7,4,4,4,11,8,8,8,15,12,12,12] -; X64-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 %x3) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 + %res = call <16 x i32> @llvm.x86.avx512.mask.pshuf.d.512(<16 x i32> %x0, i32 3, <16 x i32> zeroinitializer, i16 
%x3) + ret <16 x i32> %res } define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) { @@ -1083,238 +1275,281 @@ declare i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64>, <8 x i64>, i8) declare <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double>, <8 x double>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckh_pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpckhpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x15,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_unpckh_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckh_pd_512: ; X86: ## %bb.0: -; X86-NEXT: vunpckhpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x15,0xd9] -; X86-NEXT: ## zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpckhpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x15,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; X86-NEXT: vaddpd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc3] +; X86-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckh_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vunpckhpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x15,0xd9] -; X64-NEXT: ## zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpckhpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x15,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; X64-NEXT: vaddpd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc3] +; X64-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckh.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 + ret <8 x double> %res } declare <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float>, <16 x float>, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckh_ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpckhps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x15,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + define <16 x 
float>@test_int_x86_avx512_mask_unpckh_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckh_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vunpckhps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x15,0xd9] -; X86-NEXT: ## zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vunpckhps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x15,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; X86-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckh_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vunpckhps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x15,0xd9] -; X64-NEXT: ## zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpckhps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x15,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; X64-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckh.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 + ret <16 x float> %res } declare <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double>, <8 x double>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckl_pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpcklpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x14,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_unpckl_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckl_pd_512: ; X86: ## %bb.0: -; X86-NEXT: vunpcklpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x14,0xd9] -; X86-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpcklpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x14,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; X86-NEXT: vaddpd %zmm3, %zmm2, %zmm0 
## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc3] +; X86-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckl_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vunpcklpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x14,0xd9] -; X64-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpcklpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x14,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; X64-NEXT: vaddpd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc3] +; X64-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.unpckl.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x2, i8 -1) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 + ret <8 x double> %res } declare <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float>, <16 x float>, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckl_ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpcklps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x14,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_unpckl_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckl_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vunpcklps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x14,0xd9] -; X86-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vunpcklps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x14,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; X86-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckl_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vunpcklps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x14,0xd9] -; X64-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpcklps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0x14,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; X64-NEXT: 
vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.mask.unpckl.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 + ret <16 x float> %res } declare <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklqd_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpcklpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x14,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6c,0xd9] -; X86-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6c,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; X86-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6c,0xc1] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; X86-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0xd4,0xc0] -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklqd_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6c,0xd9] -; X64-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6c,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_punpcklqd_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_punpcklqd_q_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6c,0xc1] +; X86-NEXT: ## zmm0 {%k1} 
{z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_punpcklqd_q_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpcklqdq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0x6c,0xc1] ; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; X64-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0xd4,0xc0] -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res2, %res3 - ret <8 x i64> %res4 + %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer,i8 %x3) + ret <8 x i64> %res } declare <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhqd_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpckhpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x15,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_punpckhqd_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpunpckhqdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6d,0xd9] -; X86-NEXT: ## zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpckhqdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6d,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; X86-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhqd_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpunpckhqdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6d,0xd9] -; X64-NEXT: ## zmm3 = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckhqdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0x6d,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[1],zmm1[1],zmm0[3],zmm1[3],zmm0[5],zmm1[5],zmm0[7],zmm1[7] -; X64-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x 
i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpckhqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhd_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpckhps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x15,0xc1] +; CHECK-NEXT: ## zmm0 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_punpckhd_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhd_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpunpckhdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7d,0x48,0x6a,0xd9] -; X86-NEXT: ## zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpckhdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6a,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhd_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpunpckhdq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7d,0x48,0x6a,0xd9] -; X64-NEXT: ## zmm3 = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckhdq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x6a,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[14],zmm1[14],zmm0[15],zmm1[15] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckhd.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckld_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vunpcklps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x14,0xc1] +; CHECK-NEXT: ## zmm0 = 
zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_punpckld_q_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckld_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpunpckldq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7d,0x48,0x62,0xd9] -; X86-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpckldq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x62,0xd1] ; X86-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckld_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpunpckldq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7d,0x48,0x62,0xd9] -; X64-NEXT: ## zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckldq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x49,0x62,0xd1] ; X64-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[12],zmm1[12],zmm0[13],zmm1[13] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.punpckld.q.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } define <16 x i32> @test_x86_avx512_pslli_d(<16 x i32> %a0) { @@ -2637,387 +2872,471 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmull.d.512(<16 x i32>, <16 x i32>, <16 declare <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float>, <16 x float>, i32, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_f32x4: +; CHECK: ## %bb.0: +; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x23,0xc1,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_shuf_f32x4(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_f32x4: ; X86: ## %bb.0: -; X86-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x23,0xd9,0x16] 
-; X86-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vshuff32x4 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x23,0xd1,0x16] ; X86-NEXT: ## zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] -; X86-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_f32x4: ; X64: ## %bb.0: -; X64-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x23,0xd9,0x16] -; X64-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshuff32x4 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x23,0xd1,0x16] ; X64-NEXT: ## zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] -; X64-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.f32x4(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 + ret <16 x float> %res } declare <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double>, <8 x double>, i32, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_f64x2: +; CHECK: ## %bb.0: +; CHECK-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x23,0xc1,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_f64x2: ; X86: ## %bb.0: -; X86-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x23,0xd9,0x16] -; X86-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x23,0xd1,0x16] ; X86-NEXT: ## zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; X86-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3] -; X86-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x23,0xc1,0x16] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] +; X86-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_f64x2: ; X64: ## %bb.0: -; X64-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x23,0xd9,0x16] -; X64-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; 
X64-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x23,0xd1,0x16] ; X64-NEXT: ## zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; X64-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3] -; X64-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x23,0xc1,0x16] -; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] +; X64-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4) + ret <8 x double> %res +} - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 +define <8 x double>@test_int_x86_avx512_maskz_shuf_f64x2(<8 x double> %x0, <8 x double> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_shuf_f64x2: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x23,0xc1,0x16] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_shuf_f64x2: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vshuff64x2 $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x23,0xc1,0x16] +; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.shuf.f64x2(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4) + ret <8 x double> %res } declare <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32>, <16 x i32>, i32, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_i32x4: +; CHECK: ## %bb.0: +; CHECK-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc1,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_shuf_i32x4(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_i32x4: ; X86: ## %bb.0: -; X86-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x43,0xd9,0x16] -; X86-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vshufi32x4 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x43,0xd1,0x16] ; X86-NEXT: ## zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: 
[0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_i32x4: ; X64: ## %bb.0: -; X64-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x43,0xd9,0x16] -; X64-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufi32x4 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x43,0xd1,0x16] ; X64-NEXT: ## zmm2 {%k1} = zmm0[8,9,10,11,4,5,6,7],zmm1[4,5,6,7,0,1,2,3] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.shuf.i32x4(<16 x i32> %x0, <16 x i32> %x1, i32 22, <16 x i32> %x3, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64>, <8 x i64>, i32, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_i64x2: +; CHECK: ## %bb.0: +; CHECK-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x43,0xc1,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_shuf_i64x2(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_i64x2: ; X86: ## %bb.0: -; X86-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x43,0xd9,0x16] -; X86-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x43,0xd1,0x16] ; X86-NEXT: ## zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; X86-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_i64x2: ; X64: ## %bb.0: -; X64-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x43,0xd9,0x16] -; X64-NEXT: ## zmm3 = zmm0[4,5,2,3],zmm1[2,3,0,1] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufi64x2 $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x43,0xd1,0x16] ; X64-NEXT: ## zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1] -; X64-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 %x4) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.shuf.i64x2(<8 x i64> %x0, <8 x i64> %x1, i32 22, <8 x i64> %x3, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double>, <8 x double>, i32, <8 x 
double>, i8) +define <8 x double>@test_int_x86_avx512_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xc6,0xc1,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_pd_512: ; X86: ## %bb.0: -; X86-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0xc6,0xd9,0x16] -; X86-NEXT: ## zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc6,0xd1,0x16] ; X86-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; X86-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3] -; X86-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xc6,0xc1,0x16] -; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] +; X86-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0xc6,0xd9,0x16] -; X64-NEXT: ## zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xc6,0xd1,0x16] ; X64-NEXT: ## zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; X64-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3] -; X64-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xc6,0xc1,0x16] -; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] +; X64-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4) + ret <8 x double> %res +} - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 +define <8 x double>@test_int_x86_avx512_maskz_shuf_pd_512(<8 x double> %x0, <8 x double> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_shuf_pd_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: 
[0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xc6,0xc1,0x16] +; X86-NEXT: ## zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_shuf_pd_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vshufpd $22, %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xc6,0xc1,0x16] +; X64-NEXT: ## zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> zeroinitializer, i8 %x4) + ret <8 x double> %res } declare <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float>, <16 x float>, i32, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vshufps $22, %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0xc6,0xc1,0x16] +; CHECK-NEXT: ## zmm0 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_shuf_ps_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vshufps $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0xc6,0xd9,0x16] -; X86-NEXT: ## zmm3 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vshufps $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc6,0xd1,0x16] ; X86-NEXT: ## zmm2 {%k1} = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12] -; X86-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vshufps $22, %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0xc6,0xd9,0x16] -; X64-NEXT: ## zmm3 = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufps $22, %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x49,0xc6,0xd1,0x16] ; X64-NEXT: ## zmm2 {%k1} = zmm0[2,1],zmm1[1,0],zmm0[6,5],zmm1[5,4],zmm0[10,9],zmm1[9,8],zmm0[14,13],zmm1[13,12] -; X64-NEXT: vaddps %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc3] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 %x4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.shuf.ps.512(<16 x float> %x0, <16 x float> %x1, i32 22, <16 x float> %x3, i16 -1) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 
+ ret <16 x float> %res } declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmaxsd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x3d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmaxs_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpmaxsd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3d,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxsd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3d,0xd1] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpmaxsd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3d,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxsd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3d,0xd1] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x3d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmaxs_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3d,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxsq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3d,0xd1] -; X86-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3d,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxsq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: 
[0x62,0xf2,0xfd,0x49,0x3d,0xd1] -; X64-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmaxud %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x3f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmaxu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxu_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpmaxud %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3f,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxud %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3f,0xd1] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpmaxud %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3f,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxud %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3f,0xd1] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaxu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x3f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmaxu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxu_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3f,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxuq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: 
[0x62,0xf2,0xfd,0x49,0x3f,0xd1] -; X86-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3f,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxuq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3f,0xd1] -; X64-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmaxu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpminsd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x39,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmins_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpminsd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x39,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminsd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x39,0xd1] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpminsd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x39,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminsd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x39,0xd1] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmins.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpminsq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x39,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> 
%x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmins_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x39,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminsq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x39,0xd1] -; X86-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x39,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminsq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x39,0xd1] -; X64-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmins.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpminud %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x3b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pminu_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpminud %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3b,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminud %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3b,0xd1] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpminud %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x3b,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminud %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x3b,0xd1] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret 
<16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpminuq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x3b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pminu_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3b,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminuq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3b,0xd1] -; X86-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x3b,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminuq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x3b,0xd1] -; X64-NEXT: vpaddq %zmm3, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pminu.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) { @@ -3098,340 +3417,489 @@ declare <2 x double> @llvm.x86.avx512.mask.move.sd(<2 x double>, <2 x double>, < declare <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovzxbd %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x31,0xc0] +; CHECK-NEXT: ## zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmovzxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovzxbd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x31,0xd0] -; X86-NEXT: ## zmm2 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovzxbd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x31,0xc8] ; X86-NEXT: ## zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; X86-NEXT: vpmovzxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x31,0xc0] -; X86-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovzxbd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x31,0xd0] -; X64-NEXT: ## zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x31,0xc8] ; X64-NEXT: ## zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pmovzxb_d_512(<16 x i8> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_d_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovzxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x31,0xc0] +; X86-NEXT: ## zmm0 {%k1} {z} = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_d_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x31,0xc0] ; X64-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero -; X64-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2) + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovzxbq %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x32,0xc0] +; CHECK-NEXT: ## zmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmovzxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovzxbq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x32,0xd0] -; X86-NEXT: ## zmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxbq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x32,0xc8] ; X86-NEXT: ## zmm1 {%k1} = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpmovzxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x32,0xc0] -; X86-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovzxbq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x32,0xd0] -; X64-NEXT: ## zmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x32,0xc8] ; X64-NEXT: ## zmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pmovzxb_q_512(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_q_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x32,0xc0] +; X86-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_q_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x32,0xc0] ; X64-NEXT: ## zmm0 {%k1} {z} = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,zero,zero,zero,xmm0[5],zero,zero,zero,zero,zero,zero,zero,xmm0[6],zero,zero,zero,zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,zero,zero,zero -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2) + ret <8 x i64> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxd_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovzxdq %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x35,0xc0] +; CHECK-NEXT: ## zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmovzxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovzxdq %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x35,0xd0] -; X86-NEXT: ## zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxdq %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x35,0xc8] ; X86-NEXT: ## zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X86-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x35,0xc0] -; X86-NEXT: ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxd_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovzxdq %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x35,0xd0] -; X64-NEXT: ## zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxdq %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x35,0xc8] ; X64-NEXT: ## zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 
## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pmovzxd_q_512(<8 x i32> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxd_q_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x35,0xc0] +; X86-NEXT: ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxd_q_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x35,0xc0] ; X64-NEXT: ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2) + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxw_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovzxwd %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x33,0xc0] +; CHECK-NEXT: ## zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmovzxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovzxwd %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x33,0xd0] -; X86-NEXT: ## zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovzxwd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x33,0xc8] ; X86-NEXT: ## zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; X86-NEXT: vpmovzxwd %ymm0, 
%zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x33,0xc0] -; X86-NEXT: ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxw_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovzxwd %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x33,0xd0] -; X64-NEXT: ## zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x33,0xc8] ; X64-NEXT: ## zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pmovzxw_d_512(<16 x i16> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxw_d_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x33,0xc0] +; X86-NEXT: ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxw_d_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x33,0xc0] ; X64-NEXT: ## zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; X64-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovzxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16>, <8 x 
i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxw_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovzxwq %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x34,0xc0] +; CHECK-NEXT: ## zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmovzxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovzxwq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x34,0xd0] -; X86-NEXT: ## zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxwq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x34,0xc8] ; X86-NEXT: ## zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X86-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x34,0xc0] -; X86-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxw_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovzxwq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x34,0xd0] -; X64-NEXT: ## zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x34,0xc8] ; X64-NEXT: ## zmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pmovzxw_q_512(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxw_q_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x34,0xc0] +; X86-NEXT: ## zmm0 {%k1} {z} = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxw_q_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x34,0xc0] ; X64-NEXT: ## zmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovzxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovsxbd %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x21,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmovsxb_d_512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovsxbd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x21,0xd0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x21,0xc8] -; X86-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x21,0xc0] -; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovsxbd %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x21,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxbd %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x21,0xc8] -; X64-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x21,0xc0] -; X64-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 %x2) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 
x i8> %x0, <16 x i32> zeroinitializer, i16 %x2) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> %x1, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_pmovsxb_d_512(<16 x i8> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_d_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x21,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_d_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxbd %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x21,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxb.d.512(<16 x i8> %x0, <16 x i32> zeroinitializer, i16 %x2) + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovsxbq %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x22,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmovsxb_q_512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovsxbq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x22,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x22,0xc8] -; X86-NEXT: vpmovsxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x22,0xc0] -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovsxbq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x22,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxbq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x22,0xc8] -; X64-NEXT: vpmovsxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x22,0xc0] -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> %x1, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + ret <8 x i64> %res +} + +define <8 x 
i64>@test_int_x86_avx512_maskz_pmovsxb_q_512(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_q_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x22,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_q_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxbq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x22,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxb.q.512(<16 x i8> %x0, <8 x i64> zeroinitializer, i8 %x2) + ret <8 x i64> %res } declare <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxd_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovsxdq %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x25,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmovsxd_q_512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovsxdq %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x25,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x25,0xc8] -; X86-NEXT: vpmovsxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x25,0xc0] -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxd_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovsxdq %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x25,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxdq %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x25,0xc8] -; X64-NEXT: vpmovsxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x25,0xc0] -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> %x1, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pmovsxd_q_512(<8 x i32> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxd_q_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: 
[0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x25,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxd_q_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxdq %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x25,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxd.q.512(<8 x i32> %x0, <8 x i64> zeroinitializer, i8 %x2) + ret <8 x i64> %res } - declare <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxw_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovsxwd %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmovsxw_d_512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovsxwd %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xd0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x23,0xc8] -; X86-NEXT: vpmovsxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x23,0xc0] -; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_d_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovsxwd %ymm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x23,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxwd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x23,0xc8] -; X64-NEXT: vpmovsxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x23,0xc0] -; X64-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 %x2) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> %x1, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 + ret <16 x i32> %res } +define <16 x i32>@test_int_x86_avx512_maskz_pmovsxw_d_512(<16 x i16> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxw_d_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovsxwd %ymm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x23,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxw_d_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxwd %ymm0, %zmm0 {%k1} {z} ## encoding: 
[0x62,0xf2,0x7d,0xc9,0x23,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmovsxw.d.512(<16 x i16> %x0, <16 x i32> zeroinitializer, i16 %x2) + ret <16 x i32> %res +} declare <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxw_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovsxwq %xmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x24,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pmovsxw_q_512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovsxwq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x24,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x24,0xc8] -; X86-NEXT: vpmovsxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x24,0xc0] -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_q_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovsxwq %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x24,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxwq %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x24,0xc8] -; X64-NEXT: vpmovsxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x24,0xc0] -; X64-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc2] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 %x2) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> %x1, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_pmovsxw_q_512(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxw_q_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x24,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxw_q_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxwq %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x24,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pmovsxw.q.512(<8 x i16> %x0, <8 x i64> zeroinitializer, i8 %x2) + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.prolv.d.512(<16 x 
i32>, <16 x i32>) @@ -4464,52 +4932,62 @@ define <8 x i64> @test_x86_avx512_psrlv_q_memop(<8 x i64> %a0, <8 x i64>* %ptr) declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_dq2pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vcvtdq2pd %ymm0, %zmm0 ## encoding: [0x62,0xf1,0x7e,0x48,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_cvt_dq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512: ; X86: ## %bb.0: -; X86-NEXT: vcvtdq2pd %ymm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0xe6,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtdq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0xe6,0xc8] -; X86-NEXT: vaddpd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc2] +; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_512: ; X64: ## %bb.0: -; X64-NEXT: vcvtdq2pd %ymm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0xe6,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0xe6,0xc8] -; X64-NEXT: vaddpd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc2] +; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) - %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 + ret <8 x double> %res } declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_udq2pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vcvtudq2pd %ymm0, %zmm0 ## encoding: [0x62,0xf1,0x7e,0x48,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_cvt_udq2pd_512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512: ; X86: ## %bb.0: -; X86-NEXT: vcvtudq2pd %ymm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0x7a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x7a,0xc8] -; X86-NEXT: vaddpd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc2] +; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_512: ; X64: ## %bb.0: -; X64-NEXT: vcvtudq2pd %ymm0, %zmm2 ## encoding: [0x62,0xf1,0x7e,0x48,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: 
vcvtudq2pd %ymm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x49,0x7a,0xc8] -; X64-NEXT: vaddpd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc2] +; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 %x2) - %res1 = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %x0, <8 x double> %x1, i8 -1) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 + ret <8 x double> %res } define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) { @@ -4637,63 +5115,95 @@ declare <16 x i32> @llvm.x86.avx512.mask.valign.d.512(<16 x i32>, <16 x i32>, i3 declare <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0xfd,0x48,0x0d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512: ; X86: ## %bb.0: -; X86-NEXT: vpermilpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x0d,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x0d,0xd1] -; X86-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x0d,0xc1] -; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] -; X86-NEXT: vaddpd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0x58,0xc0] +; X86-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vpermilpd %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x0d,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x0d,0xd1] -; X64-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x0d,0xc1] -; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] -; X64-NEXT: vaddpd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0x58,0xc0] +; X64-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) - %res2 = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res2, %res3 - ret <8 x double> %res4 + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_vpermilvar_pd_512(<8 x double> %x0, <8 x i64> 
%x1, <8 x double> %x2, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermilvar_pd_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x0d,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermilvar_pd_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermilpd %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x0d,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.vpermilvar.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) + ret <8 x double> %res } declare <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermilps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vpermilps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0xd1] -; X86-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0xc1] -; X86-NEXT: vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0] -; X86-NEXT: vaddps %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x64,0x48,0x58,0xc0] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vpermilps %zmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0xd9] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0xd1] -; X64-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0xc1] -; X64-NEXT: vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0] -; X64-NEXT: vaddps %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x64,0x48,0x58,0xc0] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) - %res2 = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res2, %res3 - ret <16 x float> %res4 + ret <16 x float> %res +} + + +define <16 x 
float>@test_int_x86_avx512_maskz_vpermilvar_ps_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermilvar_ps_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermilvar_ps_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermilps %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) + ret <16 x float> %res } ; Test case to make sure we can print shuffle decode comments for constant pool loads. @@ -4703,14 +5213,14 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; X86-NEXT: ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0x15,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: LCPI216_0, kind: FK_Data_4 +; X86-NEXT: ## fixup A - offset: 6, value: LCPI291_0, kind: FK_Data_4 ; X86-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; X86-NEXT: ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0x0d,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: LCPI216_1, kind: FK_Data_4 +; X86-NEXT: ## fixup A - offset: 6, value: LCPI291_1, kind: FK_Data_4 ; X86-NEXT: vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] ; X86-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; X86-NEXT: ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0x05,A,A,A,A] -; X86-NEXT: ## fixup A - offset: 6, value: LCPI216_2, kind: FK_Data_4 +; X86-NEXT: ## fixup A - offset: 6, value: LCPI291_2, kind: FK_Data_4 ; X86-NEXT: vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -4719,14 +5229,14 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; X64-NEXT: ## encoding: [0x62,0xf2,0x7d,0x49,0x0c,0x15,A,A,A,A] -; X64-NEXT: ## fixup A - offset: 6, value: LCPI216_0-4, kind: reloc_riprel_4byte +; X64-NEXT: ## fixup A - offset: 6, value: LCPI291_0-4, kind: reloc_riprel_4byte ; X64-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] ; X64-NEXT: ## encoding: [0x62,0xf2,0x7d,0xc9,0x0c,0x0d,A,A,A,A] -; X64-NEXT: ## fixup A - offset: 6, value: LCPI216_1-4, kind: reloc_riprel_4byte +; X64-NEXT: ## fixup A - offset: 6, value: LCPI291_1-4, kind: reloc_riprel_4byte ; X64-NEXT: vaddps %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] ; X64-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; X64-NEXT: ## encoding: [0x62,0xf2,0x7d,0x48,0x0c,0x05,A,A,A,A] -; X64-NEXT: ## fixup A - offset: 6, value: LCPI216_2-4, kind: reloc_riprel_4byte +; X64-NEXT: ## fixup A - offset: 6, value: LCPI291_2-4, kind: reloc_riprel_4byte ; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> 
@llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 %x3) @@ -5153,124 +5663,186 @@ declare <4 x double> @llvm.x86.avx512.mask.vextractf64x4.512(<8 x double>, i32, declare <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float>, <4 x float>, i32, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_insertf32x4_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, <16 x float> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_insertf32x4_512: ; X86: ## %bb.0: -; X86-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x18,0xd9,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x18,0xd1,0x01] -; X86-NEXT: vaddps %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xd3] -; X86-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x18,0xc1,0x01] -; X86-NEXT: vaddps %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc2] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_insertf32x4_512: ; X64: ## %bb.0: -; X64-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x18,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x18,0xd1,0x01] -; X64-NEXT: vaddps %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xd3] -; X64-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x18,0xc1,0x01] -; X64-NEXT: vaddps %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc2] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) - %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res2, %res3 - ret <16 x float> %res4 + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_insertf32x4_512(<16 x float> %x0, <4 x float> %x1, i16 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_insertf32x4_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x18,0xc1,0x01] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_insertf32x4_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: 
vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x18,0xc1,0x01] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) + ret <16 x float> %res } declare <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32>, <4 x i32>, i32, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) { +; CHECK-LABEL: test_int_x86_avx512_inserti32x4_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, <16 x i32> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_inserti32x4_512: ; X86: ## %bb.0: -; X86-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x38,0xd9,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x38,0xd1,0x01] -; X86-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x38,0xc1,0x01] -; X86-NEXT: vpaddd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0] -; X86-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_inserti32x4_512: ; X64: ## %bb.0: -; X64-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0x7d,0x48,0x38,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x49,0x38,0xd1,0x01] -; X64-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x38,0xc1,0x01] -; X64-NEXT: vpaddd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0] -; X64-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res2, %res3 - ret <16 x i32> %res4 + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_inserti32x4_512(<16 x i32> %x0, <4 x i32> %x1, i16 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_inserti32x4_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x38,0xc1,0x01] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_inserti32x4_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## 
encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xc9,0x38,0xc1,0x01] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) + ret <16 x i32> %res } declare <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double>, <4 x double>, i32, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_insertf64x4_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, <8 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_insertf64x4_512: ; X86: ## %bb.0: -; X86-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd9,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xd1,0x01] -; X86-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3] -; X86-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc1,0x01] -; X86-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2] +; X86-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_insertf64x4_512: ; X64: ## %bb.0: -; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xd1,0x01] -; X64-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xd3] -; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc1,0x01] -; X64-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc2] +; X64-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res2, %res3 - ret <8 x double> %res4 + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_insertf64x4_512(<8 x double> %x0, <4 x double> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_insertf64x4_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, 
%zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc1,0x01] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_insertf64x4_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc1,0x01] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) + ret <8 x double> %res } declare <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64>, <4 x i64>, i32, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_inserti64x4_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, <8 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_inserti64x4_512: ; X86: ## %bb.0: -; X86-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd9,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xd1,0x01] -; X86-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc1,0x01] -; X86-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0xd4,0xc0] -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_inserti64x4_512: ; X64: ## %bb.0: -; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xd1,0x01] -; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc1,0x01] -; X64-NEXT: vpaddq %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0xe5,0x48,0xd4,0xc0] -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res2, %res3 - ret <8 x i64> %res4 + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_inserti64x4_512(<8 x i64> %x0, <4 x i64> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_inserti64x4_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: 
[0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc1,0x01] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_inserti64x4_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc1,0x01] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) + ret <8 x i64> %res } define <8 x i64> @test_x86_avx512_movntdqa(i8* %a0) { @@ -5817,38 +6389,59 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x4_512_load(<4 x float> ret <16 x float> %res } -declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8) - -define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) { -; X86-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512: +declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double>, <8 x double>, i8) + +define <8 x double>@test_int_x86_avx512_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcastf64x4_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512(<4 x double> %x0, <8 x double> %x2, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512: +; X86: ## %bb.0: +; X86-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xc8,0x01] +; X86-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512: +; X64: ## %bb.0: +; X64-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xc8,0x01] +; X64-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_broadcastf64x4_512(<4 x double> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcastf64x4_512: ; X86: ## %bb.0: ; X86-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xc8,0x01] -; X86-NEXT: vaddpd %zmm1, %zmm2, 
%zmm1 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9] ; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc0,0x01] -; X86-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_broadcastf64x4_512: +; X64-LABEL: test_int_x86_avx512_maskz_broadcastf64x4_512: ; X64: ## %bb.0: ; X64-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xc8,0x01] -; X64-NEXT: vaddpd %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9] ; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc0,0x01] -; X64-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] - %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> %x2, i8 %mask) - %res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask) - %res4 = fadd <8 x double> %res1, %res2 - %res5 = fadd <8 x double> %res3, %res4 - ret <8 x double> %res5 + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x4.512(<4 x double> %x0, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res } define <8 x double>@test_int_x86_avx512_mask_broadcastf64x4_512_load(<4 x double>* %x0ptr, <8 x double> %x2, i8 %mask) { @@ -5931,36 +6524,57 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x4_512_load(<4 x i32>* %x declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcasti64x4_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 ## encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512: ; X86: ## %bb.0: ; X86-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xc8,0x01] -; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc0,0x01] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcasti64x4_512: ; X64: ## %bb.0: ; X64-NEXT: 
## kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 ## encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xc8,0x01] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_broadcasti64x4_512(<4 x i64> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcasti64x4_512: +; X86: ## %bb.0: +; X86-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc0,0x01] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcasti64x4_512: +; X64: ## %bb.0: +; X64-NEXT: ## kill: def $ymm0 killed $ymm0 def $zmm0 +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc0,0x01] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] ; X64-NEXT: retq ## encoding: [0xc3] - %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 -1) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> %x2, i8 %mask) - %res3 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) - %res4 = add <8 x i64> %res1, %res2 - %res5 = add <8 x i64> %res3, %res4 - ret <8 x i64> %res5 + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x4.512(<4 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res } define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x4_512_load(<4 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) { @@ -6768,150 +7382,213 @@ declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly declare <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double>, <8 x i64>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_df_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_df_512: ; X86: ## %bb.0: -; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x16,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x16,0xd0] -; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x16,0xc0] 
-; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] -; X86-NEXT: vaddpd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc3] +; X86-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_df_512: ; X64: ## %bb.0: -; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x16,0xd8] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x16,0xd0] -; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x16,0xc0] -; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] -; X64-NEXT: vaddpd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc3] +; X64-NEXT: vmovapd %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) - %res2 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_df_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x16,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_df_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x16,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3) + ret <8 x double> %res } declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_di_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_di_512: ; X86: ## %bb.0: -; X86-NEXT: vpermq %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x36,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x36,0xd0] -; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x36,0xc0] -; X86-NEXT: vpaddq 
%zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3] -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_di_512: ; X64: ## %bb.0: -; X64-NEXT: vpermq %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x36,0xd8] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x49,0x36,0xd0] -; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x36,0xc0] -; X64-NEXT: vpaddq %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3] -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res3, %res2 - ret <8 x i64> %res4 + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_di_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x36,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_di_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x36,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3) + ret <8 x i64> %res } declare <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float>, <16 x i32>, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_sf_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_sf_512: ; X86: ## %bb.0: -; X86-NEXT: vpermps %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x16,0xd8] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x16,0xd0] -; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x16,0xc0] -; X86-NEXT: vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0] -; X86-NEXT: vaddps %zmm3, 
%zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc3] +; X86-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_sf_512: ; X64: ## %bb.0: -; X64-NEXT: vpermps %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x16,0xd8] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x16,0xd0] -; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x16,0xc0] -; X64-NEXT: vaddps %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0] -; X64-NEXT: vaddps %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc3] +; X64-NEXT: vmovaps %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) - %res2 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res3, %res2 - ret <16 x float> %res4 + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_sf_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x16,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_sf_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x16,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3) + ret <16 x float> %res } declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_si_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_si_512: ; X86: ## %bb.0: -; X86-NEXT: vpermd %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x36,0xd8] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x36,0xd0] -; X86-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x36,0xc0] -; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] -; X86-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: 
[0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_si_512: ; X64: ## %bb.0: -; X64-NEXT: vpermd %zmm0, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x36,0xd8] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x49,0x36,0xd0] -; X64-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x36,0xc0] -; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] -; X64-NEXT: vpaddd %zmm0, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_si_512: +; X86: ## %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x36,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_si_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x36,0xc0] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3) + ret <16 x i32> %res } declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) +define <16 x i32>@test_int_x86_avx512_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pternlog_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xc2,0x21] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_pternlog_d_512: ; X86: ## %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xda,0x21] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x49,0x25,0xc2,0x21] -; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_d_512: ; X64: ## %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3 ## 
encoding: [0x62,0xf3,0x75,0x48,0x25,0xda,0x21] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x49,0x25,0xc2,0x21] -; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16) @@ -6919,52 +7596,45 @@ declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>, define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_maskz_pternlog_d_512: ; X86: ## %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xda,0x21] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc2,0x21] -; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_pternlog_d_512: ; X64: ## %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0x75,0x48,0x25,0xda,0x21] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xc9,0x25,0xc2,0x21] -; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) +define <8 x i64>@test_int_x86_avx512_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pternlog_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf3,0xf5,0x48,0x25,0xc2,0x21] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_pternlog_q_512: ; X86: ## %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0xf5,0x48,0x25,0xda,0x21] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpternlogq $33, 
%zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x49,0x25,0xc2,0x21] -; X86-NEXT: vpaddq %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pternlog_q_512: ; X64: ## %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf3,0xf5,0x48,0x25,0xda,0x21] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x49,0x25,0xc2,0x21] -; X64-NEXT: vpaddq %zmm3, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8) @@ -6996,110 +7666,123 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6 declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %x2 = load <16 x i32>, <16 x i32>* %x2p + %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermi2d (%eax), %zmm0, %zmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x76,0x18] -; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x7e,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0] +; X86-NEXT: vpermi2d (%eax), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x76,0x08] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermi2d (%rdi), %zmm0, %zmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x76,0x1f] -; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x7e,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm3, %zmm0 ## encoding: [0x62,0xf1,0x65,0x48,0xfe,0xc0] +; X64-NEXT: vpermi2d (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x76,0x0f] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %x2 = load <16 x i32>, <16 x i32>* 
%x2p %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_pd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512: ; X86: ## %bb.0: -; X86-NEXT: vmovapd %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xd8] -; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x77,0xca] -; X86-NEXT: vaddpd %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc3] +; X86-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512: ; X64: ## %bb.0: -; X64-NEXT: vmovapd %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xd8] -; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x77,0xca] -; X64-NEXT: vaddpd %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0x58,0xc3] +; X64-NEXT: vmovapd %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 + ret <8 x double> %res } declare <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_ps_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vmovaps %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd8] -; X86-NEXT: vpermt2ps %zmm2, 
%zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x7f,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x77,0xca] -; X86-NEXT: vaddps %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc3] +; X86-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vmovaps %zmm0, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd8] -; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0x75,0x48,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x77,0xca] -; X64-NEXT: vaddps %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc3] +; X64-NEXT: vmovaps %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 + ret <16 x float> %res } declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_q_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512: ; X86: ## %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x76,0xca] -; X86-NEXT: vpaddq %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc3] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512: ; X64: ## %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 ## encoding: [0x62,0xf2,0xf5,0x48,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x76,0xca] -; X64-NEXT: vpaddq %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc3] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 
-1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) @@ -7109,25 +7792,17 @@ define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqa64 %zmm1, %zmm2 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd1] -; X86-NEXT: vpermt2d (%eax), %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7e,0x10] -; X86-NEXT: vpermt2d %zmm1, %zmm0, %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x7e,0xc9] -; X86-NEXT: vpaddd %zmm1, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc1] +; X86-NEXT: vpermi2d (%eax), %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x76,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vmovdqa64 %zmm1, %zmm2 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd1] -; X64-NEXT: vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7e,0x17] -; X64-NEXT: vpermt2d %zmm1, %zmm0, %zmm1 ## encoding: [0x62,0xf2,0x7d,0x48,0x7e,0xc9] -; X64-NEXT: vpaddd %zmm1, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc1] +; X64-NEXT: vpermi2d (%rdi), %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x76,0x07] ; X64-NEXT: retq ## encoding: [0xc3] %x2 = load <16 x i32>, <16 x i32>* %x2p %res = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x1, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x double>, <8 x double>, i8) @@ -7138,27 +7813,19 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, < ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] ; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] -; X86-NEXT: vmovapd %zmm1, %zmm2 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xd1] -; X86-NEXT: vpermt2pd (%eax){1to8}, %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x7f,0x10] -; X86-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x7f,0xc9] -; X86-NEXT: vaddpd %zmm1, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc1] +; X86-NEXT: vpermi2pd (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xd9,0x77,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512: ; X64: ## %bb.0: ; X64-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] -; X64-NEXT: vmovapd %zmm1, %zmm2 ## encoding: [0x62,0xf1,0xfd,0x48,0x28,0xd1] -; X64-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x7f,0x17] -; X64-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x7f,0xc9] -; X64-NEXT: vaddpd %zmm1, %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xed,0x48,0x58,0xc1] +; X64-NEXT: vpermi2pd (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xd9,0x77,0x07] ; X64-NEXT: retq ## encoding: [0xc3] %x2s = load double, double* %x2ptr %x2ins 
= insertelement <8 x double> undef, double %x2s, i32 0 %x2 = shufflevector <8 x double> %x2ins, <8 x double> undef, <8 x i32> zeroinitializer %res = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x2, i8 %x3) - %res1 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> %x0, <8 x double> %x1, <8 x double> %x1, i8 -1) - %res2 = fadd <8 x double> %res, %res1 - ret <8 x double> %res2 + ret <8 x double> %res } declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16) @@ -7166,25 +7833,17 @@ declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512: ; X86: ## %bb.0: -; X86-NEXT: vmovaps %zmm1, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd9] -; X86-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x7f,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7f,0xca] -; X86-NEXT: vaddps %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc3] +; X86-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x77,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512: ; X64: ## %bb.0: -; X64-NEXT: vmovaps %zmm1, %zmm3 ## encoding: [0x62,0xf1,0x7c,0x48,0x28,0xd9] -; X64-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x7f,0xca] -; X64-NEXT: vaddps %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x74,0x48,0x58,0xc3] +; X64-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xc9,0x77,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1) - %res2 = fadd <16 x float> %res, %res1 - ret <16 x float> %res2 + ret <16 x float> %res } @@ -7193,52 +7852,47 @@ declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>, define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512: ; X86: ## %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x7e,0xca] -; X86-NEXT: vpaddq %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc3] +; X86-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x76,0xc2] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512: ; X64: ## %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2q %zmm2, %zmm0, 
%zmm3 ## encoding: [0x62,0xf2,0xfd,0x48,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x7e,0xca] -; X64-NEXT: vpaddq %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc3] +; X64-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xc9,0x76,0xc2] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) - %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) - %res2 = add <8 x i64> %res, %res1 - ret <8 x i64> %res2 + ret <8 x i64> %res } declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_d_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512: ; X86: ## %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x7e,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ## encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x7e,0xca] -; X86-NEXT: vpaddd %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512: ; X64: ## %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ## encoding: [0x62,0xf2,0x7d,0x48,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x7e,0xca] -; X64-NEXT: vpaddd %zmm3, %zmm1, %zmm0 ## encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq ## encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <16 x float> @llvm.x86.avx512.mask.sub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32) @@ -9740,33 +10394,49 @@ define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ss_rm(<4 x float> %x0, <4 x declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmov_qd_512: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpmovqd %zmm0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x48,0x35,0xc0] +; CHECK-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call 
<8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_512: ; X86: ## %bb.0: -; X86-NEXT: vpmovqd %zmm0, %ymm2 ## encoding: [0x62,0xf2,0x7e,0x48,0x35,0xc2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovqd %zmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x49,0x35,0xc1] -; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xc9,0x35,0xc0] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0xc5,0xf5,0xfe,0xc0] -; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0xc5,0xed,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 ## encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_512: ; X64: ## %bb.0: -; X64-NEXT: vpmovqd %zmm0, %ymm2 ## encoding: [0x62,0xf2,0x7e,0x48,0x35,0xc2] ; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqd %zmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x49,0x35,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 ## encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmov_qd_512: +; X86: ## %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xc9,0x35,0xc0] +; X86-NEXT: retl ## encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmov_qd_512: +; X64: ## %bb.0: +; X64-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xc9,0x35,0xc0] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## encoding: [0xc5,0xf5,0xfe,0xc0] -; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ## encoding: [0xc5,0xed,0xfe,0xc0] -; X64-NEXT: retq ## encoding: [0xc3] - %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) - %res3 = add <8 x i32> %res0, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 +; X64-NEXT: retq ## encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res } declare <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32) diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index 8844c0e5e95f1..07278bde99c22 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -2922,160 +2922,166 @@ declare <4 x float> @llvm.x86.avx512.cvtusi2ss(<4 x float>, i32, i32) nounwind r declare <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>) -define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, <16 x i32> %x4, i16 %x3) { +define <16 x 
i32>@test_int_x86_avx512_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p) { +; X64-LABEL: test_int_x86_avx512_vpermi2var_d_512: +; X64: # %bb.0: +; X64-NEXT: vpermt2d (%rdi), %zmm1, %zmm0 +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_vpermi2var_d_512: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: vpermt2d (%eax), %zmm1, %zmm0 +; X86-NEXT: retl + %x2 = load <16 x i32>, <16 x i32>* %x2p + %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) + ret <16 x i32> %1 +} + +define <16 x i32>@test_int_x86_avx512_mask_vpermi2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 -; X64-NEXT: vpermi2d (%rdi), %zmm0, %zmm3 {%k1} -; X64-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 -; X64-NEXT: vpaddd %zmm0, %zmm3, %zmm0 +; X64-NEXT: vpermi2d (%rdi), %zmm0, %zmm1 {%k1} +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 -; X86-NEXT: vpermi2d (%eax), %zmm0, %zmm3 {%k1} -; X86-NEXT: vpermt2d %zmm2, %zmm1, %zmm0 -; X86-NEXT: vpaddd %zmm0, %zmm3, %zmm0 +; X86-NEXT: vpermi2d (%eax), %zmm0, %zmm1 {%k1} +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl %x2 = load <16 x i32>, <16 x i32>* %x2p %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1 - %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x4) - %res2 = add <16 x i32> %3, %4 - ret <16 x i32> %res2 + ret <16 x i32> %3 } declare <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double>, <8 x i64>, <8 x double>) +define <8 x double>@test_int_x86_avx512_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_pd_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2pd %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) + ret <8 x double> %1 +} + define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512: ; X64: # %bb.0: -; X64-NEXT: vmovapd %zmm0, %zmm3 -; X64-NEXT: vpermt2pd %zmm2, %zmm1, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1} -; X64-NEXT: vaddpd %zmm3, %zmm1, %zmm0 +; X64-NEXT: vmovapd %zmm1, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_512: ; X86: # %bb.0: -; X86-NEXT: vmovapd %zmm0, %zmm3 -; X86-NEXT: vpermt2pd %zmm2, %zmm1, %zmm3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpermi2pd %zmm2, %zmm0, %zmm1 {%k1} -; X86-NEXT: vaddpd %zmm3, %zmm1, %zmm0 +; X86-NEXT: vmovapd %zmm1, %zmm0 ; X86-NEXT: retl %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) %2 = bitcast <8 x i64> %x1 to <8 x double> %3 = bitcast i8 %x3 to <8 x i1> %4 = select <8 x i1> %3, <8 x double> %1, <8 x double> %2 - %5 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2) - %6 = 
bitcast <8 x i64> %x1 to <8 x double> - %res2 = fadd <8 x double> %4, %5 - ret <8 x double> %res2 + ret <8 x double> %4 } declare <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float>, <16 x i32>, <16 x float>) +define <16 x float>@test_int_x86_avx512_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_ps_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) + ret <16 x float> %1 +} + define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512: ; X64: # %bb.0: -; X64-NEXT: vmovaps %zmm0, %zmm3 -; X64-NEXT: vpermt2ps %zmm2, %zmm1, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1} -; X64-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; X64-NEXT: vmovaps %zmm1, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_512: ; X86: # %bb.0: -; X86-NEXT: vmovaps %zmm0, %zmm3 -; X86-NEXT: vpermt2ps %zmm2, %zmm1, %zmm3 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermi2ps %zmm2, %zmm0, %zmm1 {%k1} -; X86-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; X86-NEXT: vmovaps %zmm1, %zmm0 ; X86-NEXT: retl %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) %2 = bitcast <16 x i32> %x1 to <16 x float> %3 = bitcast i16 %x3 to <16 x i1> %4 = select <16 x i1> %3, <16 x float> %1, <16 x float> %2 - %5 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2) - %6 = bitcast <16 x i32> %x1 to <16 x float> - %res2 = fadd <16 x float> %4, %5 - ret <16 x float> %res2 + ret <16 x float> %4 } declare <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>) +define <8 x i64>@test_int_x86_avx512_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_q_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2q %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) + ret <8 x i64> %1 +} + define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 -; X64-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1} -; X64-NEXT: vpaddq %zmm3, %zmm1, %zmm0 +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 -; X86-NEXT: vpermt2q %zmm2, %zmm1, %zmm3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpermi2q %zmm2, %zmm0, %zmm1 {%k1} -; X86-NEXT: vpaddq %zmm3, %zmm1, %zmm0 +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x1 - %4 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) - %res2 = add <8 x i64> %3, %4 - ret <8 x i64> %res2 + ret <8 x i64> %3 } define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 
x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) { ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 -; X64-NEXT: vmovdqa64 %zmm1, %zmm2 -; X64-NEXT: vpermt2d (%rdi), %zmm0, %zmm2 {%k1} {z} -; X64-NEXT: vpermt2d %zmm1, %zmm0, %zmm1 -; X64-NEXT: vpaddd %zmm1, %zmm2, %zmm0 +; X64-NEXT: vpermi2d (%rdi), %zmm1, %zmm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vmovdqa64 %zmm1, %zmm2 -; X86-NEXT: vpermt2d (%eax), %zmm0, %zmm2 {%k1} {z} -; X86-NEXT: vpermt2d %zmm1, %zmm0, %zmm1 -; X86-NEXT: vpaddd %zmm1, %zmm2, %zmm0 +; X86-NEXT: vpermi2d (%eax), %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: retl %x2 = load <16 x i32>, <16 x i32>* %x2p %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer - %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x1) - %res2 = add <16 x i32> %3, %4 - ret <16 x i32> %res2 + ret <16 x i32> %3 } define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) { ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %esi, %k1 -; X64-NEXT: vmovapd %zmm1, %zmm2 -; X64-NEXT: vpermt2pd (%rdi){1to8}, %zmm0, %zmm2 {%k1} {z} -; X64-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1 -; X64-NEXT: vaddpd %zmm1, %zmm2, %zmm0 +; X64-NEXT: vpermi2pd (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512: @@ -3083,10 +3089,7 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, < ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: kmovw %ecx, %k1 -; X86-NEXT: vmovapd %zmm1, %zmm2 -; X86-NEXT: vpermt2pd (%eax){1to8}, %zmm0, %zmm2 {%k1} {z} -; X86-NEXT: vpermt2pd %zmm1, %zmm0, %zmm1 -; X86-NEXT: vaddpd %zmm1, %zmm2, %zmm0 +; X86-NEXT: vpermi2pd (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: retl %x2s = load double, double* %x2ptr %x2ins = insertelement <8 x double> undef, double %x2s, i32 0 @@ -3094,88 +3097,73 @@ define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, < %1 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer - %4 = call <8 x double> @llvm.x86.avx512.vpermi2var.pd.512(<8 x double> %x1, <8 x i64> %x0, <8 x double> %x1) - %res2 = fadd <8 x double> %3, %4 - ret <8 x double> %res2 + ret <8 x double> %3 } define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) { ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512: ; X64: # %bb.0: -; X64-NEXT: vmovaps %zmm1, %zmm3 -; X64-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z} -; X64-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; X64-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512: ; X86: # %bb.0: -; X86-NEXT: vmovaps %zmm1, %zmm3 -; X86-NEXT: vpermt2ps %zmm2, %zmm0, %zmm3 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z} 
-; X86-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; X86-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: retl %1 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer - %4 = call <16 x float> @llvm.x86.avx512.vpermi2var.ps.512(<16 x float> %x1, <16 x i32> %x0, <16 x float> %x2) - %res2 = fadd <16 x float> %3, %4 - ret <16 x float> %res2 + ret <16 x float> %3 } define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 -; X64-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z} -; X64-NEXT: vpaddq %zmm3, %zmm1, %zmm0 +; X64-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 {%k1} {z} ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 -; X86-NEXT: vpermt2q %zmm2, %zmm0, %zmm3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 -; X86-NEXT: vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z} -; X86-NEXT: vpaddq %zmm3, %zmm1, %zmm0 +; X86-NEXT: vpermi2q %zmm2, %zmm1, %zmm0 {%k1} {z} ; X86-NEXT: retl %1 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer - %4 = call <8 x i64> @llvm.x86.avx512.vpermi2var.q.512(<8 x i64> %x1, <8 x i64> %x0, <8 x i64> %x2) - %res2 = add <8 x i64> %3, %4 - ret <8 x i64> %res2 + ret <8 x i64> %3 +} + +define <16 x i32>@test_int_x86_avx512_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_d_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) + ret <16 x i32> %1 } define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 -; X64-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1} -; X64-NEXT: vpaddd %zmm3, %zmm1, %zmm0 +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 -; X86-NEXT: vpermt2d %zmm2, %zmm0, %zmm3 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 {%k1} -; X86-NEXT: vpaddd %zmm3, %zmm1, %zmm0 +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ; X86-NEXT: retl %1 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x1 - %4 = call <16 x i32> @llvm.x86.avx512.vpermi2var.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2) - %res2 = add <16 x i32> %3, %4 - ret <16 x i32> %res2 + ret <16 x i32> %3 } declare <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double>, <8 x double>, <8 x double>, i8, i32) @@ -3573,37 +3561,53 @@ define void @test_int_x86_avx512_mask_pmovus_qw_mem_512(i8* %ptr, <8 x i64> %x1, ret void } +define <8 x i32>@test_int_x86_avx512_pmov_qd_512(<8 x i64> %x0, <8 x 
i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmov_qd_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovqd %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = trunc <8 x i64> %x0 to <8 x i32> + ret <8 x i32> %1 +} + define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_512: ; X64: # %bb.0: -; X64-NEXT: vpmovqd %zmm0, %ymm2 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpmovqd %zmm0, %ymm1 {%k1} -; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 +; X64-NEXT: vmovdqa %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_512: ; X86: # %bb.0: -; X86-NEXT: vpmovqd %zmm0, %ymm2 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpmovqd %zmm0, %ymm1 {%k1} +; X86-NEXT: vmovdqa %ymm1, %ymm0 +; X86-NEXT: retl + %1 = trunc <8 x i64> %x0 to <8 x i32> + %2 = bitcast i8 %x2 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1 + ret <8 x i32> %3 +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmov_qd_512(<8 x i64> %x0, i8 %x2) { +; X64-LABEL: test_int_x86_avx512_maskz_pmov_qd_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_maskz_pmov_qd_512: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpmovqd %zmm0, %ymm0 {%k1} {z} -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 -; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0 ; X86-NEXT: retl %1 = trunc <8 x i64> %x0 to <8 x i32> - %2 = trunc <8 x i64> %x0 to <8 x i32> - %3 = bitcast i8 %x2 to <8 x i1> - %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %x1 - %5 = trunc <8 x i64> %x0 to <8 x i32> - %6 = bitcast i8 %x2 to <8 x i1> - %7 = select <8 x i1> %6, <8 x i32> %5, <8 x i32> zeroinitializer - %res3 = add <8 x i32> %1, %4 - %res4 = add <8 x i32> %res3, %7 - ret <8 x i32> %res4 + %2 = bitcast i8 %x2 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer + ret <8 x i32> %3 } declare void @llvm.x86.avx512.mask.pmov.qd.mem.512(i8* %ptr, <8 x i64>, i8) @@ -3633,15 +3637,21 @@ define void @test_int_x86_avx512_mask_pmov_qd_mem_512(i8* %ptr, <8 x i64> %x1, i declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovs_qd_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsqd %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qd_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z} ; X64-NEXT: vpmovsqd %zmm0, %ymm1 {%k1} -; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; X64-NEXT: vpmovsqd %zmm0, %ymm0 -; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X64-NEXT: vmovdqa %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qd_512: @@ -3649,17 +3659,27 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32> ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpmovsqd %zmm0, %ymm1 {%k1} -; X86-NEXT: vpmovsqd %zmm0, %ymm2 {%k1} {z} -; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; X86-NEXT: vpmovsqd %zmm0, %ymm0 -; X86-NEXT: vpaddd 
%ymm1, %ymm0, %ymm0 +; X86-NEXT: vmovdqa %ymm1, %ymm0 ; X86-NEXT: retl - %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) - %res3 = add <8 x i32> %res0, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmovs_qd_512(<8 x i64> %x0, i8 %x2) { +; X64-LABEL: test_int_x86_avx512_maskz_pmovs_qd_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmovsqd %zmm0, %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_maskz_pmovs_qd_512: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vpmovsqd %zmm0, %ymm0 {%k1} {z} +; X86-NEXT: retl + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res } declare void @llvm.x86.avx512.mask.pmovs.qd.mem.512(i8* %ptr, <8 x i64>, i8) @@ -3689,15 +3709,21 @@ define void @test_int_x86_avx512_mask_pmovs_qd_mem_512(i8* %ptr, <8 x i64> %x1, declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovus_qd_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovusqd %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) { ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qd_512: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 -; X64-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z} ; X64-NEXT: vpmovusqd %zmm0, %ymm1 {%k1} -; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; X64-NEXT: vpmovusqd %zmm0, %ymm0 -; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X64-NEXT: vmovdqa %ymm1, %ymm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qd_512: @@ -3705,17 +3731,27 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32> ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpmovusqd %zmm0, %ymm1 {%k1} -; X86-NEXT: vpmovusqd %zmm0, %ymm2 {%k1} {z} -; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm1 -; X86-NEXT: vpmovusqd %zmm0, %ymm0 -; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; X86-NEXT: vmovdqa %ymm1, %ymm0 +; X86-NEXT: retl + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmovus_qd_512(<8 x i64> %x0, i8 %x2) { +; X64-LABEL: test_int_x86_avx512_maskz_pmovus_qd_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpmovusqd %zmm0, %ymm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_maskz_pmovus_qd_512: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vpmovusqd %zmm0, %ymm0 {%k1} {z} ; X86-NEXT: retl - %res0 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 -1) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> %x1, i8 %x2) - %res2 = call <8 x i32> 
@llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) - %res3 = add <8 x i32> %res0, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res } declare void @llvm.x86.avx512.mask.pmovus.qd.mem.512(i8* %ptr, <8 x i64>, i8) @@ -5148,140 +5184,202 @@ declare i32 @llvm.x86.avx512.vcomi.ss(<4 x float>, <4 x float>, i32, i32) declare <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double>, <8 x i64>) +define <8 x double>@test_int_x86_avx512_permvar_df_512(<8 x double> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_permvar_df_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) + ret <8 x double> %1 +} + define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_permvar_df_512: ; X64: # %bb.0: -; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} -; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} -; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0 -; X64-NEXT: vaddpd %zmm3, %zmm0, %zmm0 +; X64-NEXT: vmovapd %zmm2, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_permvar_df_512: ; X86: # %bb.0: -; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm2 {%k1} -; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} -; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0 -; X86-NEXT: vaddpd %zmm3, %zmm0, %zmm0 +; X86-NEXT: vmovapd %zmm2, %zmm0 ; X86-NEXT: retl %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> %x2 - %4 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = select <8 x i1> %5, <8 x double> %4, <8 x double> zeroinitializer - %7 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) - %res3 = fadd <8 x double> %3, %6 - %res4 = fadd <8 x double> %res3, %7 - ret <8 x double> %res4 + ret <8 x double> %3 +} + +define <8 x double>@test_int_x86_avx512_maskz_permvar_df_512(<8 x double> %x0, <8 x i64> %x1, i8 %x3) { +; X64-LABEL: test_int_x86_avx512_maskz_permvar_df_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_maskz_permvar_df_512: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: retl + %1 = call <8 x double> @llvm.x86.avx512.permvar.df.512(<8 x double> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x double> %1, <8 x double> zeroinitializer + ret <8 x double> %3 } declare <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64>, <8 x i64>) +define <8 x i64>@test_int_x86_avx512_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_permvar_di_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) + ret <8 x i64> %1 +} + define <8 x 
i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_permvar_di_512: ; X64: # %bb.0: -; X64-NEXT: vpermq %zmm0, %zmm1, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1} -; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} -; X64-NEXT: vpaddq %zmm3, %zmm0, %zmm0 -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_permvar_di_512: ; X86: # %bb.0: -; X86-NEXT: vpermq %zmm0, %zmm1, %zmm3 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vpermq %zmm0, %zmm1, %zmm2 {%k1} -; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} -; X86-NEXT: vpaddq %zmm3, %zmm0, %zmm0 -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ; X86-NEXT: retl %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %x2 - %4 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) - %5 = bitcast i8 %x3 to <8 x i1> - %6 = select <8 x i1> %5, <8 x i64> %4, <8 x i64> zeroinitializer - %7 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) - %res3 = add <8 x i64> %3, %6 - %res4 = add <8 x i64> %res3, %7 - ret <8 x i64> %res4 + ret <8 x i64> %3 +} + +define <8 x i64>@test_int_x86_avx512_maskz_permvar_di_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x3) { +; X64-LABEL: test_int_x86_avx512_maskz_permvar_di_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_maskz_permvar_di_512: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: kmovw %eax, %k1 +; X86-NEXT: vpermq %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: retl + %1 = call <8 x i64> @llvm.x86.avx512.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 } declare <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float>, <16 x i32>) +define <16 x float>@test_int_x86_avx512_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_permvar_sf_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) + ret <16 x float> %1 +} + define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_permvar_sf_512: ; X64: # %bb.0: -; X64-NEXT: vpermps %zmm0, %zmm1, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} -; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} -; X64-NEXT: vaddps %zmm0, %zmm2, %zmm0 -; X64-NEXT: vaddps %zmm3, %zmm0, %zmm0 +; X64-NEXT: vmovaps %zmm2, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_permvar_sf_512: ; X86: # %bb.0: -; X86-NEXT: vpermps %zmm0, %zmm1, %zmm3 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermps %zmm0, %zmm1, %zmm2 {%k1} -; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} -; X86-NEXT: vaddps %zmm0, %zmm2, %zmm0 -; X86-NEXT: vaddps %zmm3, %zmm0, %zmm0 +; X86-NEXT: vmovaps %zmm2, %zmm0 ; X86-NEXT: retl %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 
x i1> %2, <16 x float> %1, <16 x float> %x2 - %4 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x float> %4, <16 x float> zeroinitializer - %7 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) - %res3 = fadd <16 x float> %3, %6 - %res4 = fadd <16 x float> %res3, %7 - ret <16 x float> %res4 + ret <16 x float> %3 +} + +define <16 x float>@test_int_x86_avx512_maskz_permvar_sf_512(<16 x float> %x0, <16 x i32> %x1, i16 %x3) { +; X64-LABEL: test_int_x86_avx512_maskz_permvar_sf_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_maskz_permvar_sf_512: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vpermps %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: retl + %1 = call <16 x float> @llvm.x86.avx512.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x float> %1, <16 x float> zeroinitializer + ret <16 x float> %3 } declare <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32>, <16 x i32>) +define <16 x i32>@test_int_x86_avx512_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_permvar_si_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) + ret <16 x i32> %1 +} + define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3) { ; X64-LABEL: test_int_x86_avx512_mask_permvar_si_512: ; X64: # %bb.0: -; X64-NEXT: vpermd %zmm0, %zmm1, %zmm3 ; X64-NEXT: kmovw %edi, %k1 ; X64-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1} -; X64-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} -; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0 -; X64-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_permvar_si_512: ; X86: # %bb.0: -; X86-NEXT: vpermd %zmm0, %zmm1, %zmm3 ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: vpermd %zmm0, %zmm1, %zmm2 {%k1} -; X86-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} -; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0 -; X86-NEXT: vpaddd %zmm0, %zmm2, %zmm0 +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 ; X86-NEXT: retl %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %x2 - %4 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i32> %4, <16 x i32> zeroinitializer - %7 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) - %res3 = add <16 x i32> %3, %6 - %res4 = add <16 x i32> %res3, %7 - ret <16 x i32> %res4 + ret <16 x i32> %3 +} + +define <16 x i32>@test_int_x86_avx512_maskz_permvar_si_512(<16 x i32> %x0, <16 x i32> %x1, i16 %x3) { +; X64-LABEL: test_int_x86_avx512_maskz_permvar_si_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 +; X64-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} +; X64-NEXT: retq +; +; X86-LABEL: test_int_x86_avx512_maskz_permvar_si_512: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; X86-NEXT: vpermd %zmm0, %zmm1, %zmm0 {%k1} {z} +; X86-NEXT: retl + %1 = call <16 x i32> @llvm.x86.avx512.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1) + %2 = 
bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 } declare <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double>, <8 x double>, <8 x i64>, i32, i8, i32) diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll index 0ca1f7be88529..c87f04fbb841b 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll @@ -46,62 +46,99 @@ define i64@test_int_x86_avx512_kunpck_qd(i64 %x0, i64 %x1) { declare <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8, <64 x i8>, i64) - define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) { +define <64 x i8>@test_int_x86_avx512_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_b_gpr_512: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_b_gpr_512: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastb %edi, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x7a,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1) + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_512(i8 %x0, <64 x i8> %x1, i64 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0x4c,0x24,0x04] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x6f,0xc1] -; X86-NEXT: vmovdqu8 %zmm1, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x6f,0xd1] -; X86-NEXT: vpaddb %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc2] -; X86-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x78,0x44,0x24,0x04] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_512: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastb %edi, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x7a,0xcf] ; X64-NEXT: kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce] ; X64-NEXT: vpbroadcastb %edi, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7a,0xc7] -; X64-NEXT: vpbroadcastb %edi, %zmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7a,0xd7] -; X64-NEXT: vpaddb %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc2] -; X64-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 -1) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask) - %res2 = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res2, %res3 - ret <64 x i8> %res4 - } +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> %x1, i64 %mask) + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_maskz_pbroadcast_b_gpr_512(i8 %x0, i64 %mask) { +; X86-LABEL: 
test_int_x86_avx512_maskz_pbroadcast_b_gpr_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x78,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_b_gpr_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rsi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xce] +; X64-NEXT: vpbroadcastb %edi, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7a,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.512(i8 %x0, <64 x i8> zeroinitializer, i64 %mask) + ret <64 x i8> %res +} declare <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16, <32 x i16>, i32) - define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) { + +define <32 x i16>@test_int_x86_avx512_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_w_gpr_512: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x79,0x44,0x24,0x02] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_w_gpr_512: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw %edi, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x7b,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_512(i16 %x0, <32 x i16> %x1, i32 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x79,0x4c,0x24,0x02] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x49,0x6f,0xc1] -; X86-NEXT: vmovdqu16 %zmm1, %zmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xc9,0x6f,0xd1] -; X86-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X86-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x79,0x44,0x24,0x02] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_512: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw %edi, %zmm1 # encoding: [0x62,0xf2,0x7d,0x48,0x7b,0xcf] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpbroadcastw %edi, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7b,0xc7] -; X64-NEXT: vpbroadcastw %edi, %zmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7b,0xd7] -; X64-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X64-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 -1) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask) - %res2 = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask) - %res3 = add <32 x i16> %res, %res1 - %res4 = add <32 x i16> %res2, %res3 - ret <32 x i16> %res4 - } + %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> %x1, i32 %mask) + ret <32 x i16> %res 
+} + +define <32 x i16>@test_int_x86_avx512_maskz_pbroadcast_w_gpr_512(i16 %x0, i32 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_w_gpr_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x79,0x44,0x24,0x02] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_w_gpr_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpbroadcastw %edi, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7b,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.512(i16 %x0, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res +} declare void @llvm.x86.avx512.mask.storeu.b.512(i8*, <64 x i8>, i64) @@ -283,110 +320,152 @@ define <8 x i64>@test_int_x86_avx512_psrl_load_dq_512(<8 x i64>* %p0) { declare <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8>, <64 x i8>, i32, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3) { +; CHECK-LABEL: test_int_x86_avx512_palignr_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x0f,0xc1,0x02] +; CHECK-NEXT: # zmm0 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_palignr_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x3, i64 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_palignr_512: ; X86: # %bb.0: -; X86-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x0f,0xd9,0x02] -; X86-NEXT: # zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x0f,0xd1,0x02] ; X86-NEXT: # zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] -; X86-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x0f,0xc1,0x02] -; X86-NEXT: # zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] -; X86-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_palignr_512: ; X64: # %bb.0: -; X64-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm3 # encoding: 
[0x62,0xf3,0x7d,0x48,0x0f,0xd9,0x02] -; X64-NEXT: # zmm3 = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x0f,0xd1,0x02] ; X64-NEXT: # zmm2 {%k1} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4) + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_maskz_palignr_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_palignr_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x0f,0xc1,0x02] +; X86-NEXT: # zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_palignr_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpalignr $2, %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x0f,0xc1,0x02] ; X64-NEXT: # zmm0 {%k1} {z} = zmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm0[0,1],zmm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],zmm0[16,17],zmm1[34,35,36,37,38,39,40,41,42,43,44,45,46,47],zmm0[32,33],zmm1[50,51,52,53,54,55,56,57,58,59,60,61,62,63],zmm0[48,49] -; X64-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 %x4) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4) - %res2 = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> %x3, i64 -1) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res3, %res2 - ret <64 x i8> %res4 + %res = call <64 x i8> @llvm.x86.avx512.mask.palignr.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <64 x i8> zeroinitializer, i64 %x4) + ret <64 x i8> %res } declare <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16>, i32, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pshufh_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshufh_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufhw $3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7e,0x48,0x70,0xc0,0x03] +; CHECK-NEXT: # zmm0 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, 
<32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pshufh_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshufh_w_512: ; X86: # %bb.0: -; X86-NEXT: vpshufhw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x7e,0x48,0x70,0xd0,0x03] -; X86-NEXT: # zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpshufhw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x49,0x70,0xc8,0x03] ; X86-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] -; X86-NEXT: vpshufhw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xc9,0x70,0xc0,0x03] -; X86-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] -; X86-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X86-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshufh_w_512: ; X64: # %bb.0: -; X64-NEXT: vpshufhw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x7e,0x48,0x70,0xd0,0x03] -; X64-NEXT: # zmm2 = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpshufhw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x49,0x70,0xc8,0x03] ; X64-NEXT: # zmm1 {%k1} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_maskz_pshufh_w_512(<32 x i16> %x0, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshufh_w_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpshufhw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xc9,0x70,0xc0,0x03] +; X86-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshufh_w_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshufhw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xc9,0x70,0xc0,0x03] ; X64-NEXT: # zmm0 {%k1} {z} = zmm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12,16,17,18,19,23,20,20,20,24,25,26,27,31,28,28,28] -; X64-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X64-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3) - %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1) - %res3 = add <32 x i16> %res, %res1 - %res4 = add <32 x i16> %res3, %res2 - ret <32 x i16> %res4 + %res = call <32 x 
i16> @llvm.x86.avx512.mask.pshufh.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3) + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16>, i32, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pshufl_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshufl_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshuflw $3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7f,0x48,0x70,0xc0,0x03] +; CHECK-NEXT: # zmm0 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pshufl_w_512(<32 x i16> %x0, i32 %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshufl_w_512: ; X86: # %bb.0: -; X86-NEXT: vpshuflw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x7f,0x48,0x70,0xd0,0x03] -; X86-NEXT: # zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpshuflw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x70,0xc8,0x03] ; X86-NEXT: # zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] -; X86-NEXT: vpshuflw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x70,0xc0,0x03] -; X86-NEXT: # zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] -; X86-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X86-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshufl_w_512: ; X64: # %bb.0: -; X64-NEXT: vpshuflw $3, %zmm0, %zmm2 # encoding: [0x62,0xf1,0x7f,0x48,0x70,0xd0,0x03] -; X64-NEXT: # zmm2 = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpshuflw $3, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x49,0x70,0xc8,0x03] ; X64-NEXT: # zmm1 {%k1} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_maskz_pshufl_w_512(<32 x i16> %x0, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshufl_w_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpshuflw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x70,0xc0,0x03] +; X86-NEXT: # zmm0 {%k1} {z} = zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshufl_w_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshuflw $3, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xc9,0x70,0xc0,0x03] ; X64-NEXT: # zmm0 {%k1} {z} = 
zmm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15,19,16,16,16,20,21,22,23,27,24,24,24,28,29,30,31] -; X64-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X64-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3) - %res2 = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 -1) - %res3 = add <32 x i16> %res, %res1 - %res4 = add <32 x i16> %res3, %res2 - ret <32 x i16> %res4 + %res = call <32 x i16> @llvm.x86.avx512.mask.pshufl.w.512(<32 x i16> %x0, i32 3, <32 x i16> zeroinitializer, i32 %x3) + ret <32 x i16> %res } define i64 @test_pcmpeq_b(<64 x i8> %a, <64 x i8> %b) { @@ -547,372 +626,457 @@ declare i32 @llvm.x86.avx512.mask.pcmpgt.w.512(<32 x i16>, <32 x i16>, i32) declare <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhb_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpckhbw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x68,0xc1] +; CHECK-NEXT: # zmm0 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_punpckhb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhb_w_512: ; X86: # %bb.0: -; X86-NEXT: vpunpckhbw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x68,0xd9] -; X86-NEXT: # zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpckhbw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x68,0xd1] ; X86-NEXT: # zmm2 {%k1} = 
zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] -; X86-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhb_w_512: ; X64: # %bb.0: -; X64-NEXT: vpunpckhbw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x68,0xd9] -; X64-NEXT: # zmm3 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpunpckhbw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x68,0xd1] ; X64-NEXT: # zmm2 {%k1} = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63] -; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpckhb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 + ret <64 x i8> %res } declare <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklb_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpcklbw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x60,0xc1] +; CHECK-NEXT: # zmm0 = 
zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_punpcklb_w_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklb_w_512: ; X86: # %bb.0: -; X86-NEXT: vpunpcklbw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x60,0xd9] -; X86-NEXT: # zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpcklbw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x60,0xd1] ; X86-NEXT: # zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] -; X86-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklb_w_512: ; X64: # %bb.0: -; X64-NEXT: vpunpcklbw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x60,0xd9] -; X64-NEXT: # zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpunpcklbw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x60,0xd1] ; X64-NEXT: # zmm2 {%k1} = 
zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55] -; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.punpcklb.w.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 + ret <64 x i8> %res } declare <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhw_d_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpckhwd %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x69,0xc1] +; CHECK-NEXT: # zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_punpckhw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhw_d_512: ; X86: # %bb.0: -; X86-NEXT: vpunpckhwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x69,0xd9] -; X86-NEXT: # zmm3 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpckhwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x69,0xd1] ; X86-NEXT: # zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhw_d_512: ; X64: # %bb.0: -; X64-NEXT: vpunpckhwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x69,0xd9] -; X64-NEXT: # zmm3 = 
zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpckhwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x69,0xd1] ; X64-NEXT: # zmm2 {%k1} = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpckhw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklw_d_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpcklwd %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0x61,0xc1] +; CHECK-NEXT: # zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_punpcklw_d_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklw_d_512: ; X86: # %bb.0: -; X86-NEXT: vpunpcklwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x61,0xd9] -; X86-NEXT: # zmm3 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpcklwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x61,0xd1] ; X86-NEXT: # zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklw_d_512: ; X64: # %bb.0: -; X64-NEXT: vpunpcklwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0x61,0xd9] -; X64-NEXT: # zmm3 
= zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpcklwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x61,0xd1] ; X64-NEXT: # zmm2 {%k1} = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.punpcklw.d.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_pmaxs_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_b_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxsb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x3c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_pmaxs_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_b_512: ; X86: # %bb.0: -; X86-NEXT: vpmaxsb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3c,0xd9] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3c,0xd1] -; X86-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_b_512: ; X64: # %bb.0: -; X64-NEXT: vpmaxsb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3c,0xd9] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3c,0xd1] -; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmaxs.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 + ret <64 x i8> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmaxs_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxsw %zmm1, %zmm0, %zmm0 # encoding: 
[0x62,0xf1,0x7d,0x48,0xee,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmaxs_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_w_512: ; X86: # %bb.0: -; X86-NEXT: vpmaxsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xee,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xee,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_w_512: ; X64: # %bb.0: -; X64-NEXT: vpmaxsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xee,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xee,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaxs.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_pmaxu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_b_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxub %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xde,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_pmaxu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxu_b_512: ; X86: # %bb.0: -; X86-NEXT: vpmaxub %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xde,0xd9] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxub %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xde,0xd1] -; X86-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_b_512: ; X64: # %bb.0: -; X64-NEXT: vpmaxub %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xde,0xd9] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxub %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xde,0xd1] -; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmaxu.b.512(<64 x i8> %x0, <64 x 
i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 + ret <64 x i8> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmaxu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxuw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x3e,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmaxu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxu_w_512: ; X86: # %bb.0: -; X86-NEXT: vpmaxuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3e,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3e,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_w_512: ; X64: # %bb.0: -; X64-NEXT: vpmaxuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3e,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3e,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaxu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_pmins_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_b_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminsb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x38,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_pmins_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_b_512: ; X86: # %bb.0: -; X86-NEXT: vpminsb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x38,0xd9] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminsb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x38,0xd1] -; X86-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_b_512: ; X64: # %bb.0: -; X64-NEXT: vpminsb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x38,0xd9] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpminsb %zmm1, %zmm0, %zmm2 {%k1} # 
encoding: [0x62,0xf2,0x7d,0x49,0x38,0xd1] -; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmins.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 + ret <64 x i8> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmins_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xea,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmins_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_w_512: ; X86: # %bb.0: -; X86-NEXT: vpminsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xea,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xea,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_w_512: ; X64: # %bb.0: -; X64-NEXT: vpminsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xea,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xea,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmins.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_pminu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_b_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminub %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xda,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_pminu_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_b_512: ; X86: # %bb.0: -; X86-NEXT: vpminub %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xda,0xd9] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminub %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xda,0xd1] -; X86-NEXT: vpaddb %zmm3, %zmm2, 
%zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_b_512: ; X64: # %bb.0: -; X64-NEXT: vpminub %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xda,0xd9] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpminub %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xda,0xd1] -; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pminu.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 + ret <64 x i8> %res } declare <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pminu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminuw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x3a,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pminu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_w_512: ; X86: # %bb.0: -; X86-NEXT: vpminuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3a,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3a,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_w_512: ; X64: # %bb.0: -; X64-NEXT: vpminuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x3a,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x3a,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pminu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbw %ymm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x30,0xc0] +; CHECK-NEXT: # zmm0 = 
ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmovzxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512: ; X86: # %bb.0: -; X86-NEXT: vpmovzxbw %ymm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x30,0xd0] -; X86-NEXT: # zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovzxbw %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x30,0xc8] ; X86-NEXT: # zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; X86-NEXT: vpmovzxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x30,0xc0] -; X86-NEXT: # zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; X86-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X86-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_w_512: ; X64: # %bb.0: -; X64-NEXT: vpmovzxbw %ymm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x30,0xd0] -; X64-NEXT: # zmm2 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovzxbw %ymm0, %zmm1 {%k1} # encoding: 
[0x62,0xf2,0x7d,0x49,0x30,0xc8] ; X64-NEXT: # zmm1 {%k1} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_maskz_pmovzxb_w_512(<32 x i8> %x0, i32 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_w_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovzxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x30,0xc0] +; X86-NEXT: # zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_w_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovzxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x30,0xc0] ; X64-NEXT: # zmm0 {%k1} {z} = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero -; X64-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X64-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2) - %res2 = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1) - %res3 = add <32 x i16> %res, %res1 - %res4 = add <32 x i16> %res3, %res2 - ret <32 x i16> %res4 + %res = call <32 x i16> @llvm.x86.avx512.mask.pmovzxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2) + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmovsxb_w_512(<32 x i8> %x0, <32 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbw %ymm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x20,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1) + ret <32 x i16> %res +} + define <32 x 
i16>@test_int_x86_avx512_mask_pmovsxb_w_512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbw %ymm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x20,0xd0] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x20,0xc8] -; X86-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x20,0xc0] -; X86-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X86-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_w_512: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbw %ymm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x20,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovsxbw %ymm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x20,0xc8] -; X64-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x20,0xc0] -; X64-NEXT: vpaddw %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc2] -; X64-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 %x2) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2) - %res2 = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> %x1, i32 -1) - %res3 = add <32 x i16> %res, %res1 - %res4 = add <32 x i16> %res3, %res2 - ret <32 x i16> %res4 + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_maskz_pmovsxb_w_512(<32 x i8> %x0, i32 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_w_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x20,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_w_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovsxbw %ymm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x20,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmovsxb.w.512(<32 x i8> %x0, <32 x i16> zeroinitializer, i32 %x2) + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16>, <8 x i16>, <32 x i16>, i32) @@ -1097,26 +1261,31 @@ define <32 x i16>@test_int_x86_avx512_mask_psll_wi_512(<32 x i16> %x0, i32 %x1, declare <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_b_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x00,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshuf_b_512: ; 
X86: # %bb.0: -; X86-NEXT: vpshufb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x00,0xd9] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpshufb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x00,0xd1] -; X86-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshuf_b_512: ; X64: # %bb.0: -; X64-NEXT: vpshufb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x00,0xd9] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpshufb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x00,0xd1] -; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res2 = add <64 x i8> %res, %res1 - ret <64 x i8> %res2 + ret <64 x i8> %res } @@ -2631,178 +2800,221 @@ define i32@test_int_x86_avx512_cvtw2mask_512(<32 x i16> %x0) { declare <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulhu_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhuw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe4,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmulhu_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulhu_w_512: ; X86: # %bb.0: -; X86-NEXT: vpmulhuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe4,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmulhuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe4,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulhu_w_512: ; X64: # %bb.0: -; X64-NEXT: vpmulhuw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe4,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhuw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe4,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulhu.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmulh_w_512(<32 x i16> 
%x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulh_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe5,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmulh_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulh_w_512: ; X86: # %bb.0: -; X86-NEXT: vpmulhw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe5,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmulhw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe5,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulh_w_512: ; X64: # %bb.0: -; X64-NEXT: vpmulhw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe5,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe5,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmulh.w.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulhr_sw_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x0b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512: ; X86: # %bb.0: -; X86-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x0b,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x0b,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulhr_sw_512: ; X64: # %bb.0: -; X64-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x0b,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhrsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x0b,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # 
encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaddubs_w_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x04,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_pmaddubs_w_512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512: ; X86: # %bb.0: -; X86-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x04,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x04,0xd1] -; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaddubs_w_512: ; X64: # %bb.0: -; X64-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x04,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x04,0xd1] -; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %x0, <64 x i8> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaddw_d_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xf5,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaddw_d_512: ; X86: # %bb.0: -; X86-NEXT: vpmaddwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xf5,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xf5,0xd1] -; X86-NEXT: vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: 
[0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaddw_d_512: ; X64: # %bb.0: -; X64-NEXT: vpmaddwd %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xf5,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaddwd %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xf5,0xd1] -; X64-NEXT: vpaddd %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc3] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 -1) - %res2 = add <16 x i32> %res, %res1 - ret <16 x i32> %res2 + ret <16 x i32> %res } declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_hi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_512: ; X86: # %bb.0: -; X86-NEXT: vpermw %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xd8] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x8d,0xd0] -; X86-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] -; X86-NEXT: vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3] -; X86-NEXT: vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_512: ; X64: # %bb.0: -; X64-NEXT: vpermw %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x8d,0xd0] -; X64-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] -; X64-NEXT: vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3] -; X64-NEXT: vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) - %res2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res3 = add <32 x i16> %res, %res1 - %res4 = add <32 x i16> %res3, %res2 - ret <32 x i16> %res4 + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_maskz_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_hi_512: +; X86: 
# %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_hi_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3) + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_hi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x7d,0xca] -; X86-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x7d,0xca] -; X64-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) @@ -2810,51 +3022,46 @@ declare <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16>, <32 x i1 define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: 
vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x7d,0xca] -; X86-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X86-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x75,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x7d,0xca] -; X64-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X64-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x75,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) +define <32 x i16>@test_int_x86_avx512_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_hi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2w %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) + ret <32 x i16> %res +} + define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpermt2w %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x75,0xca] -; X86-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpermt2w %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x75,0xca] -; X64-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) - %res1 = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1) - %res2 = add <32 x i16> %res, %res1 - ret <32 x i16> %res2 + ret <32 x i16> %res } declare <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8>, <64 x i8>, i32, <32 x 
i16>, i32) @@ -3985,30 +4192,45 @@ define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16> declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmov_wb_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovwb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_512: ; X86: # %bb.0: -; X86-NEXT: vpmovwb %zmm0, %ymm2 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc2] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovwb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0xc1] -; X86-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # encoding: [0xc5,0xf5,0xfc,0xc0] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_512: ; X64: # %bb.0: -; X64-NEXT: vpmovwb %zmm0, %ymm2 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc2] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_maskz_pmov_wb_512(<32 x i16> %x0, i32 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmov_wb_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmov_wb_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # encoding: [0xc5,0xf5,0xfc,0xc0] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # encoding: [0xc5,0xed,0xfc,0xc0] -; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) - %res3 = add <32 x i8> %res0, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + ret <32 x i8> %res } diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll index 613bbd6633d6f..e84d9d37c8222 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -683,84 +683,82 @@ define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> 
%a, <32 x i16>* %pt declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) +define <32 x i16>@test_int_x86_avx512_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_hi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2) + ret <32 x i16> %1 +} + define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x7d,0xca] -; X86-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x7d,0xca] -; X64-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x1 - %4 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2) - %res2 = add <32 x i16> %3, %4 - ret <32 x i16> %res2 + ret <32 x i16> %3 } define <32 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x7d,0xca] -; X86-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X86-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x75,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpermt2w %zmm2, %zmm0, %zmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x7d,0xca] -; X64-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X64-NEXT: vpermi2w %zmm2, %zmm1, %zmm0 {%k1} {z} # 
encoding: [0x62,0xf2,0xf5,0xc9,0x75,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer - %4 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x1, <32 x i16> %x0, <32 x i16> %x2) - %res2 = add <32 x i16> %3, %4 - ret <32 x i16> %res2 + ret <32 x i16> %3 } declare <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>) +define <32 x i16>@test_int_x86_avx512_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_hi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2w %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) + ret <32 x i16> %1 +} + define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpermt2w %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x75,0xca] -; X86-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpermt2w %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2w %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x75,0xca] -; X64-NEXT: vpaddw %zmm3, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc3] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x1 - %4 = call <32 x i16> @llvm.x86.avx512.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) - %res2 = add <32 x i16> %3, %4 - ret <32 x i16> %res2 + ret <32 x i16> %3 } declare <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8>, <64 x i8>) @@ -942,36 +940,51 @@ define <32 x i16> @test_int_x86_avx512_mask_pmulhr_sw_512(<32 x i16> %x0, <32 x ret <32 x i16> %res2 } +define <32 x i8>@test_int_x86_avx512_pmov_wb_512(<32 x i16> %x0) { +; CHECK-LABEL: test_int_x86_avx512_pmov_wb_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovwb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = trunc <32 x i16> %x0 to <32 x i8> + ret <32 x i8> %1 +} + define <32 x i8>@test_int_x86_avx512_mask_pmov_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_512: ; X86: # %bb.0: -; X86-NEXT: vpmovwb %zmm0, %ymm2 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc2] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: 
[0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovwb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0xc1] -; X86-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # encoding: [0xc5,0xf5,0xfc,0xc0] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_512: ; X64: # %bb.0: -; X64-NEXT: vpmovwb %zmm0, %ymm2 # encoding: [0x62,0xf2,0x7e,0x48,0x30,0xc2] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x30,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = trunc <32 x i16> %x0 to <32 x i8> + %2 = bitcast i32 %x2 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x1 + ret <32 x i8> %3 +} + +define <32 x i8>@test_int_x86_avx512_maskz_pmov_wb_512(<32 x i16> %x0, i32 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmov_wb_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmov_wb_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x30,0xc0] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # encoding: [0xc5,0xf5,0xfc,0xc0] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # encoding: [0xc5,0xed,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] %1 = trunc <32 x i16> %x0 to <32 x i8> - %2 = trunc <32 x i16> %x0 to <32 x i8> - %3 = bitcast i32 %x2 to <32 x i1> - %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %x1 - %5 = trunc <32 x i16> %x0 to <32 x i8> - %6 = bitcast i32 %x2 to <32 x i1> - %7 = select <32 x i1> %6, <32 x i8> %5, <32 x i8> zeroinitializer - %res3 = add <32 x i8> %1, %4 - %res4 = add <32 x i8> %res3, %7 - ret <32 x i8> %res4 + %2 = bitcast i32 %x2 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 } declare void @llvm.x86.avx512.mask.pmov.wb.mem.512(i8* %ptr, <32 x i16>, i32) @@ -1000,32 +1013,47 @@ define void @test_int_x86_avx512_mask_pmov_wb_mem_512(i8* %ptr, <32 x i16> %x1, declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovs_wb_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x20,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pmovs_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovs_wb_512: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovswb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x20,0xc1] -; X86-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x20,0xc2] -; X86-NEXT: vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca] -; X86-NEXT: vpmovswb %zmm0, %ymm0 # encoding: 
[0x62,0xf2,0x7e,0x48,0x20,0xc0] -; X86-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovs_wb_512: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovswb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x20,0xc1] -; X64-NEXT: vpmovswb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x20,0xc2] -; X64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca] -; X64-NEXT: vpmovswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x20,0xc0] -; X64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) - %res3 = add <32 x i8> %res0, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 + %res = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_maskz_pmovs_wb_512(<32 x i16> %x0, i32 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovs_wb_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovswb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x20,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovs_wb_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovswb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x20,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + ret <32 x i8> %res } declare void @llvm.x86.avx512.mask.pmovs.wb.mem.512(i8* %ptr, <32 x i16>, i32) @@ -1054,32 +1082,47 @@ define void @test_int_x86_avx512_mask_pmovs_wb_mem_512(i8* %ptr, <32 x i16> %x1, declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovus_wb_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovuswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x10,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pmovus_wb_512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovus_wb_512: ; X86: # %bb.0: ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovuswb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x10,0xc1] -; X86-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x10,0xc2] -; X86-NEXT: vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca] -; X86-NEXT: vpmovuswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x10,0xc0] -; X86-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] ; 
X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovus_wb_512: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovuswb %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x49,0x10,0xc1] -; X64-NEXT: vpmovuswb %zmm0, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x10,0xc2] -; X64-NEXT: vpaddb %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0xfc,0xca] -; X64-NEXT: vpmovuswb %zmm0, %ymm0 # encoding: [0x62,0xf2,0x7e,0x48,0x10,0xc0] -; X64-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xfc,0xc1] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_maskz_pmovus_wb_512(<32 x i16> %x0, i32 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovus_wb_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovuswb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x10,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovus_wb_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovuswb %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xc9,0x10,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 -1) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> %x1, i32 %x2) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) - %res3 = add <32 x i8> %res0, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 + %res = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %x0, <32 x i8> zeroinitializer, i32 %x2) + ret <32 x i8> %res } declare void @llvm.x86.avx512.mask.pmovus.wb.mem.512(i8* %ptr, <32 x i16>, i32) @@ -1353,36 +1396,51 @@ define <32 x i16>@test_int_x86_avx512_mask_psllv32hi(<32 x i16> %x0, <32 x i16> declare <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16>, <32 x i16>) +define <32 x i16>@test_int_x86_avx512_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_permvar_hi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1) + ret <32 x i16> %1 +} + define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_512: ; X86: # %bb.0: -; X86-NEXT: vpermw %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xd8] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x8d,0xd0] -; X86-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] -; X86-NEXT: vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3] -; X86-NEXT: vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_512: ; X64: # %bb.0: -; 
X64-NEXT: vpermw %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0xf5,0x48,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0x8d,0xd0] -; X64-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] -; X64-NEXT: vpaddw %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc3] -; X64-NEXT: vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x2 - %4 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1) - %5 = bitcast i32 %x3 to <32 x i1> - %6 = select <32 x i1> %5, <32 x i16> %4, <32 x i16> zeroinitializer - %7 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1) - %res3 = add <32 x i16> %3, %6 - %res4 = add <32 x i16> %res3, %7 - ret <32 x i16> %res4 + ret <32 x i16> %3 +} + +define <32 x i16>@test_int_x86_avx512_maskz_permvar_hi_512(<32 x i16> %x0, <32 x i16> %x1, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_hi_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_hi_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermw %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <32 x i16> @llvm.x86.avx512.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1) + %2 = bitcast i32 %x3 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer + ret <32 x i16> %3 } define <32 x i16> @test_x86_avx512_psll_w_512(<32 x i16> %a0, <8 x i16> %a1) { diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll index 16bd8fd65cb79..1f4185ed6dc20 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll @@ -4,308 +4,469 @@ declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_b_gpr_128: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_b_gpr_128: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastb %edi, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0x4c,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; 
X86-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x6f,0xc1] -; X86-NEXT: vmovdqu8 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x6f,0xd1] -; X86-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x78,0x44,0x24,0x04] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastb %edi, %xmm1 # encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xcf] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpbroadcastb %edi, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7] -; X64-NEXT: vpbroadcastb %edi, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xd7] -; X64-NEXT: vpaddb %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res2, %res3 - ret <16 x i8> %res4 + %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> %x1, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pbroadcast_b_gpr_128(i8 %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_b_gpr_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x78,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_b_gpr_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpbroadcastb %edi, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8 %x0, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_w_gpr_128: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_w_gpr_128: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw %edi, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x4c,0x24,0x04] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 
encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x6f,0xc1] -; X86-NEXT: vmovdqu16 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x6f,0xd1] -; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x79,0x44,0x24,0x02] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw %edi, %xmm1 # encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xcf] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpbroadcastw %edi, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7] -; X64-NEXT: vpbroadcastw %edi, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xd7] -; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 -1) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask) - %res3 = add <8 x i16> %res, %res1 - %res4 = add <8 x i16> %res2, %res3 - ret <8 x i16> %res4 + %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> %x1, i8 %mask) + ret <8 x i16> %res } +define <8 x i16>@test_int_x86_avx512_maskz_pbroadcast_w_gpr_128(i16 %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_w_gpr_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x79,0x44,0x24,0x02] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_w_gpr_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpbroadcastw %edi, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16 %x0, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res +} - declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32) +declare <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8, <32 x i8>, i32) + +define <32 x i8>@test_int_x86_avx512_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_b_gpr_256: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_b_gpr_256: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastb %edi, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1) + ret <32 x i8> %res +} - define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) 
{ +define <32 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_256(i8 %x0, <32 x i8> %x1, i32 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0x4c,0x24,0x04] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x6f,0xc1] -; X86-NEXT: vmovdqu8 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x6f,0xd1] -; X86-NEXT: vpaddb %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc2] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x78,0x44,0x24,0x04] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastb %edi, %ymm1 # encoding: [0x62,0xf2,0x7d,0x28,0x7a,0xcf] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpbroadcastb %edi, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7a,0xc7] -; X64-NEXT: vpbroadcastb %edi, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xd7] -; X64-NEXT: vpaddb %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc2] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 -1) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res2, %res3 - ret <32 x i8> %res4 - } - + %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> %x1, i32 %mask) + ret <32 x i8> %res +} +define <32 x i8>@test_int_x86_avx512_maskz_pbroadcast_b_gpr_256(i8 %x0, i32 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_b_gpr_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x78,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_b_gpr_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpbroadcastb %edi, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7a,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.256(i8 %x0, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res +} declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i16) - define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) { +define <16 x i16>@test_int_x86_avx512_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_w_gpr_256: +; X86: # %bb.0: +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_w_gpr_256: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastw %edi, %ymm0 # encoding: 
[0x62,0xf2,0x7d,0x28,0x7b,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0x4c,0x24,0x04] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x6f,0xc1] -; X86-NEXT: vmovdqu16 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x6f,0xd1] -; X86-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x79,0x44,0x24,0x02] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw %edi, %ymm1 # encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xcf] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpbroadcastw %edi, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7] -; X64-NEXT: vpbroadcastw %edi, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xd7] -; X64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 -1) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask) - %res2 = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask) - %res3 = add <16 x i16> %res, %res1 - %res4 = add <16 x i16> %res2, %res3 - ret <16 x i16> %res4 - } + %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> %x1, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pbroadcast_w_gpr_256(i16 %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_w_gpr_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpbroadcastw {{[0-9]+}}(%esp), %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x79,0x44,0x24,0x02] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_w_gpr_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpbroadcastw %edi, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16 %x0, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res +} declare <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8>, <32 x i8>, i32) -define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastb_256: +define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 # EVEX TO VEX 
Compression encoding: [0xc4,0xe2,0x7d,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1) + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_mask_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastb_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastb %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8] -; X86-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastb_256: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastb_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastb %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_maskz_pbroadcastb_256(<16 x i8> %x0, i32 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastb_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastb_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1) - %res1 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 %mask) - %res2 = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> zeroinitializer, i32 %mask) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res2, %res3 - ret <32 x i8> %res4 + %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> zeroinitializer, i32 %mask) + ret <32 x i8> %res } declare <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8>, <16 x i8>, i16) -define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastb_128: +define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 # EVEX TO VEX Compression 
encoding: [0xc4,0xe2,0x79,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_mask_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastb_128: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastb %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8] -; X86-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastb_128: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastb_128: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastb %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pbroadcastb_128(<16 x i8> %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastb_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastb_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) - %res2 = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> zeroinitializer, i16 %mask) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res2, %res3 - ret <16 x i8> %res4 + %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res } declare <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16>, <16 x i16>, i16) -define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastw_256: +define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x7d,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1) + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_mask_pbroadcastw_256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastw_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8] -; X86-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] -; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastw_256: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastw_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pbroadcastw_256(<8 x i16> %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastw_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastw_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0] -; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1) - %res1 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 %mask) - %res2 = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> zeroinitializer, i16 %mask) - %res3 = add <16 x i16> %res, %res1 - %res4 = add <16 x i16> %res2, %res3 - ret <16 x i16> %res4 + %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16>, <8 x i16>, i8) -define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastw_128: +define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 # EVEX TO VEX Compression encoding: 
[0xc4,0xe2,0x79,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) + ret <8 x i16> %res +} + +define <8 x i16>@test_int_x86_avx512_mask_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastw_128: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8] -; X86-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastw_128: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastw_128: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) + ret <8 x i16> %res +} + +define <8 x i16>@test_int_x86_avx512_maskz_pbroadcastw_128(<8 x i16> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastw_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastw_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1) - %res1 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 %mask) - %res2 = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> zeroinitializer, i8 %mask) - %res3 = add <8 x i16> %res, %res1 - %res4 = add <8 x i16> %res2, %res3 - ret <8 x i16> %res4 + %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> zeroinitializer, i8 %mask) + ret <8 x i16> %res } declare <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8>, <64 x i8>, i64) -define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastb_512: +define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_512: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1) + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_mask_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastb_512: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastb %xmm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8] -; X86-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] -; X86-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] -; X86-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastb_512: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastb_512: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastb %xmm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_maskz_pbroadcastb_512(<16 x i8> %x0, i64 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastb_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastb_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0] -; X64-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] -; X64-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1) - %res1 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 %mask) - %res2 = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> zeroinitializer, i64 %mask) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res2, %res3 - ret <64 x i8> %res4 + %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> zeroinitializer, i64 %mask) + ret <64 x i8> %res } declare <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16>, <32 x i16>, i32) -define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastw_512: +define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x48,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + 
%res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1) + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_mask_pbroadcastw_512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastw_512: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastw %xmm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8] -; X86-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] -; X86-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] -; X86-NEXT: vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastw_512: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastw_512: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastw %xmm0, %zmm2 # encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) + ret <32 x i16> %res +} + +define <32 x i16>@test_int_x86_avx512_maskz_pbroadcastw_512(<8 x i16> %x0, i32 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastw_512: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastw_512: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0] -; X64-NEXT: vpaddw %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfd,0xc0] -; X64-NEXT: vpaddw %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1) - %res1 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 %mask) - %res2 = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> zeroinitializer, i32 %mask) - %res3 = add <32 x i16> %res, %res1 - %res4 = add <32 x i16> %res2, %res3 - ret <32 x i16> %res4 + %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> zeroinitializer, i32 %mask) + ret <32 x i16> %res } declare void @llvm.x86.avx512.mask.storeu.b.128(i8*, <16 x i8>, i16) @@ -524,220 +685,306 @@ define <32 x i8>@test_int_x86_avx512_mask_loadu_b_256(i8* %ptr, i8* %ptr2, <32 x declare <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8>, <16 x i8>, i32, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_palignr_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x3) { +; CHECK-LABEL: test_int_x86_avx512_palignr_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xc1,0x02] +; CHECK-NEXT: # xmm0 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] +; CHECK-NEXT: ret{{[l|q]}} # 
encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_palignr_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_palignr_128: ; X86: # %bb.0: -; X86-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x02] -; X86-NEXT: # xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x0f,0xd1,0x02] ; X86-NEXT: # xmm2 {%k1} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] -; X86-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02] -; X86-NEXT: # xmm0 {%k1} {z} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] -; X86-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_palignr_128: ; X64: # %bb.0: -; X64-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x02] -; X64-NEXT: # xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x0f,0xd1,0x02] ; X64-NEXT: # xmm2 {%k1} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4) + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_palignr_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_palignr_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02] +; X86-NEXT: # xmm0 {%k1} {z} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_palignr_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02] ; X64-NEXT: # xmm0 {%k1} {z} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1] -; X64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 %x4) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> %x3, i16 -1) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> 
%res4 + %res = call <16 x i8> @llvm.x86.avx512.mask.palignr.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <16 x i8> zeroinitializer, i16 %x4) + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8>, <32 x i8>, i32, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_palignr_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x3) { +; CHECK-LABEL: test_int_x86_avx512_palignr_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x0f,0xc1,0x02] +; CHECK-NEXT: # ymm0 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_palignr_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x3, i32 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_palignr_256: ; X86: # %bb.0: -; X86-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x0f,0xd9,0x02] -; X86-NEXT: # ymm3 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x0f,0xd1,0x02] ; X86-NEXT: # ymm2 {%k1} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] -; X86-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x0f,0xc1,0x02] -; X86-NEXT: # ymm0 {%k1} {z} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] -; X86-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_palignr_256: ; X64: # %bb.0: -; X64-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x0f,0xd9,0x02] -; X64-NEXT: # ymm3 = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x0f,0xd1,0x02] ; X64-NEXT: # ymm2 {%k1} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4) + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_maskz_palignr_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_palignr_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x0f,0xc1,0x02] +; X86-NEXT: # ymm0 {%k1} {z} = 
ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_palignr_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpalignr $2, %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x0f,0xc1,0x02] ; X64-NEXT: # ymm0 {%k1} {z} = ymm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1],ymm1[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17] -; X64-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 %x4) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> %x3, i32 -1) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 + %res = call <32 x i8> @llvm.x86.avx512.mask.palignr.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <32 x i8> zeroinitializer, i32 %x4) + ret <32 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16>, i32, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pshufh_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshufh_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufhw $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x70,0xc0,0x03] +; CHECK-NEXT: # xmm0 = xmm0[0,1,2,3,7,4,4,4] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pshufh_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshufh_w_128: ; X86: # %bb.0: -; X86-NEXT: vpshufhw $3, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x70,0xd0,0x03] -; X86-NEXT: # xmm2 = xmm0[0,1,2,3,7,4,4,4] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpshufhw $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x70,0xc8,0x03] ; X86-NEXT: # xmm1 {%k1} = xmm0[0,1,2,3,7,4,4,4] -; X86-NEXT: vpshufhw $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0,1,2,3,7,4,4,4] -; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshufh_w_128: ; X64: # %bb.0: -; X64-NEXT: vpshufhw $3, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x70,0xd0,0x03] -; X64-NEXT: # xmm2 = xmm0[0,1,2,3,7,4,4,4] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpshufhw $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x70,0xc8,0x03] ; X64-NEXT: # xmm1 {%k1} = xmm0[0,1,2,3,7,4,4,4] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: 
retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) + ret <8 x i16> %res +} + +define <8 x i16>@test_int_x86_avx512_maskz_pshufh_w_128(<8 x i16> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshufh_w_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpshufhw $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0,1,2,3,7,4,4,4] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshufh_w_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshufhw $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0,1,2,3,7,4,4,4] -; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1) - %res3 = add <8 x i16> %res, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 + %res = call <8 x i16> @llvm.x86.avx512.mask.pshufh.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16>, i32, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pshufh_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshufh_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufhw $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x70,0xc0,0x03] +; CHECK-NEXT: # ymm0 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pshufh_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshufh_w_256: ; X86: # %bb.0: -; X86-NEXT: vpshufhw $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x70,0xd0,0x03] -; X86-NEXT: # ymm2 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] ; X86-NEXT: vpshufhw $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x70,0xc8,0x03] ; X86-NEXT: # ymm1 {%k1} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] -; X86-NEXT: vpshufhw $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] -; X86-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshufh_w_256: ; X64: # %bb.0: -; X64-NEXT: vpshufhw $3, %ymm0, %ymm2 # EVEX TO VEX Compression 
encoding: [0xc5,0xfe,0x70,0xd0,0x03] -; X64-NEXT: # ymm2 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpshufhw $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x70,0xc8,0x03] ; X64-NEXT: # ymm1 {%k1} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pshufh_w_256(<16 x i16> %x0, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshufh_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpshufhw $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshufh_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshufhw $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12] -; X64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) - %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1) - %res3 = add <16 x i16> %res, %res1 - %res4 = add <16 x i16> %res3, %res2 - ret <16 x i16> %res4 + %res = call <16 x i16> @llvm.x86.avx512.mask.pshufh.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16>, i32, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pshufl_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshufl_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshuflw $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x70,0xc0,0x03] +; CHECK-NEXT: # xmm0 = xmm0[3,0,0,0,4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pshufl_w_128(<8 x i16> %x0, i32 %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshufl_w_128: ; X86: # %bb.0: -; X86-NEXT: vpshuflw $3, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x70,0xd0,0x03] -; X86-NEXT: # xmm2 = xmm0[3,0,0,0,4,5,6,7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpshuflw $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x70,0xc8,0x03] ; X86-NEXT: # xmm1 {%k1} = xmm0[3,0,0,0,4,5,6,7] -; X86-NEXT: vpshuflw $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[3,0,0,0,4,5,6,7] -; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX 
Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshufl_w_128: ; X64: # %bb.0: -; X64-NEXT: vpshuflw $3, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x70,0xd0,0x03] -; X64-NEXT: # xmm2 = xmm0[3,0,0,0,4,5,6,7] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpshuflw $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x70,0xc8,0x03] ; X64-NEXT: # xmm1 {%k1} = xmm0[3,0,0,0,4,5,6,7] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) + ret <8 x i16> %res +} + +define <8 x i16>@test_int_x86_avx512_maskz_pshufl_w_128(<8 x i16> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshufl_w_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpshuflw $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[3,0,0,0,4,5,6,7] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshufl_w_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshuflw $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[3,0,0,0,4,5,6,7] -; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 -1) - %res3 = add <8 x i16> %res, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 + %res = call <8 x i16> @llvm.x86.avx512.mask.pshufl.w.128(<8 x i16> %x0, i32 3, <8 x i16> zeroinitializer, i8 %x3) + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16>, i32, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pshufl_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) { +; CHECK-LABEL: test_int_x86_avx512_pshufl_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshuflw $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xff,0x70,0xc0,0x03] +; CHECK-NEXT: # ymm0 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pshufl_w_256(<16 x i16> %x0, i32 %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshufl_w_256: ; X86: # %bb.0: -; X86-NEXT: vpshuflw $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xff,0x70,0xd0,0x03] -; X86-NEXT: # ymm2 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] 
; X86-NEXT: vpshuflw $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x70,0xc8,0x03] ; X86-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] -; X86-NEXT: vpshuflw $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] -; X86-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshufl_w_256: ; X64: # %bb.0: -; X64-NEXT: vpshuflw $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xff,0x70,0xd0,0x03] -; X64-NEXT: # ymm2 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] ; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] ; X64-NEXT: vpshuflw $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x70,0xc8,0x03] ; X64-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pshufl_w_256(<16 x i16> %x0, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshufl_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpshuflw $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshufl_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshuflw $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15] -; X64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) - %res2 = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 -1) - %res3 = add <16 x i16> %res, %res1 - %res4 = add <16 x i16> %res3, %res2 - ret <16 x i16> %res4 + %res = call <16 x i16> @llvm.x86.avx512.mask.pshufl.w.256(<16 x i16> %x0, i32 3, <16 x i16> zeroinitializer, i16 %x3) + ret <16 x i16> %res } define i32 @test_pcmpeq_b_256(<32 x i8> %a, <32 x i8> %b) { @@ -1012,228 +1259,260 @@ declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8) declare <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_punpckhb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhb_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x68,0xc1] +; CHECK-NEXT: # xmm0 = 
xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_punpckhb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhb_w_128: ; X86: # %bb.0: -; X86-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x68,0xd9] -; X86-NEXT: # xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x68,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; X86-NEXT: vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhb_w_128: ; X64: # %bb.0: -; X64-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x68,0xd9] -; X64-NEXT: # xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x68,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; X64-NEXT: vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpckhb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 + ret <16 x i8> %res } declare <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_ask_punpcklb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_ask_punpcklb_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x60,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_punpcklb_w_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklb_w_128: ; X86: # %bb.0: -; X86-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x60,0xd9] -; X86-NEXT: # 
xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x60,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-NEXT: vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklb_w_128: ; X64: # %bb.0: -; X64-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x60,0xd9] -; X64-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpcklbw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x60,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.punpcklb.w.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_punpckhb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhb_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x68,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_punpckhb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhb_w_256: ; X86: # %bb.0: -; X86-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x68,0xd9] -; X86-NEXT: # ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x68,0xd1] ; X86-NEXT: # ymm2 {%k1} = 
ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhb_w_256: ; X64: # %bb.0: -; X64-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x68,0xd9] -; X64-NEXT: # ymm3 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpckhbw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x68,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpckhb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 + ret <32 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_punpcklb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklb_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x60,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_punpcklb_w_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklb_w_256: ; X86: # %bb.0: -; X86-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x60,0xd9] -; X86-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm2 
{%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x60,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklb_w_256: ; X64: # %bb.0: -; X64-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x60,0xd9] -; X64-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpcklbw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x60,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.punpcklb.w.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 + ret <32 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_punpcklw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklw_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x61,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_punpcklw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklw_d_128: ; X86: # %bb.0: -; X86-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x61,0xd9] -; X86-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x61,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklw_d_128: ; X64: # %bb.0: -; X64-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x61,0xd9] -; X64-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpcklwd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x61,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.punpcklw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_punpckhw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhw_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x69,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_punpckhw_d_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhw_d_128: ; X86: # %bb.0: -; X86-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x69,0xd9] -; X86-NEXT: # xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x69,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhw_d_128: ; X64: # %bb.0: -; X64-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x69,0xd9] -; X64-NEXT: # xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x69,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> 
@llvm.x86.avx512.mask.punpckhw.d.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_punpcklw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklw_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x61,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_punpcklw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklw_d_256: ; X86: # %bb.0: -; X86-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x61,0xd9] -; X86-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x61,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklw_d_256: ; X64: # %bb.0: -; X64-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x61,0xd9] -; X64-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpcklwd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x61,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpcklw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_punpckhw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhw_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x69,0xc1] +; CHECK-NEXT: # ymm0 = 
ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_punpckhw_d_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhw_d_256: ; X86: # %bb.0: -; X86-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x69,0xd9] -; X86-NEXT: # ymm3 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x69,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhw_d_256: ; X64: # %bb.0: -; X64-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x69,0xd9] -; X64-NEXT: # ymm3 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpunpckhwd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x69,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.punpckhw.d.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } define <8 x i16> @test_mask_add_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { @@ -2146,70 +2425,92 @@ define <16 x i8>@test_int_x86_avx512_mask_pmaxs_b_128(<16 x i8> %x0, <16 x i8> % ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3c,0xd1] -; X86-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3c,0xc1] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_b_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3c,0xd1] -; X64-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3c,0xc1] -; 
X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2 ,i16 %mask) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmaxs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxs_b_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmaxs_b_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pmaxs_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_b_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pmaxs_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_b_256: ; X86: # %bb.0: -; X86-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3c,0xd1] -; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_b_256: ; X64: # %bb.0: -; X64-NEXT: vpmaxsb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3c,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3c,0xd1] -; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxs.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 + ret <32 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x 
i16>@test_int_x86_avx512_pmaxs_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxsw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmaxs_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_w_128: ; X86: # %bb.0: -; X86-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmaxsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xee,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_w_128: ; X64: # %bb.0: -; X64-NEXT: vpmaxsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xee,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxs.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) @@ -2219,21 +2520,33 @@ define <16 x i16>@test_int_x86_avx512_mask_pmaxs_w_256(<16 x i16> %x0, <16 x i16 ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xee,0xd1] -; X86-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xee,0xc1] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_w_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xee,0xd1] -; X64-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xee,0xc1] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) - %res2 = add <16 x i16> %res, %res1 - 
ret <16 x i16> %res2 + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pmaxs_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxs_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xee,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmaxs_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xee,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res } declare <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) @@ -2243,70 +2556,92 @@ define <16 x i8>@test_int_x86_avx512_mask_pmaxu_b_128(<16 x i8> %x0, <16 x i8> % ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxub %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xde,0xd1] -; X86-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xde,0xc1] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_b_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxub %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xde,0xd1] -; X64-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xde,0xc1] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmaxu_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxu_b_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xde,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmaxu_b_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmaxub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xde,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pmaxu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_b_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxub %ymm1, %ymm0, %ymm0 # EVEX TO VEX 
Compression encoding: [0xc5,0xfd,0xde,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pmaxu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxu_b_256: ; X86: # %bb.0: -; X86-NEXT: vpmaxub %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxub %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xde,0xd1] -; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_b_256: ; X64: # %bb.0: -; X64-NEXT: vpmaxub %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xde,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxub %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xde,0xd1] -; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmaxu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 + ret <32 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmaxu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmaxu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxu_w_128: ; X86: # %bb.0: -; X86-NEXT: vpmaxuw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmaxuw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3e,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_w_128: ; X64: # %bb.0: -; X64-NEXT: vpmaxuw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxuw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3e,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] 
; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaxu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) @@ -2316,21 +2651,33 @@ define <16 x i16>@test_int_x86_avx512_mask_pmaxu_w_256(<16 x i16> %x0, <16 x i16 ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3e,0xd1] -; X86-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3e,0xc1] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_w_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaxuw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3e,0xd1] -; X64-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3e,0xc1] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pmaxu_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxu_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3e,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmaxu_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3e,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res } declare <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) @@ -2340,70 +2687,92 @@ define <16 x i8>@test_int_x86_avx512_mask_pmins_b_128(<16 x i8> %x0, <16 x i8> % ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x38,0xd1] -; X86-NEXT: vpminsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x38,0xc1] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_b_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: 
[0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x38,0xd1] -; X64-NEXT: vpminsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x38,0xc1] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmins_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmins_b_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpminsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x38,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmins_b_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpminsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x38,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pmins_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_b_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminsb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pmins_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_b_256: ; X86: # %bb.0: -; X86-NEXT: vpminsb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x38,0xd1] -; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_b_256: ; X64: # %bb.0: -; X64-NEXT: vpminsb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x38,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x38,0xd1] -; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmins.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, 
i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 + ret <32 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmins_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminsw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmins_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_w_128: ; X86: # %bb.0: -; X86-NEXT: vpminsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpminsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xea,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_w_128: ; X64: # %bb.0: -; X64-NEXT: vpminsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xea,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmins.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) @@ -2413,21 +2782,33 @@ define <16 x i16>@test_int_x86_avx512_mask_pmins_w_256(<16 x i16> %x0, <16 x i16 ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xea,0xd1] -; X86-NEXT: vpminsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xea,0xc1] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_w_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xea,0xd1] -; X64-NEXT: vpminsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xea,0xc1] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x 
i16> %x2, i16 %mask) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pmins_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmins_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpminsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xea,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmins_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpminsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xea,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res } declare <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) @@ -2437,70 +2818,92 @@ define <16 x i8>@test_int_x86_avx512_mask_pminu_b_128(<16 x i8> %x0, <16 x i8> % ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminub %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xda,0xd1] -; X86-NEXT: vpminub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xda,0xc1] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_b_128: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminub %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xda,0xd1] -; X64-NEXT: vpminub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xda,0xc1] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pminu_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pminu_b_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpminub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xda,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pminu_b_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpminub %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xda,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %mask) + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x 
i8>@test_int_x86_avx512_pminu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_b_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminub %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pminu_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_b_256: ; X86: # %bb.0: -; X86-NEXT: vpminub %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminub %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xda,0xd1] -; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_b_256: ; X64: # %bb.0: -; X64-NEXT: vpminub %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xda,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminub %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xda,0xd1] -; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pminu.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 + ret <32 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pminu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3a,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pminu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_w_128: ; X86: # %bb.0: -; X86-NEXT: vpminuw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3a,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpminuw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3a,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_w_128: ; X64: # %bb.0: -; X64-NEXT: vpminuw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3a,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminuw %xmm1, %xmm0, %xmm2 {%k1} # encoding: 
[0x62,0xf2,0x7d,0x09,0x3a,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pminu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) @@ -2510,21 +2913,33 @@ define <16 x i16>@test_int_x86_avx512_mask_pminu_w_256(<16 x i16> %x0, <16 x i16 ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpminuw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3a,0xd1] -; X86-NEXT: vpminuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3a,0xc1] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_w_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpminuw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3a,0xd1] -; X64-NEXT: vpminuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3a,0xc1] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pminu_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pminu_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpminuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3a,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pminu_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpminuw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x3a,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %mask) + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) @@ -2895,249 +3310,349 @@ define <16 x i16>@test_int_x86_avx512_mask_psll_wi_256(<16 x i16> %x0, i32 %x1, declare <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_pshuf_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_b_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: 
[0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_pshuf_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshuf_b_128: ; X86: # %bb.0: -; X86-NEXT: vpshufb %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpshufb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x00,0xd1] -; X86-NEXT: vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshuf_b_128: ; X64: # %bb.0: -; X64-NEXT: vpshufb %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshufb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x00,0xd1] -; X64-NEXT: vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pshuf.b.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res2 = add <16 x i8> %res, %res1 - ret <16 x i8> %res2 + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pshuf_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_b_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpshufb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pshuf_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshuf_b_256: ; X86: # %bb.0: -; X86-NEXT: vpshufb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x00,0xd1] -; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshuf_b_256: ; X64: # %bb.0: -; X64-NEXT: vpshufb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x00,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpshufb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x00,0xd1] -; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x 
i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pshuf.b.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res2 = add <32 x i8> %res, %res1 - ret <32 x i8> %res2 + ret <32 x i8> %res } declare <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbw %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x30,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_w_128: ; X86: # %bb.0: -; X86-NEXT: vpmovzxbw %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x30,0xd0] -; X86-NEXT: # xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmovzxbw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x30,0xc8] ; X86-NEXT: # xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X86-NEXT: vpmovzxbw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x30,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_w_128: ; X64: # %bb.0: -; X64-NEXT: vpmovzxbw %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x30,0xd0] -; X64-NEXT: # xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovzxbw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x30,0xc8] ; X64-NEXT: # xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) + ret <8 x i16> %res +} + +define <8 x i16>@test_int_x86_avx512_maskz_pmovzxb_w_128(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_w_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpmovzxbw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x30,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: 
test_int_x86_avx512_maskz_pmovzxb_w_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovzxbw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x30,0xc0] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1) - %res3 = add <8 x i16> %res, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 + %res = call <8 x i16> @llvm.x86.avx512.mask.pmovzxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2) + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pmovzxb_w_256(<16 x i8> %x0, <16 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xc0] +; CHECK-NEXT: # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pmovzxb_w_256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_w_256: ; X86: # %bb.0: -; X86-NEXT: vpmovzxbw %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xd0] -; X86-NEXT: # ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovzxbw %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x30,0xc8] ; X86-NEXT: # ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; X86-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xc0] -; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; X86-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_w_256: ; X64: # %bb.0: -; X64-NEXT: vpmovzxbw %xmm0, %ymm2 # EVEX TO 
VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xd0] -; X64-NEXT: # ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovzxbw %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x30,0xc8] ; X64-NEXT: # ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; X64-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xc0] -; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; X64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2) - %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1) - %res3 = add <16 x i16> %res, %res1 - %res4 = add <16 x i16> %res3, %res2 - ret <16 x i16> %res4 + ret <16 x i16> %res } +define <16 x i16>@test_int_x86_avx512_maskz_pmovzxb_w_256(<16 x i8> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xc0] +; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovzxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xc0] +; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmovzxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2) + ret <16 x i16> %res +} declare <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmovsxb_w_128(<16 x i8> %x0, <8 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbw %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x20,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1) + ret <8 x i16> %res +} + define <8 x 
i16>@test_int_x86_avx512_mask_pmovsxb_w_128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_w_128: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbw %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x20,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmovsxbw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x20,0xc8] -; X86-NEXT: vpmovsxbw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x20,0xc0] -; X86-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X86-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_w_128: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbw %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x20,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovsxbw %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x20,0xc8] -; X64-NEXT: vpmovsxbw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x20,0xc0] -; X64-NEXT: vpaddw %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2] -; X64-NEXT: vpaddw %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 %x2) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> %x1, i8 -1) - %res3 = add <8 x i16> %res, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 + ret <8 x i16> %res +} + +define <8 x i16>@test_int_x86_avx512_maskz_pmovsxb_w_128(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_w_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpmovsxbw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x20,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_w_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovsxbw %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x20,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmovsxb.w.128(<16 x i8> %x0, <8 x i16> zeroinitializer, i8 %x2) + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pmovsxb_w_256(<16 x i8> %x0, <16 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pmovsxb_w_256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_w_256: ; X86: # 
%bb.0: -; X86-NEXT: vpmovsxbw %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xd0] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovsxbw %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x20,0xc8] -; X86-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xc0] -; X86-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X86-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_w_256: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbw %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovsxbw %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x20,0xc8] -; X64-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xc0] -; X64-NEXT: vpaddw %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc2] -; X64-NEXT: vpaddw %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 %x2) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2) - %res2 = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> %x1, i16 -1) - %res3 = add <16 x i16> %res, %res1 - %res4 = add <16 x i16> %res3, %res2 - ret <16 x i16> %res4 + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_pmovsxb_w_256(<16 x i8> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_w_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_w_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovsxbw %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmovsxb.w.256(<16 x i8> %x0, <16 x i16> zeroinitializer, i16 %x2) + ret <16 x i16> %res } declare <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pmovsxd_q_128(<4 x i32> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxd_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxdq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x25,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pmovsxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxd_q_128: ; X86: # %bb.0: -; X86-NEXT: vpmovsxdq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x25,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: 
vpmovsxdq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x25,0xc8] -; X86-NEXT: vpmovsxdq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x25,0xc0] -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxd_q_128: ; X64: # %bb.0: -; X64-NEXT: vpmovsxdq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x25,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovsxdq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x25,0xc8] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pmovsxd_q_128(<4 x i32> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxd_q_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpmovsxdq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x25,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxd_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovsxdq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x25,0xc0] -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2) - %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res3, %res2 - ret <2 x i64> %res4 + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2) + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_pmovsxd_q_256(<4 x i32> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxd_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxdq %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x25,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_pmovsxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxd_q_256: ; X86: # %bb.0: -; X86-NEXT: vpmovsxdq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x25,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmovsxdq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x25,0xc8] -; X86-NEXT: vpmovsxdq %xmm0, %ymm0 {%k1} {z} # encoding: 
[0x62,0xf2,0x7d,0xa9,0x25,0xc0] -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxd_q_256: ; X64: # %bb.0: -; X64-NEXT: vpmovsxdq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x25,0xd0] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovsxdq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x25,0xc8] -; X64-NEXT: vpmovsxdq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xc0] -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + ret <4 x i64> %res } +define <4 x i64>@test_int_x86_avx512_maskz_pmovsxd_q_256(<4 x i32> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxd_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpmovsxdq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxd_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovsxdq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2) + ret <4 x i64> %res +} declare <16 x i8> @llvm.x86.avx512.cvtmask2b.128(i16) @@ -5805,336 +6320,420 @@ define i16@test_int_x86_avx512_cvtw2mask_256(<16 x i16> %x0) { declare <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_pmulhu_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhuw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe4,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulhu_w_128: ; X86: # %bb.0: -; X86-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe4,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe4,0xd1] -; X86-NEXT: 
vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulhu_w_128: ; X64: # %bb.0: -; X64-NEXT: vpmulhuw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe4,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhuw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe4,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulhu_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhuw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulhu_w_256: ; X86: # %bb.0: -; X86-NEXT: vpmulhuw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe4,0xd1] -; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulhu_w_256: ; X64: # %bb.0: -; X64-NEXT: vpmulhuw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe4,0xd1] -; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulh_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe5,0xc1] +; CHECK-NEXT: ret{{[l|q]}} 
# encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulh_w_128: ; X86: # %bb.0: -; X86-NEXT: vpmulhw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe5,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmulhw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe5,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulh_w_128: ; X64: # %bb.0: -; X64-NEXT: vpmulhw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe5,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe5,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulh_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulh_w_256: ; X86: # %bb.0: -; X86-NEXT: vpmulhw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmulhw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe5,0xd1] -; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulh_w_256: ; X64: # %bb.0: -; X64-NEXT: vpmulhw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe5,0xd1] -; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> 
@llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulhr_sw_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulhr_sw_128: ; X86: # %bb.0: -; X86-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x0b,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulhr_sw_128: ; X64: # %bb.0: -; X64-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x0b,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmulhr_sw_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmulhr_sw_256: ; X86: # %bb.0: -; X86-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x0b,0xd1] -; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX 
Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmulhr_sw_256: ; X64: # %bb.0: -; X64-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmulhrsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x0b,0xd1] -; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaddubs_w_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128: ; X86: # %bb.0: -; X86-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x04,0xd1] -; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128: ; X64: # %bb.0: -; X64-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x04,0xd1] -; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaddubs_w_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 # EVEX
TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256: ; X86: # %bb.0: -; X86-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x04,0xd1] -; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256: ; X64: # %bb.0: -; X64-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x04,0xd1] -; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaddw_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf5,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaddw_d_128: ; X86: # %bb.0: -; X86-NEXT: vpmaddwd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf5,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xf5,0xd1] -; X86-NEXT: vpaddd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaddw_d_128: ; X64: # %bb.0: -; X64-NEXT: vpmaddwd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf5,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaddwd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xf5,0xd1] -; X64-NEXT: vpaddd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # 
EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaddw_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaddwd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaddw_d_256: ; X86: # %bb.0: -; X86-NEXT: vpmaddwd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xf5,0xd1] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaddw_d_256: ; X64: # %bb.0: -; X64-NEXT: vpmaddwd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmaddwd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xf5,0xd1] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_hi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_128: ; X86: # %bb.0: -; X86-NEXT: vpermw %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpermw %xmm0, %xmm1, %xmm2 {%k1} # encoding: 
[0x62,0xf2,0xf5,0x09,0x8d,0xd0] -; X86-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] -; X86-NEXT: vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_128: ; X64: # %bb.0: -; X64-NEXT: vpermw %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermw %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x8d,0xd0] -; X64-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] -; X64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3) - %res2 = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res3 = add <8 x i16> %res, %res1 - %res4 = add <8 x i16> %res3, %res2 - ret <8 x i16> %res4 + ret <8 x i16> %res +} + +define <8 x i16>@test_int_x86_avx512_maskz_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_hi_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_hi_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3) + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_hi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_256: ; X86: # %bb.0: -; X86-NEXT: vpermw %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xd8] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermw %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0] -; X86-NEXT: vpermw %ymm0, %ymm1, %ymm0 
{%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] -; X86-NEXT: vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_256: ; X64: # %bb.0: -; X64-NEXT: vpermw %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermw %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0] -; X64-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] -; X64-NEXT: vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3) - %res2 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res3 = add <16 x i16> %res, %res1 - %res4 = add <16 x i16> %res3, %res2 - ret <16 x i16> %res4 + ret <16 x i16> %res +} + +define <16 x i16>@test_int_x86_avx512_maskz_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_hi_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_hi_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3) + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_hi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: 
[0x62,0xf2,0xfd,0x09,0x7d,0xca] -; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xca] -; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) @@ -6142,52 +6741,47 @@ declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>, define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xca] -; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X86-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x75,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xca] -; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X64-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x75,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_hi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 # encoding: 
[0x62,0xf2,0xf5,0x28,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xca] -; X86-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xca] -; X64-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) @@ -6195,78 +6789,76 @@ declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i1 define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xca] -; X86-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X86-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x75,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xca] -; X64-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X64-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x75,0xc2] ; 
X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8) +define <8 x i16>@test_int_x86_avx512_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_hi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2w %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) + ret <8 x i16> %res +} + define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2w %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpermi2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x75,0xca] -; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2w %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x75,0xca] -; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) - %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1) - %res2 = add <8 x i16> %res, %res1 - ret <8 x i16> %res2 + ret <8 x i16> %res } declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +define <16 x i16>@test_int_x86_avx512_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_hi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2w %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) + ret <16 x i16> %res +} + define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2w 
%ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x75,0xca] -; X86-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2w %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x75,0xca] -; X64-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) - %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1) - %res2 = add <16 x i16> %res, %res1 - ret <16 x i16> %res2 + ret <16 x i16> %res } declare <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8>, <16 x i8>, i32, <8 x i16>, i8) @@ -8795,32 +9387,50 @@ define <8 x i16>@test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1 declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmov_wb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_256: ; X86: # %bb.0: -; X86-NEXT: vpmovwb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc2] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovwb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1] -; X86-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_256: ; X64: # %bb.0: -; X64-NEXT: vpmovwb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc2] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> 
@llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmov_wb_256(<16 x i16> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmov_wb_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmov_wb_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) - %res3 = add <16 x i8> %res0, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 + %res = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + ret <16 x i8> %res } diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll index bec461abed8b2..73008582aca28 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -1127,167 +1127,163 @@ define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %pt declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) +define <8 x i16>@test_int_x86_avx512_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_hi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2) + ret <8 x i16> %1 +} + define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xca] -; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: 
[0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xca] -; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x1 - %4 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2) - %res2 = add <8 x i16> %3, %4 - ret <8 x i16> %res2 + ret <8 x i16> %3 } define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xca] -; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X86-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x75,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xca] -; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X64-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x75,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer - %4 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x1, <8 x i16> %x0, <8 x i16> %x2) - %res2 = add <8 x i16> %3, %4 - ret <8 x i16> %res2 + ret <8 x i16> %3 +} + +define <16 x i16>@test_int_x86_avx512_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_hi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) + ret <16 x i16> %1 } define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xca] -; 
X86-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xca] -; X64-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x1 - %4 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) - %res2 = add <16 x i16> %3, %4 - ret <16 x i16> %res2 + ret <16 x i16> %3 } define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xca] -; X86-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X86-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x75,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xca] -; X64-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X64-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x75,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer - %4 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x1, <16 x i16> %x0, <16 x i16> %x2) - %res2 = add <16 x i16> %3, %4 - ret <16 x i16> %res2 + ret <16 x i16> %3 } declare <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>) +define <8 x i16>@test_int_x86_avx512_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_hi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2w %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) + ret <8 
x i16> %1 +} + define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2w %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpermi2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x75,0xca] -; X86-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2w %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2w %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x75,0xca] -; X64-NEXT: vpaddw %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x1 - %4 = call <8 x i16> @llvm.x86.avx512.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) - %res2 = add <8 x i16> %3, %4 - ret <8 x i16> %res2 + ret <8 x i16> %3 } declare <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>) +define <16 x i16>@test_int_x86_avx512_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_hi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2w %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) + ret <16 x i16> %1 +} + define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2w %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x75,0xca] -; X86-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2w %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2w %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x75,0xca] -; 
X64-NEXT: vpaddw %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x1 - %4 = call <16 x i16> @llvm.x86.avx512.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) - %res2 = add <16 x i16> %3, %4 - ret <16 x i16> %res2 + ret <16 x i16> %3 } declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) @@ -1716,38 +1712,56 @@ define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, ret void } +define <16 x i8>@test_int_x86_avx512_pmov_wb_256(<16 x i16> %x0) { +; CHECK-LABEL: test_int_x86_avx512_pmov_wb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = trunc <16 x i16> %x0 to <16 x i8> + ret <16 x i8> %1 +} + define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmov_wb_256: ; X86: # %bb.0: -; X86-NEXT: vpmovwb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc2] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovwb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1] -; X86-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_wb_256: ; X64: # %bb.0: -; X64-NEXT: vpmovwb %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc2] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %1 = trunc <16 x i16> %x0 to <16 x i8> + %2 = bitcast i16 %x2 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x1 + ret <16 x i8> %3 +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmov_wb_256(<16 x i16> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmov_wb_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmov_wb_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmovwb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; 
X64-NEXT: retq # encoding: [0xc3] %1 = trunc <16 x i16> %x0 to <16 x i8> - %2 = trunc <16 x i16> %x0 to <16 x i8> - %3 = bitcast i16 %x2 to <16 x i1> - %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %x1 - %5 = trunc <16 x i16> %x0 to <16 x i8> - %6 = bitcast i16 %x2 to <16 x i1> - %7 = select <16 x i1> %6, <16 x i8> %5, <16 x i8> zeroinitializer - %res3 = add <16 x i8> %1, %4 - %res4 = add <16 x i8> %res3, %7 - ret <16 x i8> %res4 + %2 = bitcast i16 %x2 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 } declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16) @@ -1776,34 +1790,52 @@ define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovs_wb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x20,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovs_wb_256: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovswb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x20,0xc1] -; X86-NEXT: vpmovswb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc2] -; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca] -; X86-NEXT: vpmovswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x20,0xc0] -; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovs_wb_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpmovswb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc2] ; X64-NEXT: vpmovswb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x20,0xc1] -; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca] -; X64-NEXT: vpmovswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x20,0xc0] -; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) - %res3 = add <16 x i8> %res0, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 + %res = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmovs_wb_256(<16 x i16> %x0, i16 %x2) { +; X86-LABEL: 
test_int_x86_avx512_maskz_pmovs_wb_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovswb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovs_wb_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovswb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc0] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + ret <16 x i8> %res } declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16) @@ -1832,34 +1864,52 @@ define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmovus_wb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovuswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x10,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovus_wb_256: ; X86: # %bb.0: ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmovuswb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x10,0xc1] -; X86-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc2] -; X86-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca] -; X86-NEXT: vpmovuswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x10,0xc0] -; X86-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovus_wb_256: ; X64: # %bb.0: ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpmovuswb %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc2] ; X64-NEXT: vpmovuswb %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x10,0xc1] -; X64-NEXT: vpaddb %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xca] -; X64-NEXT: vpmovuswb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x10,0xc0] -; X64-NEXT: vpaddb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 -1) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) - %res3 = add <16 x i8> %res0, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 
x i8> %res4 + %res = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmovus_wb_256(<16 x i16> %x0, i16 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovus_wb_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmovuswb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovus_wb_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmovuswb %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc0] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %x0, <16 x i8> zeroinitializer, i16 %x2) + ret <16 x i8> %res } declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16) @@ -2360,70 +2410,100 @@ declare <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16>, <16 x i16>) declare <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16>, <8 x i16>) +define <8 x i16>@test_int_x86_avx512_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_permvar_hi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermw %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1) + ret <8 x i16> %1 +} + define <8 x i16>@test_int_x86_avx512_mask_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_128: ; X86: # %bb.0: -; X86-NEXT: vpermw %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] ; X86-NEXT: vpermw %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x8d,0xd0] -; X86-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] -; X86-NEXT: vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3] -; X86-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_128: ; X64: # %bb.0: -; X64-NEXT: vpermw %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermw %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0x8d,0xd0] -; X64-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] -; X64-NEXT: vpaddw %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3] -; X64-NEXT: vpaddw %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x2 - %4 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1) - 
%5 = bitcast i8 %x3 to <8 x i1> - %6 = select <8 x i1> %5, <8 x i16> %4, <8 x i16> zeroinitializer - %7 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1) - %res3 = add <8 x i16> %3, %6 - %res4 = add <8 x i16> %res3, %7 - ret <8 x i16> %res4 + ret <8 x i16> %3 +} + +define <8 x i16>@test_int_x86_avx512_maskz_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_hi_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_hi_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.avx512.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 } declare <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16>, <16 x i16>) +define <16 x i16>@test_int_x86_avx512_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1) { +; CHECK-LABEL: test_int_x86_avx512_permvar_hi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermw %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1) + ret <16 x i16> %1 +} + define <16 x i16>@test_int_x86_avx512_mask_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_hi_256: ; X86: # %bb.0: -; X86-NEXT: vpermw %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xd8] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermw %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0] -; X86-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] -; X86-NEXT: vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3] -; X86-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_hi_256: ; X64: # %bb.0: -; X64-NEXT: vpermw %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermw %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0] -; X64-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] -; X64-NEXT: vpaddw %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3] -; X64-NEXT: vpaddw %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x2 - %4 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1) - %5 = 
bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i16> %4, <16 x i16> zeroinitializer - %7 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1) - %res3 = add <16 x i16> %3, %6 - %res4 = add <16 x i16> %res3, %7 - ret <16 x i16> %res4 + ret <16 x i16> %3 } +define <16 x i16>@test_int_x86_avx512_maskz_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_hi_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_hi_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx512.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} diff --git a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll index 64659d4558604..0029d6a60881b 100644 --- a/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll @@ -4,184 +4,277 @@ declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vextractf64x2_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 # encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vextractf64x2_512: ; X86: # %bb.0: -; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 # encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x19,0xc1,0x01] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x58,0xca] -; X86-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01] -; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x58,0xc1] +; X86-NEXT: vmovapd %xmm1, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vextractf64x2_512: ; X64: # %bb.0: -; X64-NEXT: vextractf128 $1, %ymm0, %xmm2 # encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vextractf64x2 $1, %zmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x19,0xc1,0x01] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # encoding: [0xc5,0xf1,0x58,0xca] -; X64-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01] -; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x58,0xc1] +; X64-NEXT: vmovapd %xmm1, 
%xmm0 # encoding: [0xc5,0xf9,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res2, %res3 - ret <2 x double> %res4 + ret <2 x double> %res +} + +define <2 x double>@test_int_x86_avx512_maskz_vextractf64x2_512(<8 x double> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vextractf64x2_512: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vextractf64x2_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vextractf64x2 $1, %zmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x19,0xc0,0x01] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) + ret <2 x double> %res } declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_vextractf32x8(<16 x float> %x0, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vextractf32x8: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1b,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vextractf32x8: ; X86: # %bb.0: -; X86-NEXT: vextractf64x4 $1, %zmm0, %ymm2 # encoding: [0x62,0xf3,0xfd,0x48,0x1b,0xc2,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x1b,0xc1,0x01] -; X86-NEXT: vaddps %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf4,0x58,0xca] -; X86-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1b,0xc0,0x01] -; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x58,0xc1] +; X86-NEXT: vmovaps %ymm1, %ymm0 # encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vextractf32x8: ; X64: # %bb.0: -; X64-NEXT: vextractf64x4 $1, %zmm0, %ymm2 # encoding: [0x62,0xf3,0xfd,0x48,0x1b,0xc2,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x1b,0xc1,0x01] -; X64-NEXT: vaddps %ymm2, %ymm1, %ymm1 # encoding: [0xc5,0xf4,0x58,0xca] -; X64-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1b,0xc0,0x01] -; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x58,0xc1] +; X64-NEXT: vmovaps %ymm1, %ymm0 # encoding: 
[0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3) - %res2 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 -1) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res2, %res3 - ret <8 x float> %res4 + ret <8 x float> %res +} + +define <8 x float>@test_int_x86_avx512_maskz_vextractf32x8(<16 x float> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vextractf32x8: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1b,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vextractf32x8: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1b,0xc0,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> zeroinitializer, i8 %x3) + ret <8 x float> %res } declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x float>, i32, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_insertf32x8_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_insertf32x8_512: ; X86: # %bb.0: -; X86-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd9,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x1a,0xd1,0x01] -; X86-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc1,0x01] -; X86-NEXT: vaddps %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0] -; X86-NEXT: vaddps %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x64,0x48,0x58,0xc0] +; X86-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_insertf32x8_512: ; X64: # %bb.0: -; X64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x1a,0xd1,0x01] -; X64-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc1,0x01] -; X64-NEXT: vaddps %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc0] -; X64-NEXT: vaddps %zmm0, %zmm3, %zmm0 # encoding: [0x62,0xf1,0x64,0x48,0x58,0xc0] +; X64-NEXT: vmovaps %zmm2, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc2] ; X64-NEXT: 
retq # encoding: [0xc3] %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) - %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) - %res2 = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> %x3, i16 -1) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res2, %res3 - ret <16 x float> %res4 + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, i16 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_insertf32x8_512: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_insertf32x8_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float> %x0, <8 x float> %x1, i32 1, <16 x float> zeroinitializer, i16 %x4) + ret <16 x float> %res } declare <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double>, <2 x double>, i32, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_insertf64x2_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_insertf64x2_512: ; X86: # %bb.0: -; X86-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x18,0xd9,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x18,0xd1,0x01] -; X86-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x18,0xc1,0x01] -; X86-NEXT: vaddpd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] -; X86-NEXT: vaddpd %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc3] +; X86-NEXT: vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_insertf64x2_512: ; X64: # %bb.0: -; X64-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x18,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x18,0xd1,0x01] -; X64-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x18,0xc1,0x01] -; X64-NEXT: vaddpd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0x58,0xc0] -; X64-NEXT: vaddpd %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc3] +; X64-NEXT: 
vmovapd %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) - %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) - %res2 = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> %x3, i8 -1) - %res3 = fadd <8 x double> %res, %res1 - %res4 = fadd <8 x double> %res3, %res2 - ret <8 x double> %res4 + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_insertf64x2_512(<8 x double> %x0, <2 x double> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_insertf64x2_512: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x18,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_insertf64x2_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x18,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double> %x0, <2 x double> %x1, i32 1, <8 x double> zeroinitializer, i8 %x4) + ret <8 x double> %res } declare <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32>, <8 x i32>, i32, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_inserti32x8_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) + ret <16 x i32> %res +} + define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_inserti32x8_512: ; X86: # %bb.0: -; X86-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd9,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3a,0xd1,0x01] -; X86-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc1,0x01] -; X86-NEXT: vpaddd %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] -; X86-NEXT: vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_inserti32x8_512: ; X64: # %bb.0: -; X64-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3a,0xd1,0x01] -; X64-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc1,0x01] -; X64-NEXT: vpaddd %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc3] -; X64-NEXT: vpaddd %zmm0, %zmm2, %zmm0 # encoding: 
[0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, i16 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_inserti32x8_512: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_inserti32x8_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32> %x0, <8 x i32> %x1, i32 1, <16 x i32> zeroinitializer, i16 %x4) + ret <16 x i32> %res } declare <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64>, <2 x i64>, i32, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_inserti64x2_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 # encoding: [0x62,0xf3,0x7d,0x48,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_inserti64x2_512: ; X86: # %bb.0: -; X86-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x38,0xd9,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x38,0xd1,0x01] -; X86-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x38,0xc1,0x01] -; X86-NEXT: vpaddq %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3] -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_inserti64x2_512: ; X64: # %bb.0: -; X64-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 # encoding: [0x62,0xf3,0x7d,0x48,0x38,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x38,0xd1,0x01] -; X64-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x38,0xc1,0x01] -; X64-NEXT: vpaddq %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc3] -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 # encoding: 
[0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) - %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) - %res3 = add <8 x i64> %res, %res1 - %res4 = add <8 x i64> %res2, %res3 - ret <8 x i64> %res4 + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_inserti64x2_512: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x38,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_inserti64x2_512: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x38,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64> %x0, <2 x i64> %x1, i32 1, <8 x i64> zeroinitializer, i8 %x4) + ret <8 x i64> %res } @@ -223,35 +316,55 @@ define <8 x i64>@test_int_x86_avx512_cvtmask2q_512(i8 %x0) { declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float>, <16 x float>, i16) +define <16 x float>@test_int_x86_avx512_broadcastf32x8_512(<8 x float> %x0, <16 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcastf32x8_512: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + + %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 -1) + ret <16 x float> %res +} + define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512(<8 x float> %x0, <16 x float> %x2, i16 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512: ; X86: # %bb.0: ; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x1a,0xc8,0x01] -; X86-NEXT: vaddps %zmm1, %zmm2, %zmm1 # encoding: [0x62,0xf1,0x6c,0x48,0x58,0xc9] -; X86-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc0,0x01] -; X86-NEXT: vaddps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] +; X86-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512: ; X64: # %bb.0: ; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x1a,0xc8,0x01] -; X64-NEXT: vaddps %zmm1, %zmm2, %zmm1 # encoding: 
[0x62,0xf1,0x6c,0x48,0x58,0xc9] +; X64-NEXT: vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + + %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask) + ret <16 x float> %res +} + +define <16 x float>@test_int_x86_avx512_maskz_broadcastf32x8_512(<8 x float> %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcastf32x8_512: +; X86: # %bb.0: +; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcastf32x8_512: +; X64: # %bb.0: +; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc0,0x01] -; X64-NEXT: vaddps %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 -1) - %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> %x2, i16 %mask) - %res3 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> zeroinitializer, i16 %mask) - %res4 = fadd <16 x float> %res1, %res2 - %res5 = fadd <16 x float> %res3, %res4 - ret <16 x float> %res5 + %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float> %x0, <16 x float> zeroinitializer, i16 %mask) + ret <16 x float> %res } define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512_load(<8 x float>* %x0ptr, <16 x float> %x2, i16 %mask) { @@ -277,37 +390,60 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512_load(<8 x float> declare <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double>, <8 x double>, i8) +define <8 x double>@test_int_x86_avx512_broadcastf64x2_512(<2 x double> %x0, <8 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcastf64x2_512: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 -1) + ret <8 x double> %res +} + define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512(<2 x double> %x0, <8 x double> %x2, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512: ; X86: # %bb.0: ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] -; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xc8,0x01] -; X86-NEXT: vaddpd %zmm1, %zmm2, %zmm1 # encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9] -; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc0,0x01] -; X86-NEXT: vaddpd 
%zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] +; X86-NEXT: vmovapd %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512: ; X64: # %bb.0: ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] -; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x1a,0xc8,0x01] -; X64-NEXT: vaddpd %zmm1, %zmm2, %zmm1 # encoding: [0x62,0xf1,0xed,0x48,0x58,0xc9] +; X64-NEXT: vmovapd %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask) + ret <8 x double> %res +} + +define <8 x double>@test_int_x86_avx512_maskz_broadcastf64x2_512(<2 x double> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcastf64x2_512: +; X86: # %bb.0: +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcastf64x2_512: +; X64: # %bb.0: +; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x1a,0xc0,0x01] -; X64-NEXT: vaddpd %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x58,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 -1) - %res2 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> %x2, i8 %mask) - %res3 = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> zeroinitializer, i8 %mask) - %res4 = fadd <8 x double> %res1, %res2 - %res5 = fadd <8 x double> %res3, %res4 - ret <8 x double> %res5 + %res = call <8 x double> @llvm.x86.avx512.mask.broadcastf64x2.512(<2 x double> %x0, <8 x double> zeroinitializer, i8 %mask) + ret <8 x double> %res } define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512_load(<2 x double>* %x0ptr, <8 x double> %x2, i8 %mask) { @@ -333,35 +469,55 @@ define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512_load(<2 x double declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32>, <16 x i32>, i16) +define <16 x i32>@test_int_x86_avx512_broadcasti32x8_512(<8 x i32> %x0, <16 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcasti32x8_512: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + + %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 -1) + ret <16 x i32> %res +} + define <16 x 
i32>@test_int_x86_avx512_mask_broadcasti32x8_512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512: ; X86: # %bb.0: ; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3a,0xc8,0x01] -; X86-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc0,0x01] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] -; X86-NEXT: vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512: ; X64: # %bb.0: ; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3a,0xc8,0x01] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + + %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) + ret <16 x i32> %res +} + +define <16 x i32>@test_int_x86_avx512_maskz_broadcasti32x8_512(<8 x i32> %x0, i16 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcasti32x8_512: +; X86: # %bb.0: +; X86-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcasti32x8_512: +; X64: # %bb.0: +; X64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc0,0x01] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] -; X64-NEXT: vpaddd %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 -1) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> %x2, i16 %mask) - %res3 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) - %res4 = add <16 x i32> %res1, %res2 - %res5 = add <16 x i32> %res3, %res4 - ret <16 x i32> %res5 + %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x8.512(<8 x i32> %x0, <16 x i32> zeroinitializer, i16 %mask) + ret <16 x i32> %res } define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512_load(<8 x i32>* %x0ptr, <16 x i32> %x2, i16 %mask) { @@ -387,37 +543,60 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512_load(<8 x i32>* %x declare <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64>, <8 x i64>, i8) +define <8 x i64>@test_int_x86_avx512_broadcasti64x2_512(<2 x i64> %x0, <8 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcasti64x2_512: +; CHECK: # 
%bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 -1) + ret <8 x i64> %res +} + define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512: ; X86: # %bb.0: ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; X86-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] -; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xc8,0x01] -; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc0,0x01] -; X86-NEXT: vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] -; X86-NEXT: vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512: ; X64: # %bb.0: ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] -; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3a,0xc8,0x01] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) + ret <8 x i64> %res +} + +define <8 x i64>@test_int_x86_avx512_maskz_broadcasti64x2_512(<2 x i64> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcasti64x2_512: +; X86: # %bb.0: +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcasti64x2_512: +; X64: # %bb.0: +; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xc9,0x3a,0xc0,0x01] -; X64-NEXT: vpaddq %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0xf5,0x48,0xd4,0xc0] -; X64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0xed,0x48,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 -1) - %res2 = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> %x2, i8 %mask) - %res3 = call <8 x i64> 
@llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) - %res4 = add <8 x i64> %res1, %res2 - %res5 = add <8 x i64> %res3, %res4 - ret <8 x i64> %res5 + %res = call <8 x i64> @llvm.x86.avx512.mask.broadcasti64x2.512(<2 x i64> %x0, <8 x i64> zeroinitializer, i8 %mask) + ret <8 x i64> %res } define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512_load(<2 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) { @@ -441,74 +620,6 @@ define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512_load(<2 x i64>* %x0 ret <8 x i64> %res } -declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float>, <16 x float>, i16) - -define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0, <16 x float> %x2, i16 %x3) { -; X86-LABEL: test_int_x86_avx512_mask_broadcastf32x2_512: -; X86: # %bb.0: -; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] -; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x1a,0xc8,0x01] -; X86-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc0,0x01] -; X86-NEXT: vaddps %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x74,0x48,0x58,0xc0] -; X86-NEXT: vaddps %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_int_x86_avx512_mask_broadcastf32x2_512: -; X64: # %bb.0: -; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] -; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x1a,0xd0,0x01] -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x1a,0xc8,0x01] -; X64-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x1a,0xc0,0x01] -; X64-NEXT: vaddps %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x74,0x48,0x58,0xc0] -; X64-NEXT: vaddps %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x58,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3) - %res1 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> zeroinitializer, i16 %x3) - %res2 = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 -1) - %res3 = fadd <16 x float> %res, %res1 - %res4 = fadd <16 x float> %res3, %res2 - ret <16 x float> %res4 -} - -declare <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32>, <16 x i32>, i16) - -define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3) { -; X86-LABEL: test_int_x86_avx512_mask_broadcasti32x2_512: -; X86: # %bb.0: -; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] -; X86-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: 
[0x62,0xf3,0x7d,0x49,0x3a,0xc8,0x01] -; X86-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc0,0x01] -; X86-NEXT: vpaddd %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X86-NEXT: vpaddd %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_int_x86_avx512_mask_broadcasti32x2_512: -; X64: # %bb.0: -; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x38,0xc0,0x01] -; X64-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2 # encoding: [0x62,0xf3,0xfd,0x48,0x3a,0xd0,0x01] -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3a,0xc8,0x01] -; X64-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xc9,0x3a,0xc0,0x01] -; X64-NEXT: vpaddd %zmm2, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfe,0xc2] -; X64-NEXT: vpaddd %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfe,0xc0] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3) - %res1 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> zeroinitializer, i16 %x3) - %res2 = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 -1) - %res3 = add <16 x i32> %res, %res1 - %res4 = add <16 x i32> %res3, %res2 - ret <16 x i32> %res4 -} - declare i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32>) define i16@test_int_x86_avx512_cvtd2mask_512(<16 x i32> %x0) { diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll index 879a1643325d8..ed83a23a5f4c7 100644 --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -2388,94 +2388,142 @@ declare <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64>, <2 x i64>, <2 x i declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double>, i32, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vextractf64x2_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0, <2 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vextractf64x2_256: ; X86: # %bb.0: -; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] -; X86-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] -; X86-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] +; X86-NEXT: vmovapd %xmm1, %xmm0 
# EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vextractf64x2_256: ; X64: # %bb.0: -; X64-NEXT: vextractf128 $1, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] -; X64-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] -; X64-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] +; X64-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 -1) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res3, %res2 - ret <2 x double> %res4 + ret <2 x double> %res +} + +define <2 x double>@test_int_x86_avx512_maskz_vextractf64x2_256(<4 x double> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vextractf64x2_256: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vextractf64x2_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> zeroinitializer, i8 %x3) + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double>, <2 x double>, i32, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_insertf64x2_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, <4 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_insertf64x2_256: ; X86: # %bb.0: -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01] -; X86-NEXT: 
vaddpd %ymm3, %ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3] -; X86-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01] -; X86-NEXT: vaddpd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2] +; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_insertf64x2_256: ; X64: # %bb.0: -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01] -; X64-NEXT: vaddpd %ymm3, %ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3] -; X64-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01] -; X64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2] +; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4) - %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1) - %res2 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> zeroinitializer, i8 %x4) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res2, %res3 - ret <4 x double> %res4 + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_insertf64x2_256(<4 x double> %x0, <2 x double> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_insertf64x2_256: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_insertf64x2_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> zeroinitializer, i8 %x4) + ret <4 x double> %res } declare <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64>, <2 x i64>, i32, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_inserti64x2_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_inserti64x2_256: ; X86: # %bb.0: -; X86-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 
# encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01] -; X86-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01] -; X86-NEXT: vpaddq %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xd4,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_inserti64x2_256: ; X64: # %bb.0: -; X64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01] -; X64-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01] -; X64-NEXT: vpaddq %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xd4,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> zeroinitializer, i8 %x4) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_inserti64x2_256(<4 x i64> %x0, <2 x i64> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_inserti64x2_256: +; X86: # %bb.0: +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_inserti64x2_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> zeroinitializer, i8 %x4) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.cvtmask2d.128(i8) @@ -2552,35 +2600,55 @@ define <4 x i64>@test_int_x86_avx512_cvtmask2q_256(i8 %x0) { declare <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_broadcastf64x2_256(<2 x double> %x0, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcastf64x2_256: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + + %res = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256(<2 x 
double> %x0, <4 x double> %x2, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256: ; X86: # %bb.0: ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x18,0xc8,0x01] -; X86-NEXT: vaddpd %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] -; X86-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc0,0x01] -; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] +; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256: ; X64: # %bb.0: ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x18,0xc8,0x01] -; X64-NEXT: vaddpd %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] +; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + + %res = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask) + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_broadcastf64x2_256(<2 x double> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcastf64x2_256: +; X86: # %bb.0: +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcastf64x2_256: +; X64: # %bb.0: +; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc0,0x01] -; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 -1) - %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> %x2, i8 %mask) - %res3 = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> zeroinitializer, i8 %mask) - %res4 = fadd <4 x double> %res1, %res2 - %res5 = fadd <4 x double> %res3, %res4 - ret <4 x double> %res5 + %res = call <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double> %x0, <4 x double> zeroinitializer, i8 %mask) + ret <4 x double> %res } define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256_load(<2 x double>* %x0ptr, <4 x double> %x2, i8 %mask) { @@ -2606,35 +2674,55 @@ define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256_load(<2 x double declare <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64>, <4 x i64>, i8) +define <4 x 
i64>@test_int_x86_avx512_broadcasti64x2_256(<2 x i64> %x0, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcasti64x2_256: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + + %res = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256: ; X86: # %bb.0: ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x38,0xc8,0x01] -; X86-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc0,0x01] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256: ; X64: # %bb.0: ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x38,0xc8,0x01] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + + %res = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_broadcasti64x2_256(<2 x i64> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcasti64x2_256: +; X86: # %bb.0: +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcasti64x2_256: +; X64: # %bb.0: +; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti64x2 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc0,0x01] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 -1) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> %x2, i8 %mask) - %res3 = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> zeroinitializer, i8 %mask) - %res4 = add <4 x i64> %res1, %res2 - %res5 = add <4 x i64> %res3, %res4 - ret <4 x i64> 
%res5 + %res = call <4 x i64> @llvm.x86.avx512.mask.broadcasti64x2.256(<2 x i64> %x0, <4 x i64> zeroinitializer, i8 %mask) + ret <4 x i64> %res } define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256_load(<2 x i64>* %x0ptr, <4 x i64> %x2, i8 %mask) { @@ -2658,104 +2746,6 @@ define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256_load(<2 x i64>* %x0 ret <4 x i64> %res } -declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float>, <8 x float>, i8) - -define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0, <8 x float> %x2, i8 %x3) { -; X86-LABEL: test_int_x86_avx512_mask_broadcastf32x2_256: -; X86: # %bb.0: -; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01] -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x18,0xc8,0x01] -; X86-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc0,0x01] -; X86-NEXT: vaddps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] -; X86-NEXT: vaddps %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_int_x86_avx512_mask_broadcastf32x2_256: -; X64: # %bb.0: -; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01] -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x18,0xc8,0x01] -; X64-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc0,0x01] -; X64-NEXT: vaddps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] -; X64-NEXT: vaddps %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %x3) - %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 -1) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res3, %res2 - ret <8 x float> %res4 -} - -declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32>, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x2_256(<4 x i32> %x0, <8 x i32> %x2, i8 %x3, i64 * %y_ptr) { -; X86-LABEL: test_int_x86_avx512_mask_broadcasti32x2_256: -; X86: # %bb.0: -; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vbroadcasti32x2 (%eax), %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x59,0x08] -; X86-NEXT: # ymm1 {%k1} = mem[0,1,0,1,0,1,0,1] -; X86-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01] -; X86-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc0,0x01] -; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X86-NEXT: vpaddd 
%ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_int_x86_avx512_mask_broadcasti32x2_256: -; X64: # %bb.0: -; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vbroadcasti32x2 (%rsi), %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x59,0x0e] -; X64-NEXT: # ymm1 {%k1} = mem[0,1,0,1,0,1,0,1] -; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01] -; X64-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc0,0x01] -; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; X64-NEXT: retq # encoding: [0xc3] - %y_64 = load i64, i64 * %y_ptr - %y_v2i64 = insertelement <2 x i64> undef, i64 %y_64, i32 0 - %y = bitcast <2 x i64> %y_v2i64 to <4 x i32> - %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %y, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %x3) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 -} - -declare <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32>, <4 x i32>, i8) - -define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3) { -; X86-LABEL: test_int_x86_avx512_mask_broadcasti32x2_128: -; X86: # %bb.0: -; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8] -; X86-NEXT: vmovdqa32 %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x6f,0xd0] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_int_x86_avx512_mask_broadcasti32x2_128: -; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vmovdqa32 %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc8] -; X64-NEXT: vmovdqa32 %xmm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x6f,0xd0] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x3) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 -1) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 -} - declare i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32>) define i8@test_int_x86_avx512_cvtd2mask_128(<4 x i32> %x0) { @@ -2812,98 +2802,118 @@ define i8@test_int_x86_avx512_cvtq2mask_256(<4 x i64> %x0) { declare <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_cvt_qq2pd_128(<2 x 
i64> %x0, <2 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_qq2pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2pd %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfe,0x08,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_cvt_qq2pd_128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_128: ; X86: # %bb.0: -; X86-NEXT: vcvtqq2pd %xmm0, %xmm2 # encoding: [0x62,0xf1,0xfe,0x08,0xe6,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0xe6,0xc8] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_128: ; X64: # %bb.0: -; X64-NEXT: vcvtqq2pd %xmm0, %xmm2 # encoding: [0x62,0xf1,0xfe,0x08,0xe6,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0xe6,0xc8] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) - %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_cvt_qq2pd_256(<4 x i64> %x0, <4 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_qq2pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2pd %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfe,0x28,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_cvt_qq2pd_256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_256: ; X86: # %bb.0: -; X86-NEXT: vcvtqq2pd %ymm0, %ymm2 # encoding: [0x62,0xf1,0xfe,0x28,0xe6,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2pd %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0xe6,0xc8] -; X86-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_256: ; X64: # %bb.0: -; X64-NEXT: vcvtqq2pd %ymm0, %ymm2 # encoding: [0x62,0xf1,0xfe,0x28,0xe6,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2pd %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0xe6,0xc8] -; X64-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> 
@llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) - %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_cvt_uqq2pd_128(<2 x i64> %x0, <2 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_uqq2pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2pd %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfe,0x08,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_128: ; X86: # %bb.0: -; X86-NEXT: vcvtuqq2pd %xmm0, %xmm2 # encoding: [0x62,0xf1,0xfe,0x08,0x7a,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0x7a,0xc8] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_128: ; X64: # %bb.0: -; X64-NEXT: vcvtuqq2pd %xmm0, %xmm2 # encoding: [0x62,0xf1,0xfe,0x08,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x09,0x7a,0xc8] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) - %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64> %x0, <2 x double> %x1, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_cvt_uqq2pd_256(<4 x i64> %x0, <4 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_uqq2pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2pd %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfe,0x28,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_256: ; X86: # %bb.0: -; X86-NEXT: vcvtuqq2pd %ymm0, %ymm2 # encoding: [0x62,0xf1,0xfe,0x28,0x7a,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2pd %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0x7a,0xc8] -; X86-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_256: ; X64: # %bb.0: -; X64-NEXT: vcvtuqq2pd %ymm0, %ymm2 # encoding: 
[0x62,0xf1,0xfe,0x28,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2pd %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfe,0x29,0x7a,0xc8] -; X64-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) - %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64> %x0, <4 x double> %x1, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float>, i32, i8) @@ -2970,52 +2980,64 @@ define i8 @test_int_x86_avx512_mask_fpclass_pd_256(<4 x double> %x0) { declare <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64>, <4 x float>, i8) +define <4 x float> @test_int_x86_avx512_cvt_qq2ps_256(<4 x i64> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_qq2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float> @test_int_x86_avx512_mask_cvt_qq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_256: ; X86: # %bb.0: -; X86-NEXT: vcvtqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_256: ; X64: # %bb.0: -; X64-NEXT: vcvtqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64>, <4 x float>, i8) +define <4 x float> @test_int_x86_avx512_cvt_uqq2ps_256(<4 x i64> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_uqq2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x28,0x7a,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x 
float> @test_int_x86_avx512_mask_cvt_uqq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_256: ; X86: # %bb.0: -; X86-NEXT: vcvtuqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xff,0x28,0x7a,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_256: ; X64: # %bb.0: -; X64-NEXT: vcvtuqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xff,0x28,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } diff --git a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll index 431d6f9d28f00..169ce93170bf6 100644 --- a/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512dqvl-intrinsics.ll @@ -4,122 +4,147 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2qq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x7b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtpd2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7b,0xc8] -; X86-NEXT: vcvtpd2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x7b,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7b,0xc8] -; X64-NEXT: vcvtpd2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x7b,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 
x double> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2qq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2qq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x7b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtpd2qq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7b,0xc8] -; X86-NEXT: vcvtpd2qq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x7b,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2qq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7b,0xc8] -; X64-NEXT: vcvtpd2qq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x7b,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2uqq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtpd2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x79,0xc8] -; X86-NEXT: vcvtpd2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x79,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x79,0xc8] -; X64-NEXT: vcvtpd2uqq %xmm0, 
%xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x79,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2uqq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2uqq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtpd2uqq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x79,0xc8] -; X86-NEXT: vcvtpd2uqq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x79,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2uqq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x79,0xc8] -; X64-NEXT: vcvtpd2uqq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x79,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2qq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtps2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7b,0xc8] -; X86-NEXT: vcvtps2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7b,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX 
TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7b,0xc8] -; X64-NEXT: vcvtps2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7b,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } define <2 x i64> @test_int_x86_avx512_cvt_ps2qq_128_load(<2 x float>* %p) { @@ -287,50 +312,60 @@ define <2 x i64> @test_int_x86_avx512_maskz_cvt_ps2qq_128_load_3(<4 x float>* %p declare <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2qq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2qq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x7b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtps2qq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7b,0xc8] -; X86-NEXT: vcvtps2qq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x7b,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2qq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7b,0xc8] -; X64-NEXT: vcvtps2qq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x7b,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2uqq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_128(<4 
x float> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtps2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x79,0xc8] -; X86-NEXT: vcvtps2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x79,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x79,0xc8] -; X64-NEXT: vcvtps2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x79,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } define <2 x i64> @test_int_x86_avx512_cvt_ps2uqq_128_load(<2 x float>* %p) { @@ -497,50 +532,70 @@ define <2 x i64> @test_int_x86_avx512_maskz_cvt_ps2uqq_128_load_3(<4 x float>* % declare <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2uqq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2uqq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtps2uqq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x79,0xc8] -; X86-NEXT: vcvtps2uqq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x79,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2uqq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x79,0xc8] -; X64-NEXT: vcvtps2uqq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x79,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <4 x float> 
@llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_ask_cvt_qq2ps_128(<2 x i64> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_ask_cvt_qq2ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8] -; X86-NEXT: vcvtqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8] -; X64-NEXT: vcvtqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_cvt_qq2ps_128_zext(<2 x i64> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_qq2ps_128_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) + %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x float> %res3 } define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128_zext(<2 x i64> %x0, <4 x float> %x1, i8 %x2) { @@ -548,170 +603,197 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128_zext(<2 x i64> %x0, <4 ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8] -; X86-NEXT: vcvtqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8] -; X64-NEXT: vcvtqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0]
+; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) - %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - %res4 = fadd <4 x float> %res1, %res3 - ret <4 x float> %res4 + ret <4 x float> %res1 +} + + +define <4 x float>@test_int_x86_avx512_cvt_qq2ps_256(<4 x i64> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_qq2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtqq2ps %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %cvt = sitofp <4 x i64> %x0 to <4 x float> + ret <4 x float> %cvt } define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_256: ; X86: # %bb.0: -; X86-NEXT: vcvtqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_256: ; X64: # %bb.0: -; X64-NEXT: vcvtqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %cvt1 = sitofp <4 x i64> %x0 to <4 x float> %1 = bitcast i8 %x2 to <8 x i1> %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> %2 = select <4 x i1> %extract, <4 x float> %cvt1, <4 x float> %x1 - %cvt = sitofp <4 x i64> %x0 to <4 x float> - %res2 = fadd <4 x float> %2, %cvt - ret <4 x float> %res2 + ret <4 x float> %2 } declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvtt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2qq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7a,0xc8] -; X86-NEXT: vcvttpd2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x7a,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1,
%xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x7a,0xc8] -; X64-NEXT: vcvttpd2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x7a,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvtt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2qq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2qq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttpd2qq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7a,0xc8] -; X86-NEXT: vcvttpd2qq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x7a,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2qq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x7a,0xc8] -; X64-NEXT: vcvttpd2qq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x7a,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvtt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2uqq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) 
{ ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x78,0xc8] -; X86-NEXT: vcvttpd2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x78,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x78,0xc8] -; X64-NEXT: vcvttpd2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfd,0x08,0x78,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvtt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2uqq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2uqq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x78,0xc8] -; X86-NEXT: vcvttpd2uqq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x78,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2uqq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x78,0xc8] -; X64-NEXT: vcvttpd2uqq %ymm0, %ymm0 # encoding: [0x62,0xf1,0xfd,0x28,0x78,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvtt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2qq_128: 
+; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0xc8] -; X86-NEXT: vcvttps2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2qq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x7a,0xc8] -; X64-NEXT: vcvttps2qq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x7a,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } define <2 x i64> @test_int_x86_avx512_cvtt_ps2qq_128_load(<2 x float>* %p) { @@ -879,50 +961,70 @@ define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2qq_128_load_3(<4 x float>* % declare <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvtt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2qq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2qq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7a,0xc8] -; X86-NEXT: vcvttps2qq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x7a,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2qq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x7a,0xc8] -; X64-NEXT: vcvttps2qq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x7a,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq 
# encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_cvt_uqq2ps_128(<2 x i64> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_uqq2ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8] -; X86-NEXT: vcvtuqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8] -; X64-NEXT: vcvtuqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_cvt_uqq2ps_128_zext(<2 x i64> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_uqq2ps_128_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) + %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> + ret <4 x float> %res3 } define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128_zext(<2 x i64> %x0, <4 x float> %x1, i8 %x2) { @@ -930,76 +1032,82 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128_zext(<2 x i64> %x0, < ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8] -; X86-NEXT: vcvtuqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128_zext: ; X64: #
%bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8] -; X64-NEXT: vcvtuqq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64> %x0, <4 x float> %x1, i8 -1) - %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 4, i32 5> - %res4 = fadd <4 x float> %res1, %res3 - ret <4 x float> %res4 + ret <4 x float> %res1 } declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_cvt_uqq2ps_256(<4 x i64> %x0) { +; CHECK-LABEL: test_int_x86_avx512_cvt_uqq2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtuqq2ps %ymm0, %xmm0 # encoding: [0x62,0xf1,0xff,0x28,0x7a,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %cvt = uitofp <4 x i64> %x0 to <4 x float> + ret <4 x float> %cvt +} + define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_256: ; X86: # %bb.0: -; X86-NEXT: vcvtuqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xff,0x28,0x7a,0xd0] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_256: ; X64: # %bb.0: -; X64-NEXT: vcvtuqq2ps %ymm0, %xmm2 # encoding: [0x62,0xf1,0xff,0x28,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtuqq2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %cvt1 = uitofp <4 x i64> %x0 to <4 x float> + %cvt = uitofp <4 x i64> %x0 to <4 x float> %1 = bitcast i8 %x2 to <8 x i1> %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> - %2 = select <4 x i1> %extract, <4 x float> %cvt1, <4 x float> %x1 - %cvt = uitofp <4 x i64> %x0 to <4 x float> - %res2 = fadd <4 x float> %2, %cvt - ret <4 x float> %res2 + %2 = select <4 x i1> %extract, <4 x float> %cvt, <4 x float> %x1 + ret <4 x float> %2 } declare <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_cvtt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2uqq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64>
@llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0xc8] -; X86-NEXT: vcvttps2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2uqq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x78,0xc8] -; X64-NEXT: vcvttps2uqq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7d,0x08,0x78,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float> %x0, <2 x i64> %x1, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } define <2 x i64> @test_int_x86_avx512_cvtt_ps2uqq_128_load(<2 x float>* %p) { @@ -1167,26 +1275,31 @@ define <2 x i64> @test_int_x86_avx512_maskz_cvtt_ps2uqq_128_load_3(<4 x float>* declare <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_cvtt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2uqq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2uqq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_256: ; X86: # %bb.0: ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x78,0xc8] -; X86-NEXT: vcvttps2uqq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x78,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2uqq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x78,0xc8] -; X64-NEXT: vcvttps2uqq %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7d,0x28,0x78,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> 
@llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float> %x0, <4 x i64> %x1, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double>, i32, <2 x double>, i8) diff --git a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll index 8c6b982fe7475..746e3858333e6 100644 --- a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics-upgrade.ll @@ -4,130 +4,150 @@ declare <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_qi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_qi_512: ; X86: # %bb.0: -; X86-NEXT: vpermb %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x8d,0xd8] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermb %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x8d,0xd0] -; X86-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] -; X86-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_qi_512: ; X64: # %bb.0: -; X64-NEXT: vpermb %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x8d,0xd8] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpermb %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x8d,0xd0] -; X64-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] -; X64-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3) - %res2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res3, %res2 - ret <64 x i8> %res4 + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_maskz_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_qi_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_qi_512: +; X64: # 
%bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3) + ret <64 x i8> %res } declare <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmultishift_qb_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512: ; X86: # %bb.0: -; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xd9] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x83,0xd1] -; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] -; X86-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512: ; X64: # %bb.0: -; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xd9] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x83,0xd1] -; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] -; X64-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3) - %res2 = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res3, %res2 - ret <64 x i8> %res4 + ret <64 x i8> %res +} + +define <64 x i8>@test_int_x86_avx512_maskz_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: 
vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> zeroinitializer, i64 %x3) + ret <64 x i8> %res } declare <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_qi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2b %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpermt2b %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xda] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xca] -; X86-NEXT: vmovdqa64 %zmm1, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe1] -; X86-NEXT: vpermi2b %zmm2, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xe2] -; X86-NEXT: vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpermt2b %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xda] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xca] -; X64-NEXT: vmovdqa64 %zmm1, %zmm4 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xe1] -; X64-NEXT: vpermi2b %zmm2, %zmm0, %zmm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xe2] -; X64-NEXT: vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %res, <64 x i8> %x2, i64 %x3) - %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res3, %res2 - ret <64 x i8> %res4 + ret <64 x i8> %res } declare <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) +define <64 x i8>@test_int_x86_avx512_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_qi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2b %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: 
[0xc3] + %res = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) + ret <64 x i8> %res +} + define <64 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x7d,0xda] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7d,0xca] -; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0xef,0xe4] -; X86-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7d,0xe2] -; X86-NEXT: vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x7d,0xda] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7d,0xca] -; X64-NEXT: vpxor %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0xef,0xe4] -; X64-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7d,0xe2] -; X64-NEXT: vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) - %res1 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> zeroinitializer, <64 x i8> %x2, i64 %x3) - %res2 = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1) - %res3 = add <64 x i8> %res, %res1 - %res4 = add <64 x i8> %res3, %res2 - ret <64 x i8> %res4 + ret <64 x i8> %res } declare <64 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) diff --git a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll index 23a7e2ac3e965..0df1a1dd75f3f 100644 --- a/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vbmi-intrinsics.ll @@ -4,142 +4,160 @@ declare <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8>, <64 x i8>) +define <64 x i8>@test_int_x86_avx512_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_qi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1) + ret <64 x i8> %1 +} + define <64 x i8>@test_int_x86_avx512_mask_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_qi_512: ; X86: # %bb.0: -; X86-NEXT: vpermb %zmm0, %zmm1, %zmm3 
# encoding: [0x62,0xf2,0x75,0x48,0x8d,0xd8] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermb %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x8d,0xd0] -; X86-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] -; X86-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_qi_512: ; X64: # %bb.0: -; X64-NEXT: vpermb %zmm0, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x8d,0xd8] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpermb %zmm0, %zmm1, %zmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x49,0x8d,0xd0] -; X64-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] -; X64-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1) %2 = bitcast i64 %x3 to <64 x i1> %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x2 - %4 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1) - %5 = bitcast i64 %x3 to <64 x i1> - %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer - %7 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1) - %res3 = add <64 x i8> %3, %6 - %res4 = add <64 x i8> %res3, %7 - ret <64 x i8> %res4 + ret <64 x i8> %3 +} + +define <64 x i8>@test_int_x86_avx512_maskz_permvar_qi_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_qi_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_qi_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vpermb %zmm0, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xc9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx512.permvar.qi.512(<64 x i8> %x0, <64 x i8> %x1) + %2 = bitcast i64 %x3 to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer + ret <64 x i8> %3 } declare <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8>, <64 x i8>) +define <64 x i8>@test_int_x86_avx512_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmultishift_qb_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1) + ret <64 x i8> %1 +} + define <64 x i8>@test_int_x86_avx512_mask_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512: ; X86: # %bb.0: -; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xd9] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # 
encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x83,0xd1] -; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] -; X86-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_512: ; X64: # %bb.0: -; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf2,0xfd,0x48,0x83,0xd9] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x83,0xd1] -; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] -; X64-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1) %2 = bitcast i64 %x3 to <64 x i1> %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x2 - %4 = call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1) - %5 = bitcast i64 %x3 to <64 x i1> - %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer - %7 = call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1) - %res3 = add <64 x i8> %3, %6 - %res4 = add <64 x i8> %res3, %7 - ret <64 x i8> %res4 + ret <64 x i8> %3 +} + +define <64 x i8>@test_int_x86_avx512_maskz_pmultishift_qb_512(<64 x i8> %x0, <64 x i8> %x1, i64 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_512: +; X86: # %bb.0: +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_512: +; X64: # %bb.0: +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vpmultishiftqb %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x83,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx512.pmultishift.qb.512(<64 x i8> %x0, <64 x i8> %x1) + %2 = bitcast i64 %x3 to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer + ret <64 x i8> %3 } declare <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>) +define <64 x i8>@test_int_x86_avx512_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_qi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2b %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) + ret <64 x i8> %1 +} + define <64 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X86-NEXT: vpermt2b %zmm2, %zmm1, 
%zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xda] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xca] -; X86-NEXT: vpermt2b %zmm2, %zmm3, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xc9,0x7d,0xc2] -; X86-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm0, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd8] -; X64-NEXT: vpermt2b %zmm2, %zmm1, %zmm3 # encoding: [0x62,0xf2,0x75,0x48,0x7d,0xda] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x75,0xca] -; X64-NEXT: vpermt2b %zmm2, %zmm3, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xc9,0x7d,0xc2] -; X64-NEXT: vpaddb %zmm3, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) %2 = bitcast i64 %x3 to <64 x i1> %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x1 - %4 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %1, <64 x i8> %x2) - %5 = bitcast i64 %x3 to <64 x i1> - %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer - %7 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) - %res3 = add <64 x i8> %3, %6 - %res4 = add <64 x i8> %res3, %7 - ret <64 x i8> %res4 + ret <64 x i8> %3 +} + +define <64 x i8>@test_int_x86_avx512_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_qi_512: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2b %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0x75,0x48,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2) + ret <64 x i8> %1 } define <64 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512: ; X86: # %bb.0: -; X86-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X86-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x7d,0xda] ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7d,0xca] -; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0xef,0xe4] -; X86-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7d,0xe2] -; X86-NEXT: vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3] -; X86-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_512: ; X64: # %bb.0: -; X64-NEXT: vmovdqa64 %zmm1, %zmm3 # encoding: 
[0x62,0xf1,0xfd,0x48,0x6f,0xd9] -; X64-NEXT: vpermt2b %zmm2, %zmm0, %zmm3 # encoding: [0x62,0xf2,0x7d,0x48,0x7d,0xda] ; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2b %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x7d,0xca] -; X64-NEXT: vpxor %xmm4, %xmm4, %xmm4 # encoding: [0xc5,0xd9,0xef,0xe4] -; X64-NEXT: vpermt2b %zmm2, %zmm0, %zmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x7d,0xe2] -; X64-NEXT: vpaddb %zmm3, %zmm4, %zmm0 # encoding: [0x62,0xf1,0x5d,0x48,0xfc,0xc3] -; X64-NEXT: vpaddb %zmm0, %zmm1, %zmm0 # encoding: [0x62,0xf1,0x75,0x48,0xfc,0xc0] +; X64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2) %2 = bitcast i64 %x3 to <64 x i1> %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x1 - %4 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> zeroinitializer, <64 x i8> %x0, <64 x i8> %x2) - %5 = bitcast i64 %x3 to <64 x i1> - %6 = select <64 x i1> %5, <64 x i8> %4, <64 x i8> zeroinitializer - %7 = call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %x1, <64 x i8> %x0, <64 x i8> %x2) - %res3 = add <64 x i8> %3, %6 - %res4 = add <64 x i8> %res3, %7 - ret <64 x i8> %res4 + ret <64 x i8> %3 } define <64 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { diff --git a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll index 4b1d51da46fbe..57479666506ab 100644 --- a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics-upgrade.ll @@ -4,258 +4,298 @@ declare <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_qi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermb %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_qi_128: ; X86: # %bb.0: -; X86-NEXT: vpermb %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x8d,0xd8] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermb %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x8d,0xd0] -; X86-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] -; X86-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_qi_128: ; X64: # %bb.0: -; X64-NEXT: vpermb %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermb %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x8d,0xd0] -; X64-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: 
[0x62,0xf2,0x75,0x89,0x8d,0xc0] -; X64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %x3) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_qi_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_qi_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %x3) + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_qi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermb %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_qi_256: ; X86: # %bb.0: -; X86-NEXT: vpermb %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x8d,0xd8] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermb %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x8d,0xd0] -; X86-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] -; X86-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_qi_256: ; X64: # %bb.0: -; X64-NEXT: vpermb %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermb %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x8d,0xd0] -; X64-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] -; X64-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X64-NEXT: 
vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> zeroinitializer, i32 %x3) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_maskz_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_qi_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_qi_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> zeroinitializer, i32 %x3) + ret <32 x i8> %res } declare <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmultishift_qb_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128: ; X86: # %bb.0: -; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1] -; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] -; X86-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128: ; X64: # %bb.0: -; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1] -; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] -; X64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression 
encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %x3) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 + ret <16 x i8> %res +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> zeroinitializer, i16 %x3) + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmultishift_qb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256: ; X86: # %bb.0: -; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1] -; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] -; X86-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256: ; X64: # %bb.0: -; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1] -; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] -; X64-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xed,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> zeroinitializer, i32 %x3) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 + ret <32 x i8> %res +} + +define <32 x i8>@test_int_x86_avx512_maskz_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> zeroinitializer, i32 %x3) + ret <32 x i8> %res } declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_qi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2b %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2b %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca] -; X86-NEXT: vmovdqa %xmm1, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xe1] -; X86-NEXT: vpermi2b %xmm2, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xe2] -; X86-NEXT: vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2b %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca] -; X64-NEXT: vmovdqa %xmm1, %xmm4 # EVEX TO VEX Compression 
encoding: [0xc5,0xf9,0x6f,0xe1] -; X64-NEXT: vpermi2b %xmm2, %xmm0, %xmm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xe2] -; X64-NEXT: vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %res, <16 x i8> %x2, i16 %x3) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_qi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2b %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2b %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca] -; X86-NEXT: vmovdqa %ymm1, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe1] -; X86-NEXT: vpermi2b %ymm2, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xe2] -; X86-NEXT: vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2b %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca] -; X64-NEXT: vmovdqa %ymm1, %ymm4 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xe1] -; X64-NEXT: vpermi2b %ymm2, %ymm0, %ymm4 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xe2] -; X64-NEXT: vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 
%x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %res, <32 x i8> %x2, i32 %x3) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 + ret <32 x i8> %res } declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) +define <16 x i8>@test_int_x86_avx512_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_qi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) + ret <16 x i8> %res +} + define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xca] -; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X86-NEXT: vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2] -; X86-NEXT: vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xca] -; X64-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X64-NEXT: vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2] -; X64-NEXT: vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) - %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2, i16 %x3) - %res2 = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1) - %res3 = add <16 x i8> %res, %res1 - %res4 = add <16 x i8> %res3, %res2 - ret <16 x i8> %res4 + ret <16 x i8> %res } declare <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) +define <32 x i8>@test_int_x86_avx512_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> 
%x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_qi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) + ret <32 x i8> %res +} + define <32 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xca] -; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X86-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xe2] -; X86-NEXT: vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xca] -; X64-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X64-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xe2] -; X64-NEXT: vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) - %res1 = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> zeroinitializer, <32 x i8> %x2, i32 %x3) - %res2 = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1) - %res3 = add <32 x i8> %res, %res1 - %res4 = add <32 x i8> %res3, %res2 - ret <32 x i8> %res4 + ret <32 x i8> %res } declare <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16) diff --git a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll index 7c03d78c825f3..748c35a3afcb9 100644 --- a/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vbmivl-intrinsics.ll @@ -4,282 +4,318 @@ declare <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8>, <16 x i8>) +define <16 x i8>@test_int_x86_avx512_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_qi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermb %xmm0, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x8d,0xc0] 
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1) + ret <16 x i8> %1 +} + define <16 x i8>@test_int_x86_avx512_mask_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_qi_128: ; X86: # %bb.0: -; X86-NEXT: vpermb %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x8d,0xd8] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermb %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x8d,0xd0] -; X86-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] -; X86-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_qi_128: ; X64: # %bb.0: -; X64-NEXT: vpermb %xmm0, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermb %xmm0, %xmm1, %xmm2 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0x8d,0xd0] -; X64-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] -; X64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x2 - %4 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i8> %4, <16 x i8> zeroinitializer - %7 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1) - %res3 = add <16 x i8> %3, %6 - %res4 = add <16 x i8> %res3, %7 - ret <16 x i8> %res4 + ret <16 x i8> %3 +} + +define <16 x i8>@test_int_x86_avx512_maskz_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_qi_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_qi_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx512.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 } declare <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8>, <32 x i8>) +define <32 x i8>@test_int_x86_avx512_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_qi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermb %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x8d,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <32 x 
i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1) + ret <32 x i8> %1 +} + define <32 x i8>@test_int_x86_avx512_mask_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_qi_256: ; X86: # %bb.0: -; X86-NEXT: vpermb %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x8d,0xd8] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermb %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x8d,0xd0] -; X86-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] -; X86-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_qi_256: ; X64: # %bb.0: -; X64-NEXT: vpermb %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x8d,0xd8] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermb %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x8d,0xd0] -; X64-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] -; X64-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x2 - %4 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1) - %5 = bitcast i32 %x3 to <32 x i1> - %6 = select <32 x i1> %5, <32 x i8> %4, <32 x i8> zeroinitializer - %7 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1) - %res3 = add <32 x i8> %3, %6 - %res4 = add <32 x i8> %res3, %7 - ret <32 x i8> %res4 + ret <32 x i8> %3 +} + +define <32 x i8>@test_int_x86_avx512_maskz_permvar_qi_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_qi_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_qi_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpermb %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x8d,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx512.permvar.qi.256(<32 x i8> %x0, <32 x i8> %x1) + %2 = bitcast i32 %x3 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 } declare <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8>, <16 x i8>) +define <16 x i8>@test_int_x86_avx512_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmultishift_qb_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <16 x i8> 
@llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1) + ret <16 x i8> %1 +} + define <16 x i8>@test_int_x86_avx512_mask_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128: ; X86: # %bb.0: -; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xd9] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1] -; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] -; X86-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128: ; X64: # %bb.0: -; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x83,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1] -; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] -; X64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x2 - %4 = call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i8> %4, <16 x i8> zeroinitializer - %7 = call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1) - %res3 = add <16 x i8> %3, %6 - %res4 = add <16 x i8> %res3, %7 - ret <16 x i8> %res4 + ret <16 x i8> %3 +} + +define <16 x i8>@test_int_x86_avx512_maskz_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx512.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 } declare <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8>, <32 x i8>) +define <32 x i8>@test_int_x86_avx512_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmultishift_qb_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xc1] +; CHECK-NEXT: 
ret{{[l|q]}} # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1) + ret <32 x i8> %1 +} + define <32 x i8>@test_int_x86_avx512_mask_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256: ; X86: # %bb.0: -; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xd9] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1] -; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] -; X86-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmultishift_qb_256: ; X64: # %bb.0: -; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x83,0xd9] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x83,0xd1] -; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] -; X64-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x2 - %4 = call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1) - %5 = bitcast i32 %x3 to <32 x i1> - %6 = select <32 x i1> %5, <32 x i8> %4, <32 x i8> zeroinitializer - %7 = call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1) - %res3 = add <32 x i8> %3, %6 - %res4 = add <32 x i8> %res3, %7 - ret <32 x i8> %res4 + ret <32 x i8> %3 +} + +define <32 x i8>@test_int_x86_avx512_maskz_pmultishift_qb_256(<32 x i8> %x0, <32 x i8> %x1, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmultishift_qb_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpmultishiftqb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x83,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx512.pmultishift.qb.256(<32 x i8> %x0, <32 x i8> %x1) + %2 = bitcast i32 %x3 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 } declare <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>) +define <16 x i8>@test_int_x86_avx512_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_qi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2b 
%xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) + ret <16 x i8> %1 +} + define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2b %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca] -; X86-NEXT: vpermt2b %xmm2, %xmm3, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0x89,0x7d,0xc2] -; X86-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2b %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca] -; X64-NEXT: vpermt2b %xmm2, %xmm3, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0x89,0x7d,0xc2] -; X64-NEXT: vpaddb %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x1 - %4 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %1, <16 x i8> %x2) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i8> %4, <16 x i8> zeroinitializer - %7 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) - %res3 = add <16 x i8> %3, %6 - %res4 = add <16 x i8> %res3, %7 - ret <16 x i8> %res4 + ret <16 x i8> %3 } declare <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>) +define <32 x i8>@test_int_x86_avx512_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_qi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2b %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) + ret <32 x i8> %1 +} + define <32 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2b %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), 
%k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca] -; X86-NEXT: vpermt2b %ymm2, %ymm3, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xa9,0x7d,0xc2] -; X86-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2b %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermi2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x75,0xca] -; X64-NEXT: vpermt2b %ymm2, %ymm3, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x65,0xa9,0x7d,0xc2] -; X64-NEXT: vpaddb %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x1 - %4 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %1, <32 x i8> %x2) - %5 = bitcast i32 %x3 to <32 x i1> - %6 = select <32 x i1> %5, <32 x i8> %4, <32 x i8> zeroinitializer - %7 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) - %res3 = add <32 x i8> %3, %6 - %res4 = add <32 x i8> %res3, %7 - ret <32 x i8> %res4 + ret <32 x i8> %3 +} + +define <16 x i8>@test_int_x86_avx512_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_qi_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2b %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x1, <16 x i8> %x0, <16 x i8> %x2) + ret <16 x i8> %1 } define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xda] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xca] -; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X86-NEXT: vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2] -; X86-NEXT: vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] -; X86-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128: ; 
X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2b %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2b %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xca] -; X64-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X64-NEXT: vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2] -; X64-NEXT: vpaddb %xmm3, %xmm4, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3] -; X64-NEXT: vpaddb %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x1, <16 x i8> %x0, <16 x i8> %x2) %2 = bitcast i16 %x3 to <16 x i1> %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x1 - %4 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> zeroinitializer, <16 x i8> %x0, <16 x i8> %x2) - %5 = bitcast i16 %x3 to <16 x i1> - %6 = select <16 x i1> %5, <16 x i8> %4, <16 x i8> zeroinitializer - %7 = call <16 x i8> @llvm.x86.avx512.vpermi2var.qi.128(<16 x i8> %x1, <16 x i8> %x0, <16 x i8> %x2) - %res3 = add <16 x i8> %3, %6 - %res4 = add <16 x i8> %res3, %7 - ret <16 x i8> %res4 + ret <16 x i8> %3 +} + +define <32 x i8>@test_int_x86_avx512_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_qi_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2b %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x75,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x1, <32 x i8> %x0, <32 x i8> %x2) + ret <32 x i8> %1 } define <32 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xda] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] ; X86-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xca] -; X86-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X86-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7d,0xe2] -; X86-NEXT: vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] -; X86-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2b %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7d,0xda] ; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] ; X64-NEXT: vpermt2b %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7d,0xca] -; X64-NEXT: vpxor %xmm4, %xmm4, %xmm4 # EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] -; X64-NEXT: vpermt2b %ymm2, %ymm0, %ymm4 {%k1} {z} # encoding: 
[0x62,0xf2,0x7d,0xa9,0x7d,0xe2] -; X64-NEXT: vpaddb %ymm3, %ymm4, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xdd,0xfc,0xc3] -; X64-NEXT: vpaddb %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfc,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x1, <32 x i8> %x0, <32 x i8> %x2) %2 = bitcast i32 %x3 to <32 x i1> %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x1 - %4 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> zeroinitializer, <32 x i8> %x0, <32 x i8> %x2) - %5 = bitcast i32 %x3 to <32 x i1> - %6 = select <32 x i1> %5, <32 x i8> %4, <32 x i8> zeroinitializer - %7 = call <32 x i8> @llvm.x86.avx512.vpermi2var.qi.256(<32 x i8> %x1, <32 x i8> %x0, <32 x i8> %x2) - %res3 = add <32 x i8> %3, %6 - %res4 = add <32 x i8> %res3, %7 - ret <32 x i8> %res4 + ret <32 x i8> %3 } define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 9ea5000f19bd4..3e9ad09b8d587 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -4,375 +4,573 @@ declare <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_d_gpr_128: +; X86: # %bb.0: +; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_d_gpr_128: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastd %edi, %xmm0 # encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32> %x1, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0x4c,0x24,0x04] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6f,0xc1] -; X86-NEXT: vmovdqa32 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x6f,0xd1] -; X86-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x58,0x44,0x24,0x01] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd %edi, %xmm1 # encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xcf] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpbroadcastd %edi, %xmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc7] -; X64-NEXT: vpbroadcastd %edi, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xd7] -; X64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0xfe,0xc2] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 -1) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 %mask) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res2, %res3 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> %x1, i8 %mask) + ret <4 x i32> %res } +define <4 x i32>@test_int_x86_avx512_maskz_pbroadcast_d_gpr_128(i32 %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_d_gpr_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x58,0x44,0x24,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_d_gpr_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vpbroadcastd %edi, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.128(i32 %x0, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res +} declare <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_q_gpr_128: +; X86: # %bb.0: +; X86-NEXT: vmovddup {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x44,0x24,0x04] +; X86-NEXT: # xmm0 = mem[0,0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_q_gpr_128: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastq %rdi, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64> %x1, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x4c,0x24,0x04] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovdqa64 %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6f,0xc1] -; X86-NEXT: vmovdqa64 %xmm1, %xmm2 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0x6f,0xd1] -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %xmm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x59,0x84,0x24,0x04,0x00,0x00,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastq %rdi, %xmm1 # encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xcf] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} # encoding: 
[0x62,0xf2,0xfd,0x09,0x7c,0xc7] -; X64-NEXT: vpbroadcastq %rdi, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7c,0xd7] -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 -1) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 %mask) - %res2 = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer,i8 %mask) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res2, %res3 - ret <2 x i64> %res4 + %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> %x1,i8 %mask) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pbroadcast_q_gpr_128(i64 %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_q_gpr_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x59,0x84,0x24,0x04,0x00,0x00,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_q_gpr_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vpbroadcastq %rdi, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.128(i64 %x0, <2 x i64> zeroinitializer,i8 %mask) + ret <2 x i64> %res } +declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8) - declare <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_d_gpr_256: +; X86: # %bb.0: +; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_d_gpr_256: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastd %edi, %ymm0 # encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} - define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) { +define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32> %x1, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x58,0x4c,0x24,0x04] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovdqa32 %ymm1, %ymm0 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xc1] -; X86-NEXT: vmovdqa32 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x6f,0xd1] -; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm0 {%k1} # 
encoding: [0x62,0xf2,0x7d,0x29,0x58,0x44,0x24,0x01] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd %edi, %ymm1 # encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xcf] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpbroadcastd %edi, %ymm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc7] -; X64-NEXT: vpbroadcastd %edi, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xd7] -; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 -1) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res2, %res3 - ret <8 x i32> %res4 - } + %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> %x1, i8 %mask) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pbroadcast_d_gpr_256(i32 %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_d_gpr_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x58,0x44,0x24,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_d_gpr_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vpbroadcastd %edi, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pbroadcast.d.gpr.256(i32 %x0, <8 x i32> zeroinitializer, i8 %mask) + ret <8 x i32> %res +} + +declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8) - declare <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1) { +; X86-LABEL: test_int_x86_avx512_pbroadcast_q_gpr_256: +; X86: # %bb.0: +; X86-NEXT: vbroadcastsd {{[0-9]+}}(%esp), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x44,0x24,0x04] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_pbroadcast_q_gpr_256: +; X64: # %bb.0: +; X64-NEXT: vpbroadcastq %rdi, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 -1) + ret <4 x i64> %res +} - define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) { +define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64> %x1, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x4c,0x24,0x04] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovdqa64 %ymm1, %ymm0 {%k1} # encoding: 
[0x62,0xf1,0xfd,0x29,0x6f,0xc1] -; X86-NEXT: vmovdqa64 %ymm1, %ymm2 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x6f,0xd1] -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %ymm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x59,0x84,0x24,0x04,0x00,0x00,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastq %rdi, %ymm1 # encoding: [0x62,0xf2,0xfd,0x28,0x7c,0xcf] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7c,0xc7] -; X64-NEXT: vpbroadcastq %rdi, %ymm2 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7c,0xd7] -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 -1) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 %mask) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer,i8 %mask) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res2, %res3 - ret <4 x i64> %res4 - } - + %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> %x1,i8 %mask) + ret <4 x i64> %res +} +define <4 x i64>@test_int_x86_avx512_maskz_pbroadcast_q_gpr_256(i64 %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcast_q_gpr_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x0c] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpbroadcastq {{[0-9]+}}(%esp), %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x59,0x84,0x24,0x04,0x00,0x00,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcast_q_gpr_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] +; X64-NEXT: vpbroadcastq %rdi, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7c,0xc7] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pbroadcast.q.gpr.256(i64 %x0, <4 x i64> zeroinitializer,i8 %mask) + ret <4 x i64> %res +} declare <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32>, <8 x i32>, i8) -define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask, i32 * %y_ptr) { +define <8 x i32>@test_int_x86_avx512_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i32 * %y_ptr) { ; X86-LABEL: test_int_x86_avx512_pbroadcastd_256: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x04] -; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] -; X86-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x58,0xc8] -; X86-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x58,0xc0] -; X86-NEXT: vpaddd (%eax){1to8}, %ymm1, %ymm1 # encoding: [0x62,0xf1,0x75,0x38,0xfe,0x08] -; X86-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] 
+; X86-NEXT: vbroadcastss (%eax), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_pbroadcastd_256: ; X64: # %bb.0: -; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x58,0xc8] -; X64-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x58,0xc0] -; X64-NEXT: vpaddd (%rsi){1to8}, %ymm1, %ymm1 # encoding: [0x62,0xf1,0x75,0x38,0xfe,0x0e] -; X64-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; X64-NEXT: vbroadcastss (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0x07] ; X64-NEXT: retq # encoding: [0xc3] %y_32 = load i32, i32 * %y_ptr %y = insertelement <4 x i32> undef, i32 %y_32, i32 0 %res = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %y, <8 x i32> %x1, i8 -1) - %res1 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask) - %res2 = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res2, %res3 - ret <8 x i32> %res4 + ret <8 x i32> %res } -declare <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32>, <4 x i32>, i8) - -define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastd_128: +define <8 x i32>@test_int_x86_avx512_mask_pbroadcastd_256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask, i32 * %y_ptr) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastd_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8] -; X86-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x58,0xc8] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastd_128: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastd_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8] -; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X64-NEXT: vpbroadcastd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x58,0xc8] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) - %res1 = call <4 x i32> 
@llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) - %res2 = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %mask) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res2, %res3 - ret <4 x i32> %res4 + %y_32 = load i32, i32 * %y_ptr + %y = insertelement <4 x i32> undef, i32 %y_32, i32 0 + %res = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> %x1, i8 %mask) + ret <8 x i32> %res } -declare <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64>, <4 x i64>, i8) - -define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastq_256: +define <8 x i32>@test_int_x86_avx512_maskz_pbroadcastd_256(<4 x i32> %x0, i8 %mask, i32 * %y_ptr) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastd_256: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8] -; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x58,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastq_256: +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastd_256: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8] -; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vpbroadcastd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x58,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1) - %res1 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 %mask) - %res2 = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> zeroinitializer,i8 %mask) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res2, %res3 - ret <4 x i64> %res4 + %y_32 = load i32, i32 * %y_ptr + %y = insertelement <4 x i32> undef, i32 %y_32, i32 0 + %res = call <8 x i32> @llvm.x86.avx512.pbroadcastd.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask) + ret <8 x i32> %res } -declare <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64>, <2 x i64>, i8) +declare <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32>, <4 x i32>, i8) -define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask) { -; X86-LABEL: test_int_x86_avx512_pbroadcastq_128: +define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 # 
EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_mask_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastd_128: ; X86: # %bb.0: -; X86-NEXT: vpbroadcastq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] -; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; X86-NEXT: vpaddq %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] +; X86-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_pbroadcastq_128: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastd_128: ; X64: # %bb.0: -; X64-NEXT: vpbroadcastq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] -; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] -; X64-NEXT: vpaddq %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] +; X64-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1) - %res1 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 %mask) - %res2 = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> zeroinitializer,i8 %mask) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res2, %res3 - ret <2 x i64> %res4 + %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) + ret <4 x i32> %res } -declare <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double>, <4 x double>, i8) nounwind readonly - -define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1, i8 %mask ) { -; X86-LABEL: test_x86_vbroadcast_sd_pd_256: +define <4 x i32>@test_int_x86_avx512_maskz_pbroadcastd_128(<4 x i32> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastd_128: ; X86: # %bb.0: -; X86-NEXT: vbroadcastsd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] -; X86-NEXT: vaddpd %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] -; X86-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} # encoding: 
[0x62,0xf2,0xfd,0xa9,0x19,0xc0] -; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] +; X86-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_x86_vbroadcast_sd_pd_256: +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastd_128: ; X64: # %bb.0: -; X64-NEXT: vbroadcastsd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] -; X64-NEXT: vaddpd %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] -; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] -; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] +; X64-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1) - %res1 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask) - %res2 = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res2, %res3 - ret <4 x double> %res4 + %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res } -declare <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float>, <8 x float>, i8) nounwind readonly +declare <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1) + ret <4 x i64> %res +} -define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1, i8 %mask ) { -; X86-LABEL: test_x86_vbroadcast_ss_ps_256: +define <4 x i64>@test_int_x86_avx512_mask_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastq_256: ; X86: # %bb.0: -; X86-NEXT: vbroadcastss %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] -; X86-NEXT: vaddps %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] -; X86-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] -; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] +; X86-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_x86_vbroadcast_ss_ps_256: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastq_256: ; X64: # %bb.0: -; 
X64-NEXT: vbroadcastss %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] -; X64-NEXT: vaddps %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] -; X64-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] -; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] +; X64-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1) - %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask) - %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res2, %res3 - ret <8 x float> %res4 + %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 %mask) + ret <4 x i64> %res } -declare <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly - -define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask ) { -; X86-LABEL: test_x86_vbroadcast_ss_ps_128: +define <4 x i64>@test_int_x86_avx512_maskz_pbroadcastq_256(<2 x i64> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastq_256: ; X86: # %bb.0: -; X86-NEXT: vbroadcastss %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] -; X86-NEXT: vaddps %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9] -; X86-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] -; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] +; X86-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_x86_vbroadcast_ss_ps_128: +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastq_256: ; X64: # %bb.0: -; X64-NEXT: vbroadcastss %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] -; X64-NEXT: vaddps %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9] -; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] -; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] +; X64-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) - %res1 = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) - %res2 = call <4 x 
float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) - %res3 = fadd <4 x float> %res, %res1 - %res4 = fadd <4 x float> %res2, %res3 - ret <4 x float> %res4 + %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> zeroinitializer,i8 %mask) + ret <4 x i64> %res } -declare <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float>, <4 x float>, i8) +declare <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64>, <2 x i64>, i8) -define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) { -; X86-LABEL: test_int_x86_avx512_mask_movsldup_128: +define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pbroadcastq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_mask_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_mask_pbroadcastq_128: ; X86: # %bb.0: -; X86-NEXT: vmovsldup %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x12,0xd0] -; X86-NEXT: # xmm2 = xmm0[0,0,2,2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovsldup %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] -; X86-NEXT: # xmm1 {%k1} = xmm0[0,0,2,2] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] -; X86-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0,0,2,2] -; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] +; X86-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_movsldup_128: +; X64-LABEL: test_int_x86_avx512_mask_pbroadcastq_128: ; X64: # %bb.0: -; X64-NEXT: vmovsldup %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x12,0xd0] -; X64-NEXT: # xmm2 = xmm0[0,0,2,2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vmovsldup %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] -; X64-NEXT: # xmm1 {%k1} = xmm0[0,0,2,2] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] +; X64-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 %mask) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pbroadcastq_128(<2 x i64> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pbroadcastq_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] +; X86-NEXT: retl # encoding: [0xc3] 
+; +; X64-LABEL: test_int_x86_avx512_maskz_pbroadcastq_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> zeroinitializer,i8 %mask) + ret <2 x i64> %res +} + +declare <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double>, <4 x double>, i8) nounwind readonly + +define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1) { +; CHECK-LABEL: test_x86_vbroadcast_sd_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> undef, i8 -1) + ret <4 x double> %res +} + +define <4 x double> @test_x86_mask_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double> %a1, i8 %mask ) { +; X86-LABEL: test_x86_mask_vbroadcast_sd_pd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] +; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_mask_vbroadcast_sd_pd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] +; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> %a1, i8 %mask) + ret <4 x double> %res +} + +define <4 x double> @test_x86_maskz_vbroadcast_sd_pd_256(<2 x double> %a0, i8 %mask ) { +; X86-LABEL: test_x86_maskz_vbroadcast_sd_pd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_maskz_vbroadcast_sd_pd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 %mask) + ret <4 x double> %res +} + +declare <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float>, <8 x float>, i8) nounwind readonly + +define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1) { +; CHECK-LABEL: test_x86_vbroadcast_ss_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1) + ret <8 x float> %res +} + +define <8 x float> @test_x86_mask_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> %a1, i8 %mask ) { +; X86-LABEL: 
test_x86_mask_vbroadcast_ss_ps_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_mask_vbroadcast_ss_ps_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> %a1, i8 %mask) + ret <8 x float> %res +} + +define <8 x float> @test_x86_maskz_vbroadcast_ss_ps_256(<4 x float> %a0, i8 %mask ) { +; X86-LABEL: test_x86_maskz_vbroadcast_ss_ps_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_maskz_vbroadcast_ss_ps_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 %mask) + ret <8 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float>, <4 x float>, i8) nounwind readonly + +define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1) { +; CHECK-LABEL: test_x86_vbroadcast_ss_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> undef, i8 -1) + ret <4 x float> %res +} + + +define <4 x float> @test_x86_mask_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask ) { +; X86-LABEL: test_x86_mask_vbroadcast_ss_ps_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_mask_vbroadcast_ss_ps_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> %a1, i8 %mask) + ret <4 x float> %res +} + +define <4 x float> @test_x86_maskz_vbroadcast_ss_ps_128(<4 x float> %a0, i8 %mask ) { +; X86-LABEL: test_x86_maskz_vbroadcast_ss_ps_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: 
[0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_x86_maskz_vbroadcast_ss_ps_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 %mask) + ret <4 x float> %res +} + +declare <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_mask_movsldup_128: +; X86: # %bb.0: +; X86-NEXT: vmovsldup %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x12,0xd0] +; X86-NEXT: # xmm2 = xmm0[0,0,2,2] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovsldup %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] +; X86-NEXT: # xmm1 {%k1} = xmm0[0,0,2,2] +; X86-NEXT: vaddps %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] +; X86-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0,0,2,2] +; X86-NEXT: vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_movsldup_128: +; X64: # %bb.0: +; X64-NEXT: vmovsldup %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x12,0xd0] +; X64-NEXT: # xmm2 = xmm0[0,0,2,2] +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovsldup %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] +; X64-NEXT: # xmm1 {%k1} = xmm0[0,0,2,2] +; X64-NEXT: vaddps %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; X64-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0,0,2,2] ; X64-NEXT: vaddps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] @@ -497,298 +695,419 @@ define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x f } declare <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_movddup_128(<2 x double> %x0, <2 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_movddup_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> %x0, <2 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_movddup_128: ; X86: # %bb.0: -; X86-NEXT: vmovddup %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xd0] -; X86-NEXT: # xmm2 = xmm0[0,0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vmovddup %xmm0, %xmm1 {%k1} # encoding: 
[0x62,0xf1,0xff,0x09,0x12,0xc8] ; X86-NEXT: # xmm1 {%k1} = xmm0[0,0] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] -; X86-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0,0] -; X86-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] +; X86-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_movddup_128: ; X64: # %bb.0: -; X64-NEXT: vmovddup %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xd0] -; X64-NEXT: # xmm2 = xmm0[0,0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vmovddup %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8] ; X64-NEXT: # xmm1 {%k1} = xmm0[0,0] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] +; X64-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2) + ret <2 x double> %res +} + +define <2 x double>@test_int_x86_avx512_maskz_movddup_128(<2 x double> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_movddup_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0,0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_movddup_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0,0] -; X64-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2) - %res1 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 -1) - %res2 = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res2, %res3 - ret <2 x double> %res4 + %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> zeroinitializer, i8 %x2) + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_movddup_256(<4 x double> %x0, <4 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_movddup_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovddup %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xff,0x12,0xc0] +; CHECK-NEXT: # ymm0 = ymm0[0,0,2,2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1) + ret <4 x double> %res +} + + define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_movddup_256: ; X86: # %bb.0: -; X86-NEXT: vmovddup %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xff,0x12,0xd0] -; X86-NEXT: # ymm2 = 
ymm0[0,0,2,2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vmovddup %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8] ; X86-NEXT: # ymm1 {%k1} = ymm0[0,0,2,2] -; X86-NEXT: vaddpd %ymm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] -; X86-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[0,0,2,2] -; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] +; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_movddup_256: ; X64: # %bb.0: -; X64-NEXT: vmovddup %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xff,0x12,0xd0] -; X64-NEXT: # ymm2 = ymm0[0,0,2,2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vmovddup %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8] ; X64-NEXT: # ymm1 {%k1} = ymm0[0,0,2,2] -; X64-NEXT: vaddpd %ymm2, %ymm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] +; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2) + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_movddup_256(<4 x double> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_movddup_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[0,0,2,2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_movddup_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[0,0,2,2] -; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2) - %res1 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 -1) - %res2 = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res2, %res3 - ret <4 x double> %res4 + %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> zeroinitializer, i8 %x2) + ret <4 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double>, i32, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermil_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilpd $6, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xc0,0x06] +; CHECK-NEXT: # ymm0 = ymm0[0,1,3,2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x 
double>@test_int_x86_avx512_mask_vpermil_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermil_pd_256: ; X86: # %bb.0: -; X86-NEXT: vpermilpd $6, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xd0,0x06] -; X86-NEXT: # ymm2 = ymm0[0,1,3,2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilpd $6, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x05,0xc8,0x06] ; X86-NEXT: # ymm1 {%k1} = ymm0[0,1,3,2] -; X86-NEXT: vpermilpd $6, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x05,0xc0,0x06] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[0,1,3,2] -; X86-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; X86-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] +; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermil_pd_256: ; X64: # %bb.0: -; X64-NEXT: vpermilpd $6, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x05,0xd0,0x06] -; X64-NEXT: # ymm2 = ymm0[0,1,3,2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd $6, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x05,0xc8,0x06] ; X64-NEXT: # ymm1 {%k1} = ymm0[0,1,3,2] +; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 %x3) + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_vpermil_pd_256(<4 x double> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermil_pd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilpd $6, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x05,0xc0,0x06] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[0,1,3,2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermil_pd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd $6, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x05,0xc0,0x06] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[0,1,3,2] -; X64-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; X64-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> zeroinitializer, i8 %x3) - %res2 = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> %x2, i8 -1) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res2, %res3 - ret <4 x double> %res4 + %res = call <4 x double> @llvm.x86.avx512.mask.vpermil.pd.256(<4 x double> %x0, i32 22, <4 x double> zeroinitializer, i8 %x3) + ret <4 x double> %res } declare <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double>, i32, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2) 
{
+; CHECK-LABEL: test_int_x86_avx512_vpermil_pd_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilpd $1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xc0,0x01]
+; CHECK-NEXT: # xmm0 = xmm0[1,0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 -1)
+ ret <2 x double> %res
+}
+
 define <2 x double>@test_int_x86_avx512_mask_vpermil_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpermil_pd_128:
 ; X86: # %bb.0:
-; X86-NEXT: vpermilpd $1, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xd0,0x01]
-; X86-NEXT: # xmm2 = xmm0[1,0]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x05,0xc8,0x01]
 ; X86-NEXT: # xmm1 {%k1} = xmm0[1,0]
-; X86-NEXT: vpermilpd $1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x05,0xc0,0x01]
-; X86-NEXT: # xmm0 {%k1} {z} = xmm0[1,0]
-; X86-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
-; X86-NEXT: vaddpd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2]
+; X86-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpermil_pd_128:
 ; X64: # %bb.0:
-; X64-NEXT: vpermilpd $1, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x05,0xd0,0x01]
-; X64-NEXT: # xmm2 = xmm0[1,0]
 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT: vpermilpd $1, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x05,0xc8,0x01]
 ; X64-NEXT: # xmm1 {%k1} = xmm0[1,0]
+; X64-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 %x3)
+ ret <2 x double> %res
+}
+
+define <2 x double>@test_int_x86_avx512_maskz_vpermil_pd_128(<2 x double> %x0, i8 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vpermil_pd_128:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vpermilpd $1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x05,0xc0,0x01]
+; X86-NEXT: # xmm0 {%k1} {z} = xmm0[1,0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vpermil_pd_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT: vpermilpd $1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0x89,0x05,0xc0,0x01]
 ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[1,0]
-; X64-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
-; X64-NEXT: vaddpd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2]
 ; X64-NEXT: retq # encoding: [0xc3]
- %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 %x3)
- %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> zeroinitializer, i8 %x3)
- %res2 = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> %x2, i8 -1)
- %res3 = fadd <2 x double> %res, %res1
- %res4 = fadd <2 x double> %res3, %res2
- ret <2 x double> %res4
+ %res = call <2 x double> @llvm.x86.avx512.mask.vpermil.pd.128(<2 x double> %x0, i32 1, <2 x double> zeroinitializer, i8 %x3)
+ ret <2 x double> %res
 }

 declare <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float>, i32, <8 x float>, i8)

+define <8 x float>@test_int_x86_avx512_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_256:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps $22, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x16]
+; CHECK-NEXT: # ymm0 = ymm0[2,1,1,0,6,5,5,4]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1)
+ ret <8 x float> %res
+}
+
 define <8 x float>@test_int_x86_avx512_mask_vpermil_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpermil_ps_256:
 ; X86: # %bb.0:
-; X86-NEXT: vpermilps $22, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xd0,0x16]
-; X86-NEXT: # ymm2 = ymm0[2,1,1,0,6,5,5,4]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x04,0xc8,0x16]
 ; X86-NEXT: # ymm1 {%k1} = ymm0[2,1,1,0,6,5,5,4]
-; X86-NEXT: vpermilps $22, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x04,0xc0,0x16]
-; X86-NEXT: # ymm0 {%k1} {z} = ymm0[2,1,1,0,6,5,5,4]
-; X86-NEXT: vaddps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
-; X86-NEXT: vaddps %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2]
+; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpermil_ps_256:
 ; X64: # %bb.0:
-; X64-NEXT: vpermilps $22, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xd0,0x16]
-; X64-NEXT: # ymm2 = ymm0[2,1,1,0,6,5,5,4]
 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT: vpermilps $22, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x04,0xc8,0x16]
 ; X64-NEXT: # ymm1 {%k1} = ymm0[2,1,1,0,6,5,5,4]
+; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 %x3)
+ ret <8 x float> %res
+}
+
+define <8 x float>@test_int_x86_avx512_maskz_vpermil_ps_256(<8 x float> %x0, i8 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vpermil_ps_256:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vpermilps $22, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x04,0xc0,0x16]
+; X86-NEXT: # ymm0 {%k1} {z} = ymm0[2,1,1,0,6,5,5,4]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vpermil_ps_256:
+; X64: # %bb.0:
+; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT: vpermilps $22, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x04,0xc0,0x16]
 ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[2,1,1,0,6,5,5,4]
-; X64-NEXT: vaddps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
-; X64-NEXT: vaddps %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2]
 ; X64-NEXT: retq # encoding: [0xc3]
- %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 %x3)
- %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> zeroinitializer, i8 %x3)
- %res2 = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> %x2, i8 -1)
- %res3 = fadd <8 x float> %res, %res1
- %res4 = fadd <8 x float> %res3, %res2
- ret <8 x float> %res4
+ %res = call <8 x float> @llvm.x86.avx512.mask.vpermil.ps.256(<8 x float> %x0, i32 22, <8 x float> zeroinitializer, i8 %x3)
+ ret <8 x float> %res
 }

 declare <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float>, i32, <4 x float>, i8)

+define <4 x float>@test_int_x86_avx512_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2) {
+; CHECK-LABEL: test_int_x86_avx512_vpermil_ps_128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vpermilps $22, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x16]
+; CHECK-NEXT: # xmm0 = xmm0[2,1,1,0]
+; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1)
+ ret <4 x float> %res
+}
+
 define <4 x float>@test_int_x86_avx512_mask_vpermil_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpermil_ps_128:
 ; X86: # %bb.0:
-; X86-NEXT: vpermilps $22, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xd0,0x16]
-; X86-NEXT: # xmm2 = xmm0[2,1,1,0]
 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
 ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
 ; X86-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x04,0xc8,0x16]
 ; X86-NEXT: # xmm1 {%k1} = xmm0[2,1,1,0]
-; X86-NEXT: vpermilps $22, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x04,0xc0,0x16]
-; X86-NEXT: # xmm0 {%k1} {z} = xmm0[2,1,1,0]
-; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0]
-; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0]
+; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; X86-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-LABEL: test_int_x86_avx512_mask_vpermil_ps_128:
 ; X64: # %bb.0:
-; X64-NEXT: vpermilps $22, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xd0,0x16]
-; X64-NEXT: # xmm2 = xmm0[2,1,1,0]
 ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; X64-NEXT: vpermilps $22, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x04,0xc8,0x16]
 ; X64-NEXT: # xmm1 {%k1} = xmm0[2,1,1,0]
+; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+ %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 %x3)
+ ret <4 x float> %res
+}
+
+define <4 x float>@test_int_x86_avx512_maskz_vpermil_ps_128(<4 x float> %x0, i8 %x3) {
+; X86-LABEL: test_int_x86_avx512_maskz_vpermil_ps_128:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; X86-NEXT: vpermilps $22, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x04,0xc0,0x16]
+; X86-NEXT: # xmm0 {%k1} {z} = xmm0[2,1,1,0]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_int_x86_avx512_maskz_vpermil_ps_128:
+; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps $22, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x04,0xc0,0x16] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[2,1,1,0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> zeroinitializer, i8 %x3) - %res2 = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> %x2, i8 -1) - %res3 = fadd <4 x float> %res, %res1 - %res4 = fadd <4 x float> %res2, %res3 - ret <4 x float> %res4 + %res = call <4 x float> @llvm.x86.avx512.mask.vpermil.ps.128(<4 x float> %x0, i32 22, <4 x float> zeroinitializer, i8 %x3) + ret <4 x float> %res } declare <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double>, i32, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_perm_df_256(<4 x double> %x0, i32 %x1, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_perm_df_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x01,0xc0,0x03] +; CHECK-NEXT: # ymm0 = ymm0[3,0,0,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_perm_df_256(<4 x double> %x0, i32 %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_perm_df_256: ; X86: # %bb.0: -; X86-NEXT: vpermpd $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x01,0xd0,0x03] -; X86-NEXT: # ymm2 = ymm0[3,0,0,0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermpd $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x01,0xc8,0x03] ; X86-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0] -; X86-NEXT: vpermpd $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x01,0xc0,0x03] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0] -; X86-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; X86-NEXT: vaddpd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2] +; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_perm_df_256: ; X64: # %bb.0: -; X64-NEXT: vpermpd $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x01,0xd0,0x03] -; X64-NEXT: # ymm2 = ymm0[3,0,0,0] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpermpd $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x01,0xc8,0x03] ; X64-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0] +; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 %x3) + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_perm_df_256(<4 x double> %x0, i32 %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_perm_df_256: +; X86: # 
%bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermpd $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x01,0xc0,0x03] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_perm_df_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpermpd $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x01,0xc0,0x03] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0] -; X64-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] -; X64-NEXT: vaddpd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> zeroinitializer, i8 %x3) - %res2 = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> %x2, i8 -1) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res3, %res2 - ret <4 x double> %res4 + %res = call <4 x double> @llvm.x86.avx512.mask.perm.df.256(<4 x double> %x0, i32 3, <4 x double> zeroinitializer, i8 %x3) + ret <4 x double> %res } declare <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64>, i32, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_perm_di_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_perm_di_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x01,0xc0,0x03] +; CHECK-NEXT: # ymm0 = ymm0[3,0,0,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_perm_di_256(<4 x i64> %x0, i32 %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_perm_di_256: ; X86: # %bb.0: -; X86-NEXT: vpermq $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x00,0xd0,0x03] -; X86-NEXT: # ymm2 = ymm0[3,0,0,0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermq $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x00,0xc8,0x03] ; X86-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0] -; X86-NEXT: vpermq $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x00,0xc0,0x03] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0] -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_perm_di_256: ; X64: # %bb.0: -; X64-NEXT: vpermq $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0xfd,0x00,0xd0,0x03] -; X64-NEXT: # ymm2 = ymm0[3,0,0,0] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpermq $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x00,0xc8,0x03] ; X64-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_perm_di_256(<4 x i64> %x0, i32 %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_perm_di_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermq $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x00,0xc0,0x03] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_perm_di_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpermq $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0xfd,0xa9,0x00,0xc0,0x03] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0] -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + %res = call <4 x i64> @llvm.x86.avx512.mask.perm.di.256(<4 x i64> %x0, i32 3, <4 x i64> zeroinitializer, i8 %x3) + ret <4 x i64> %res } declare void @llvm.x86.avx512.mask.store.pd.128(i8*, <2 x double>, i8) @@ -1661,76 +1980,106 @@ define <4 x i64> @test_mask_load_aligned_q_256(<4 x i64> %data, i8* %ptr, i8 %ma declare <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32>, i32, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pshuf_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilps $3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x04,0xc0,0x03] +; CHECK-NEXT: # xmm0 = xmm0[3,0,0,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pshuf_d_128(<4 x i32> %x0, i32 %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshuf_d_128: ; X86: # %bb.0: -; X86-NEXT: vpshufd $3, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xd0,0x03] -; X86-NEXT: # xmm2 = xmm0[3,0,0,0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpshufd $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x70,0xc8,0x03] ; X86-NEXT: # xmm1 {%k1} = xmm0[3,0,0,0] -; X86-NEXT: vpshufd $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x70,0xc0,0x03] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[3,0,0,0] -; X86-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: 
test_int_x86_avx512_mask_pshuf_d_128: ; X64: # %bb.0: -; X64-NEXT: vpshufd $3, %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xd0,0x03] -; X64-NEXT: # xmm2 = xmm0[3,0,0,0] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpshufd $3, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x70,0xc8,0x03] ; X64-NEXT: # xmm1 {%k1} = xmm0[3,0,0,0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pshuf_d_128(<4 x i32> %x0, i32 %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshuf_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpshufd $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x70,0xc0,0x03] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[3,0,0,0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshuf_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpshufd $3, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0x70,0xc0,0x03] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[3,0,0,0] -; X64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 -1) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.pshuf.d.128(<4 x i32> %x0, i32 3, <4 x i32> zeroinitializer, i8 %x3) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32>, i32, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pshuf_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pshuf_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilps $3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x04,0xc0,0x03] +; CHECK-NEXT: # ymm0 = ymm0[3,0,0,0,7,4,4,4] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pshuf_d_256(<8 x i32> %x0, i32 %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pshuf_d_256: ; X86: # %bb.0: -; X86-NEXT: vpshufd $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x70,0xd0,0x03] -; X86-NEXT: # ymm2 = ymm0[3,0,0,0,7,4,4,4] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpshufd $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x70,0xc8,0x03] ; X86-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0,7,4,4,4] -; X86-NEXT: vpshufd $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x70,0xc0,0x03] -; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0,7,4,4,4] -; X86-NEXT: vpaddd %ymm2, %ymm0, 
%ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pshuf_d_256: ; X64: # %bb.0: -; X64-NEXT: vpshufd $3, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x70,0xd0,0x03] -; X64-NEXT: # ymm2 = ymm0[3,0,0,0,7,4,4,4] ; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpshufd $3, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x70,0xc8,0x03] ; X64-NEXT: # ymm1 {%k1} = ymm0[3,0,0,0,7,4,4,4] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pshuf_d_256(<8 x i32> %x0, i32 %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_pshuf_d_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpshufd $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x70,0xc0,0x03] +; X86-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0,7,4,4,4] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pshuf_d_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce] ; X64-NEXT: vpshufd $3, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0x70,0xc0,0x03] ; X64-NEXT: # ymm0 {%k1} {z} = ymm0[3,0,0,0,7,4,4,4] -; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 -1) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + %res = call <8 x i32> @llvm.x86.avx512.mask.pshuf.d.256(<8 x i32> %x0, i32 3, <8 x i32> zeroinitializer, i8 %x3) + ret <8 x i32> %res } define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) { @@ -2017,466 +2366,530 @@ declare i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64>, <2 x i64>, i8) declare <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckh_pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_unpckh_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckh_pd_128: ; X86: # %bb.0: -; X86-NEXT: vunpckhpd %xmm1, %xmm0, %xmm3 # 
EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xd9] -; X86-NEXT: # xmm3 = xmm0[1],xmm1[1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x15,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[1],xmm1[1] -; X86-NEXT: vaddpd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] +; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckh_pd_128: ; X64: # %bb.0: -; X64-NEXT: vunpckhpd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xd9] -; X64-NEXT: # xmm3 = xmm0[1],xmm1[1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpckhpd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x15,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[1],xmm1[1] -; X64-NEXT: vaddpd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] +; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckh.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_unpckh_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_unpckh_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x15,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_unpckh_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckh_pd_256: ; X86: # %bb.0: -; X86-NEXT: vunpckhpd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x15,0xd9] -; X86-NEXT: # ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpckhpd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x15,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; X86-NEXT: vaddpd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] +; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckh_pd_256: ; X64: # %bb.0: -; X64-NEXT: vunpckhpd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x15,0xd9] -; X64-NEXT: # ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpckhpd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x15,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; X64-NEXT: vaddpd %ymm3, 
%ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] +; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckh.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_unpckh_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckh_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_unpckh_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckh_ps_128: ; X86: # %bb.0: -; X86-NEXT: vunpckhps %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xd9] -; X86-NEXT: # xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpckhps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x15,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: vaddps %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckh_ps_128: ; X64: # %bb.0: -; X64-NEXT: vunpckhps %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xd9] -; X64-NEXT: # xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpckhps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x15,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: vaddps %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckh.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } declare <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_unpckh_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckh_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x15,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 
x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_unpckh_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckh_ps_256: ; X86: # %bb.0: -; X86-NEXT: vunpckhps %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x15,0xd9] -; X86-NEXT: # ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpckhps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x15,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; X86-NEXT: vaddps %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckh_ps_256: ; X64: # %bb.0: -; X64-NEXT: vunpckhps %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x15,0xd9] -; X64-NEXT: # ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpckhps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x15,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; X64-NEXT: vaddps %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckh.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) - %res2 = fadd <8 x float> %res, %res1 - ret <8 x float> %res2 + ret <8 x float> %res } declare <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_unpckl_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckl_pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_unpckl_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckl_pd_128: ; X86: # %bb.0: -; X86-NEXT: vunpcklpd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x14,0xd9] -; X86-NEXT: # xmm3 = xmm0[0],xmm1[0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpcklpd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x14,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0] -; X86-NEXT: vaddpd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] +; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] ; X86-NEXT: 
retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckl_pd_128: ; X64: # %bb.0: -; X64-NEXT: vunpcklpd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x14,0xd9] -; X64-NEXT: # xmm3 = xmm0[0],xmm1[0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpcklpd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x14,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0] -; X64-NEXT: vaddpd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3] +; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.unpckl.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_unpckl_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckl_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x14,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_unpckl_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckl_pd_256: ; X86: # %bb.0: -; X86-NEXT: vunpcklpd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x14,0xd9] -; X86-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpcklpd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x14,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; X86-NEXT: vaddpd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] +; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckl_pd_256: ; X64: # %bb.0: -; X64-NEXT: vunpcklpd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x14,0xd9] -; X64-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpcklpd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x14,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; X64-NEXT: vaddpd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] +; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.unpckl.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <4 x float> 
@llvm.x86.avx512.mask.unpckl.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_unpckl_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckl_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_unpckl_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckl_ps_128: ; X86: # %bb.0: -; X86-NEXT: vunpcklps %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xd9] -; X86-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpcklps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x14,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: vaddps %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckl_ps_128: ; X64: # %bb.0: -; X64-NEXT: vunpcklps %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xd9] -; X64-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpcklps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x14,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: vaddps %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.unpckl.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } declare <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_unpckl_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_unpckl_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x14,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_unpckl_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_unpckl_ps_256: ; X86: # %bb.0: -; X86-NEXT: vunpcklps %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x14,0xd9] -; X86-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 
encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vunpcklps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x14,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; X86-NEXT: vaddps %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_unpckl_ps_256: ; X64: # %bb.0: -; X64-NEXT: vunpcklps %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x14,0xd9] -; X64-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vunpcklps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x14,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; X64-NEXT: vaddps %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.unpckl.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) - %res2 = fadd <8 x float> %res, %res1 - ret <8 x float> %res2 + ret <8 x float> %res } declare <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_ask_punpckhd_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_ask_punpckhd_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x15,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_punpckhd_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhd_q_128: ; X86: # %bb.0: -; X86-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6a,0xd9] -; X86-NEXT: # xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6a,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X86-NEXT: vpaddd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhd_q_128: ; X64: # %bb.0: -; X64-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6a,0xd9] -; X64-NEXT: # xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x6a,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; X64-NEXT: vpaddd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression 
encoding: [0xc5,0xe9,0xfe,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckhd.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_punpckld_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckld_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpcklps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x14,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_punpckld_q_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckld_q_128: ; X86: # %bb.0: -; X86-NEXT: vpunpckldq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x62,0xd9] -; X86-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpckldq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x62,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X86-NEXT: vpaddd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckld_q_128: ; X64: # %bb.0: -; X64-NEXT: vpunpckldq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x62,0xd9] -; X64-NEXT: # xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckldq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x62,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X64-NEXT: vpaddd %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.punpckld.q.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_punpckhd_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhd_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x15,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + 
define <8 x i32>@test_int_x86_avx512_mask_punpckhd_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhd_q_256: ; X86: # %bb.0: -; X86-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6a,0xd9] -; X86-NEXT: # ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6a,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhd_q_256: ; X64: # %bb.0: -; X64-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6a,0xd9] -; X64-NEXT: # ymm3 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckhdq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6a,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckhd.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_punpckld_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckld_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpcklps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x14,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_punpckld_q_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckld_q_256: ; X86: # %bb.0: -; X86-NEXT: vpunpckldq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x62,0xd9] -; X86-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpckldq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x62,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: 
test_int_x86_avx512_mask_punpckld_q_256: ; X64: # %bb.0: -; X64-NEXT: vpunpckldq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x62,0xd9] -; X64-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckldq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x62,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.punpckld.q.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_punpckhqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhqd_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x15,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[1],xmm1[1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_punpckhqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhqd_q_128: ; X86: # %bb.0: -; X86-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6d,0xd9] -; X86-NEXT: # xmm3 = xmm0[1],xmm1[1] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6d,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[1],xmm1[1] -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhqd_q_128: ; X64: # %bb.0: -; X64-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6d,0xd9] -; X64-NEXT: # xmm3 = xmm0[1],xmm1[1] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckhqdq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6d,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[1],xmm1[1] -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpckhqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_punpcklqd_q_128(<2 x 
i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklqd_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1] +; CHECK-NEXT: # xmm0 = xmm0[0],xmm1[0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_punpcklqd_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklqd_q_128: ; X86: # %bb.0: -; X86-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xd9] -; X86-NEXT: # xmm3 = xmm0[0],xmm1[0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6c,0xd1] ; X86-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0] -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklqd_q_128: ; X64: # %bb.0: -; X64-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6c,0xd9] -; X64-NEXT: # xmm3 = xmm0[0],xmm1[0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpcklqdq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x6c,0xd1] ; X64-NEXT: # xmm2 {%k1} = xmm0[0],xmm1[0] -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.punpcklqd.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_punpcklqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpcklqd_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpcklpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x14,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_punpcklqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpcklqd_q_256: ; X86: # %bb.0: -; X86-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6c,0xd9] -; X86-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6c,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; X86-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xed,0xd4,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpcklqd_q_256: ; X64: # %bb.0: -; X64-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6c,0xd9] -; X64-NEXT: # ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpcklqdq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6c,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[2],ymm1[2] -; X64-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpcklqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_punpckhqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_punpckhqd_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vunpckhpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x15,0xc1] +; CHECK-NEXT: # ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_punpckhqd_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_punpckhqd_q_256: ; X86: # %bb.0: -; X86-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6d,0xd9] -; X86-NEXT: # ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6d,0xd1] ; X86-NEXT: # ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; X86-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_punpckhqd_q_256: ; X64: # %bb.0: -; X64-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6d,0xd9] -; X64-NEXT: # ymm3 = ymm0[1],ymm1[1],ymm0[3],ymm1[3] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpunpckhqdq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6d,0xd1] ; X64-NEXT: # ymm2 {%k1} = ymm0[1],ymm1[1],ymm0[3],ymm1[3] -; X64-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.punpckhqd.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x 
i64> %res2 + ret <4 x i64> %res } define <4 x i32> @test_mask_and_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) { @@ -5159,6 +5572,16 @@ declare <4 x float> @llvm.x86.avx512.mask.div.ps.128(<4 x float>, <4 x float>, < declare <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float>, <8 x float>, i32, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_f32x4_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; CHECK-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256: ; X86: # %bb.0: @@ -5166,10 +5589,7 @@ define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x ; X86-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovaps %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x28,0xd0] -; X86-NEXT: vmovaps %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc8] -; X86-NEXT: vaddps %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; X86-NEXT: vaddps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] +; X86-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256: @@ -5177,21 +5597,45 @@ define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x ; X64-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] ; X64-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vmovaps %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x28,0xd0] -; X64-NEXT: vmovaps %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc8] -; X64-NEXT: vaddps %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; X64-NEXT: vaddps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0] +; X64-NEXT: vblendmps %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x65,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4) + ret <8 x float> %res +} + +define <8 x float>@test_int_x86_avx512_maskz_shuf_f32x4_256(<8 x float> %x0, <8 x float> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_shuf_f32x4_256: +; X86: # %bb.0: +; X86-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; X86-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_shuf_f32x4_256: +; X64: # %bb.0: +; X64-NEXT: vblendps $240, %ymm1, 
%ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; X64-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovaps %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7c,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4) - %res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1) - %res2 = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> zeroinitializer, i8 %x4) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res2, %res3 - ret <8 x float> %res4 + %res = call <8 x float> @llvm.x86.avx512.mask.shuf.f32x4.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> zeroinitializer, i8 %x4) + ret <8 x float> %res } declare <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double>, <4 x double>, i32, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_shuf_f64x2_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; CHECK-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_f64x2_256: ; X86: # %bb.0: @@ -5199,10 +5643,7 @@ define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 ; X86-NEXT: # ymm0 = ymm0[0,1],ymm1[2,3] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovapd %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x28,0xd0] -; X86-NEXT: vmovapd %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x28,0xc8] -; X86-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X86-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] +; X86-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x65,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_f64x2_256: @@ -5210,21 +5651,45 @@ define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4 ; X64-NEXT: vblendpd $12, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0d,0xc1,0x0c] ; X64-NEXT: # ymm0 = ymm0[0,1],ymm1[2,3] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vmovapd %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x28,0xd0] -; X64-NEXT: vmovapd %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x28,0xc8] -; X64-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X64-NEXT: vaddpd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] +; X64-NEXT: vblendmpd %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x65,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, 
i32 22, <4 x double> %x3, i8 %x4) + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_shuf_f64x2_256(<4 x double> %x0, <4 x double> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_shuf_f64x2_256: +; X86: # %bb.0: +; X86-NEXT: vblendpd $12, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0d,0xc1,0x0c] +; X86-NEXT: # ymm0 = ymm0[0,1],ymm1[2,3] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x28,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_shuf_f64x2_256: +; X64: # %bb.0: +; X64-NEXT: vblendpd $12, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0d,0xc1,0x0c] +; X64-NEXT: # ymm0 = ymm0[0,1],ymm1[2,3] +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0x28,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 %x4) - %res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> %x3, i8 -1) - %res2 = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> zeroinitializer, i8 %x4) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res2, %res3 - ret <4 x double> %res4 + %res = call <4 x double> @llvm.x86.avx512.mask.shuf.f64x2.256(<4 x double> %x0, <4 x double> %x1, i32 22, <4 x double> zeroinitializer, i8 %x4) + ret <4 x double> %res } declare <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_shuf_i32x4_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_shuf_i32x4_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; CHECK-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_i32x4_256: ; X86: # %bb.0: @@ -5232,8 +5697,7 @@ define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32 ; X86-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xd0] -; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] +; X86-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x64,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_i32x4_256: @@ -5241,17 +5705,24 @@ define <8 x i32>@test_int_x86_avx512_mask_shuf_i32x4_256(<8 x i32> %x0, <8 x i32 ; X64-NEXT: vpblendd $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0xf0] ; X64-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} # 
encoding: [0x62,0xf1,0x7d,0x29,0x6f,0xd0] -; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] +; X64-NEXT: vpblendmd %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0x6d,0x29,0x64,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + %res = call <8 x i32> @llvm.x86.avx512.mask.shuf.i32x4.256(<8 x i32> %x0, <8 x i32> %x1, i32 22, <8 x i32> %x3, i8 %x4) + ret <8 x i32> %res } declare <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_shuf_i64x2_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_i64x2_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vblendps $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x0c,0xc1,0xf0] +; CHECK-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_i64x2_256: ; X86: # %bb.0: @@ -5259,8 +5730,7 @@ define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64 ; X86-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vmovdqa64 %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xd0] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x64,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_i64x2_256: @@ -5268,137 +5738,161 @@ define <4 x i64>@test_int_x86_avx512_mask_shuf_i64x2_256(<4 x i64> %x0, <4 x i64 ; X64-NEXT: vpblendd $240, %ymm1, %ymm0, %ymm0 # encoding: [0xc4,0xe3,0x7d,0x02,0xc1,0xf0] ; X64-NEXT: # ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vmovdqa64 %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x6f,0xd0] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vpblendmq %ymm0, %ymm2, %ymm0 {%k1} # encoding: [0x62,0xf2,0xed,0x29,0x64,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + %res = call <4 x i64> @llvm.x86.avx512.mask.shuf.i64x2.256(<4 x i64> %x0, <4 x i64> %x1, i32 22, <4 x i64> %x3, i8 %x4) + ret <4 x i64> %res } declare <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double>, <2 x double>, i32, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_shuf_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) { +; CHECK-LABEL: test_int_x86_avx512_shuf_pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufpd 
$1, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xc1,0x01] +; CHECK-NEXT: # xmm0 = xmm0[1],xmm1[0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_pd_128: ; X86: # %bb.0: -; X86-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xd9,0x01] -; X86-NEXT: # xmm3 = xmm0[1],xmm1[0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01] ; X86-NEXT: # xmm2 {%k1} = xmm0[1],xmm1[0] -; X86-NEXT: vaddpd %xmm3, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3] -; X86-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[1],xmm1[0] -; X86-NEXT: vaddpd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] +; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_pd_128: ; X64: # %bb.0: -; X64-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc6,0xd9,0x01] -; X64-NEXT: # xmm3 = xmm0[1],xmm1[0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01] ; X64-NEXT: # xmm2 {%k1} = xmm0[1],xmm1[0] -; X64-NEXT: vaddpd %xmm3, %xmm2, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3] +; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4) + ret <2 x double> %res +} + +define <2 x double>@test_int_x86_avx512_maskz_shuf_pd_128(<2 x double> %x0, <2 x double> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_shuf_pd_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[1],xmm1[0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_shuf_pd_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[1],xmm1[0] -; X64-NEXT: vaddpd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4) - %res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1) - %res2 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> 
zeroinitializer, i8 %x4) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res2, %res3 - ret <2 x double> %res4 + %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> zeroinitializer, i8 %x4) + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double>, <4 x double>, i32, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_shuf_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xc6,0xc1,0x06] +; CHECK-NEXT: # ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 6, <4 x double> %x3, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_shuf_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_pd_256: ; X86: # %bb.0: -; X86-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xc6,0xd9,0x06] -; X86-NEXT: # ymm3 = ymm0[0],ymm1[1],ymm0[3],ymm1[2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xc6,0xd1,0x06] ; X86-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[1],ymm0[3],ymm1[2] -; X86-NEXT: vaddpd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] +; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_pd_256: ; X64: # %bb.0: -; X64-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xc6,0xd9,0x06] -; X64-NEXT: # ymm3 = ymm0[0],ymm1[1],ymm0[3],ymm1[2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufpd $6, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xc6,0xd1,0x06] ; X64-NEXT: # ymm2 {%k1} = ymm0[0],ymm1[1],ymm0[3],ymm1[2] -; X64-NEXT: vaddpd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3] +; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 6, <4 x double> %x3, i8 %x4) - %res1 = call <4 x double> @llvm.x86.avx512.mask.shuf.pd.256(<4 x double> %x0, <4 x double> %x1, i32 6, <4 x double> %x3, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float>, <4 x float>, i32, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_shuf_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps $22, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xc1,0x16] +; CHECK-NEXT: # xmm0 = xmm0[2,1],xmm1[1,0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_shuf_ps_128(<4 x float> 
%x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_ps_128: ; X86: # %bb.0: -; X86-NEXT: vshufps $22, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xd9,0x16] -; X86-NEXT: # xmm3 = xmm0[2,1],xmm1[1,0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vshufps $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0xc6,0xd1,0x16] ; X86-NEXT: # xmm2 {%k1} = xmm0[2,1],xmm1[1,0] -; X86-NEXT: vaddps %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_ps_128: ; X64: # %bb.0: -; X64-NEXT: vshufps $22, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0xc6,0xd9,0x16] -; X64-NEXT: # xmm3 = xmm0[2,1],xmm1[1,0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufps $22, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0xc6,0xd1,0x16] ; X64-NEXT: # xmm2 {%k1} = xmm0[2,1],xmm1[1,0] -; X64-NEXT: vaddps %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 %x4) - %res1 = call <4 x float> @llvm.x86.avx512.mask.shuf.ps.128(<4 x float> %x0, <4 x float> %x1, i32 22, <4 x float> %x3, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } declare <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float>, <8 x float>, i32, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_shuf_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_shuf_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vshufps $22, %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xc1,0x16] +; CHECK-NEXT: # ymm0 = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_shuf_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_shuf_ps_256: ; X86: # %bb.0: -; X86-NEXT: vshufps $22, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xd9,0x16] -; X86-NEXT: # ymm3 = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vshufps $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0xc6,0xd1,0x16] ; X86-NEXT: # ymm2 {%k1} = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4] -; X86-NEXT: vaddps %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_shuf_ps_256: ; X64: # %bb.0: -; X64-NEXT: vshufps $22, %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0xc6,0xd9,0x16] -; X64-NEXT: # ymm3 = 
ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vshufps $22, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0xc6,0xd1,0x16] ; X64-NEXT: # ymm2 {%k1} = ymm0[2,1],ymm1[1,0],ymm0[6,5],ymm1[5,4] -; X64-NEXT: vaddps %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 %x4) - %res1 = call <8 x float> @llvm.x86.avx512.mask.shuf.ps.256(<8 x float> %x0, <8 x float> %x1, i32 22, <8 x float> %x3, i8 -1) - %res2 = fadd <8 x float> %res, %res1 - ret <8 x float> %res2 + ret <8 x float> %res } declare <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) @@ -5409,71 +5903,94 @@ define <4 x i32>@test_int_x86_avx512_mask_pmaxs_d_128(<4 x i32> %x0, <4 x i32> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3d,0xd1] -; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3d,0xc1] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3d,0xd1] -; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3d,0xc1] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2 ,i8 %mask) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmaxs_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxs_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3d,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmaxs_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3d,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmaxs.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmaxs_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_d_256: +; CHECK: # 
%bb.0: +; CHECK-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmaxs_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_d_256: ; X86: # %bb.0: -; X86-NEXT: vpmaxsd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxsd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3d,0xd1] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_d_256: ; X64: # %bb.0: -; X64-NEXT: vpmaxsd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3d,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxsd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3d,0xd1] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxs.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pmaxs_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxs_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxsq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x3d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pmaxs_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmaxs_q_128: ; X86: # %bb.0: -; X86-NEXT: vpmaxsq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x3d,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxsq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x3d,0xd1] -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_q_128: ; X64: # %bb.0: -; X64-NEXT: vpmaxsq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x3d,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxsq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x3d,0xd1] -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: 
vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) @@ -5484,21 +6001,34 @@ define <4 x i64>@test_int_x86_avx512_mask_pmaxs_q_256(<4 x i64> %x0, <4 x i64> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxsq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x3d,0xd1] -; X86-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3d,0xc1] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxs_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxsq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x3d,0xd1] -; X64-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3d,0xc1] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pmaxs_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxs_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3d,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmaxs_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmaxsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3d,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) @@ -5509,71 +6039,94 @@ define <4 x i32>@test_int_x86_avx512_mask_pmaxu_d_128(<4 x i32> %x0, <4 x i32> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3f,0xd1] -; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3f,0xc1] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO 
VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3f,0xd1] -; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3f,0xc1] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } -declare <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_mask_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { -; X86-LABEL: test_int_x86_avx512_mask_pmaxu_d_256: +define <4 x i32>@test_int_x86_avx512_maskz_pmaxu_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxu_d_128: ; X86: # %bb.0: -; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3f,0xd1] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_pmaxu_d_256: +; X64-LABEL: test_int_x86_avx512_maskz_pmaxu_d_128: ; X64: # %bb.0: -; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3f,0xd1] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + %res = call <4 x i32> @llvm.x86.avx512.mask.pmaxu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res } -declare <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +declare <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) -define <2 x i64>@test_int_x86_avx512_mask_pmaxu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { -; X86-LABEL: test_int_x86_avx512_mask_pmaxu_q_128: -; X86: # %bb.0: -; X86-NEXT: vpmaxuq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x3f,0xd9] +define <8 x i32>@test_int_x86_avx512_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: 
test_int_x86_avx512_pmaxu_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxud %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_mask_pmaxu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_mask_pmaxu_d_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmaxud %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3f,0xd1] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_pmaxu_d_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3f,0xd1] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmaxu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) + ret <8 x i32> %res +} + +declare <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_pmaxu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmaxu_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmaxuq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x3f,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_mask_pmaxu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_mask_pmaxu_q_128: +; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxuq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x3f,0xd1] -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_q_128: ; X64: # %bb.0: -; X64-NEXT: vpmaxuq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x3f,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxuq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x3f,0xd1] -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) @@ -5584,21 +6137,34 @@ define <4 x i64>@test_int_x86_avx512_mask_pmaxu_q_256(<4 x i64> 
%x0, <4 x i64> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmaxuq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x3f,0xd1] -; X86-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3f,0xc1] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmaxu_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmaxuq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x3f,0xd1] -; X64-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3f,0xc1] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pmaxu_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmaxu_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmaxu_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmaxuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) @@ -5609,71 +6175,94 @@ define <4 x i32>@test_int_x86_avx512_mask_pmins_d_128(<4 x i32> %x0, <4 x i32> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x39,0xd1] -; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x39,0xc1] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x39,0xd1] -; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x39,0xc1] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; 
X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmins_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmins_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpminsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x39,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmins_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpminsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x39,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmins.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmins_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pmins_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminsd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmins_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_d_256: ; X86: # %bb.0: -; X86-NEXT: vpminsd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminsd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x39,0xd1] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_d_256: ; X64: # %bb.0: -; X64-NEXT: vpminsd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x39,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminsd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x39,0xd1] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmins.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pmins_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: 
test_int_x86_avx512_pmins_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminsq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x39,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pmins_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pmins_q_128: ; X86: # %bb.0: -; X86-NEXT: vpminsq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x39,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminsq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x39,0xd1] -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_q_128: ; X64: # %bb.0: -; X64-NEXT: vpminsq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x39,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminsq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x39,0xd1] -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) @@ -5684,21 +6273,34 @@ define <4 x i64>@test_int_x86_avx512_mask_pmins_q_256(<4 x i64> %x0, <4 x i64> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminsq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x39,0xd1] -; X86-NEXT: vpminsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x39,0xc1] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmins_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminsq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x39,0xd1] -; X64-NEXT: vpminsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x39,0xc1] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pmins_q_256(<4 x i64> 
%x0, <4 x i64> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pmins_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpminsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x39,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmins_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpminsq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x39,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) @@ -5709,71 +6311,94 @@ define <4 x i32>@test_int_x86_avx512_mask_pminu_d_128(<4 x i32> %x0, <4 x i32> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminud %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3b,0xd1] -; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3b,0xc1] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_d_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminud %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x3b,0xd1] -; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3b,0xc1] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %mask) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pminu_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pminu_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpminud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pminu_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpminud %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x3b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pminu.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %mask) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pminu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminud %ymm1, 
%ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pminu_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_d_256: ; X86: # %bb.0: -; X86-NEXT: vpminud %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminud %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3b,0xd1] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_d_256: ; X64: # %bb.0: -; X64-NEXT: vpminud %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x3b,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminud %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x3b,0xd1] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pminu.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pminu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_pminu_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpminuq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x3b,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pminu_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_pminu_q_128: ; X86: # %bb.0: -; X86-NEXT: vpminuq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x3b,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminuq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x3b,0xd1] -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_q_128: ; X64: # %bb.0: -; X64-NEXT: vpminuq %xmm1, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x3b,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminuq %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x3b,0xd1] -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX 
Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) @@ -5784,21 +6409,34 @@ define <4 x i64>@test_int_x86_avx512_mask_pminu_q_256(<4 x i64> %x0, <4 x i64> % ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpminuq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x3b,0xd1] -; X86-NEXT: vpminuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3b,0xc1] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pminu_q_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpminuq %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x3b,0xd1] -; X64-NEXT: vpminuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3b,0xc1] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %mask) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pminu_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_pminu_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpminuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3b,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pminu_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpminuq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x3b,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %mask) + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) @@ -6600,620 +7238,900 @@ define <8 x i32>@test_int_x86_avx512_mask_psllv8_si(<8 x i32> %x0, <8 x i32> %x1 declare <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmovzxb_d_128(<16 x i8> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x31,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = 
call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pmovzxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_d_128: ; X86: # %bb.0: -; X86-NEXT: vpmovzxbd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x31,0xd0] -; X86-NEXT: # xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxbd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x31,0xc8] ; X86-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-NEXT: vpmovzxbd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x31,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_d_128: ; X64: # %bb.0: -; X64-NEXT: vpmovzxbd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x31,0xd0] -; X64-NEXT: # xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x31,0xc8] ; X64-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmovzxb_d_128(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxbd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x31,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x31,0xc0] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res2 = call <4 x i32> 
@llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmovzxb_d_256(<16 x i8> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x31,0xc0] +; CHECK-NEXT: # ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmovzxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_d_256: ; X86: # %bb.0: -; X86-NEXT: vpmovzxbd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x31,0xd0] -; X86-NEXT: # ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxbd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x31,0xc8] ; X86-NEXT: # ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X86-NEXT: vpmovzxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x31,0xc0] -; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_d_256: ; X64: # %bb.0: -; X64-NEXT: vpmovzxbd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x31,0xd0] -; X64-NEXT: # ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x31,0xc8] ; X64-NEXT: # ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X64-NEXT: vpmovzxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x31,0xc0] -; X64-NEXT: # ymm0 {%k1} {z} = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + ret <8 x i32> %res } -declare <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8>, <2 x i64>, i8) - -define <2 x i64>@test_int_x86_avx512_mask_pmovzxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) { -; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128: +define <8 x i32>@test_int_x86_avx512_maskz_pmovzxb_d_256(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_d_256: ; X86: # %bb.0: -; X86-NEXT: vpmovzxbq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xd0] -; X86-NEXT: # xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x32,0xc8] -; X86-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpmovzxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x32,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vpmovzxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x31,0xc0] +; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128: +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_d_256: ; X64: # %bb.0: -; X64-NEXT: vpmovzxbq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xd0] -; X64-NEXT: # xmm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x32,0xc8] -; X64-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; X64-NEXT: vpmovzxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x32,0xc0] -; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X64-NEXT: vpaddq %xmm0, %xmm1, 
%xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vpmovzxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x31,0xc0] +; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res +} + +declare <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8>, <2 x i64>, i8) + +define <2 x i64>@test_int_x86_avx512_pmovzxb_q_128(<16 x i8> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_mask_pmovzxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x32,0xc8] +; X86-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovzxbq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x32,0xc8] +; X64-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2) - %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res3, %res2 - ret <2 x i64> %res4 + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pmovzxb_q_128(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxb_q_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x32,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovzxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x32,0xc0] +; X64-NEXT: # xmm0 {%k1} {z} = 
xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2) + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_pmovzxb_q_256(<16 x i8> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxb_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxbq %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x32,0xc0] +; CHECK-NEXT: # ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_pmovzxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxb_q_256: ; X86: # %bb.0: -; X86-NEXT: vpmovzxbq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x32,0xd0] -; X86-NEXT: # ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxbq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x32,0xc8] ; X86-NEXT: # ymm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpmovzxbq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x32,0xc0] -; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxb_q_256: ; X64: # %bb.0: -; X64-NEXT: vpmovzxbq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x32,0xd0] -; X64-NEXT: # ymm2 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x32,0xc8] ; X64-NEXT: # ymm1 {%k1} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pmovzxb_q_256(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: 
test_int_x86_avx512_maskz_pmovzxb_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxbq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x32,0xc0] +; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxb_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxbq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x32,0xc0] ; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2) + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pmovzxd_q_128(<4 x i32> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxd_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxdq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pmovzxd_q_128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxd_q_128: ; X86: # %bb.0: -; X86-NEXT: vpmovzxdq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xd0] -; X86-NEXT: # xmm2 = xmm0[0],zero,xmm0[1],zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxdq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x35,0xc8] ; X86-NEXT: # xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero -; X86-NEXT: vpmovzxdq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x35,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxd_q_128: ; X64: # %bb.0: -; X64-NEXT: vpmovzxdq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xd0] -; X64-NEXT: # xmm2 = 
xmm0[0],zero,xmm0[1],zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxdq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x35,0xc8] ; X64-NEXT: # xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pmovzxd_q_128(<4 x i32> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxd_q_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxdq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x35,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxd_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxdq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x35,0xc0] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2) - %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> %x1, i8 -1) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res3, %res2 - ret <2 x i64> %res4 + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxd.q.128(<4 x i32> %x0, <2 x i64> zeroinitializer, i8 %x2) + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_pmovzxd_q_256(<4 x i32> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxd_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxdq %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x35,0xc0] +; CHECK-NEXT: # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_pmovzxd_q_256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxd_q_256: ; X86: # %bb.0: -; X86-NEXT: vpmovzxdq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x35,0xd0] -; X86-NEXT: # ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxdq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x35,0xc8] ; X86-NEXT: # ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x35,0xc0] -; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # 
EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxd_q_256: ; X64: # %bb.0: -; X64-NEXT: vpmovzxdq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x35,0xd0] -; X64-NEXT: # ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxdq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x35,0xc8] ; X64-NEXT: # ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pmovzxd_q_256(<4 x i32> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxd_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x35,0xc0] +; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxd_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxdq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x35,0xc0] ; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> %x1, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxd.q.256(<4 x i32> %x0, <4 x i64> zeroinitializer, i8 %x2) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmovzxw_d_128(<8 x i16> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxw_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + + define <4 x i32>@test_int_x86_avx512_mask_pmovzxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxw_d_128: ; X86: # %bb.0: -; X86-NEXT: vpmovzxwd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xd0] -; X86-NEXT: # xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: 
[0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxwd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x33,0xc8] ; X86-NEXT: # xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-NEXT: vpmovzxwd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x33,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X86-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxw_d_128: ; X64: # %bb.0: -; X64-NEXT: vpmovzxwd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x33,0xd0] -; X64-NEXT: # xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x33,0xc8] ; X64-NEXT: # xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + + +define <4 x i32>@test_int_x86_avx512_maskz_pmovzxw_d_128(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxw_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxwd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x33,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxw_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x33,0xc0] ; X64-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; X64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovzxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmovzxw_d_256(<8 x i16> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxw_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxwd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xc0] +; CHECK-NEXT: # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> 
@llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmovzxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxw_d_256: ; X86: # %bb.0: -; X86-NEXT: vpmovzxwd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xd0] -; X86-NEXT: # ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxwd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x33,0xc8] ; X86-NEXT: # ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X86-NEXT: vpmovzxwd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x33,0xc0] -; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxw_d_256: ; X64: # %bb.0: -; X64-NEXT: vpmovzxwd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xd0] -; X64-NEXT: # ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x33,0xc8] ; X64-NEXT: # ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmovzxw_d_256(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxw_d_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxwd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x33,0xc0] +; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxw_d_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x33,0xc0] ; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> 
%x0, <8 x i32> zeroinitializer, i8 %x2) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovzxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pmovzxw_q_128(<8 x i16> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxw_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxwq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x34,0xc0] +; CHECK-NEXT: # xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pmovzxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxw_q_128: ; X86: # %bb.0: -; X86-NEXT: vpmovzxwq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x34,0xd0] -; X86-NEXT: # xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxwq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x34,0xc8] ; X86-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; X86-NEXT: vpmovzxwq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x34,0xc0] -; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxw_q_128: ; X64: # %bb.0: -; X64-NEXT: vpmovzxwq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x34,0xd0] -; X64-NEXT: # xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x34,0xc8] ; X64-NEXT: # xmm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pmovzxw_q_128(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxw_q_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxwq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x34,0xc0] +; X86-NEXT: # xmm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxw_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x34,0xc0] ; X64-NEXT: # xmm0 {%k1} {z} = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2) - %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res3, %res2 - ret <2 x i64> %res4 + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovzxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2) + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_pmovzxw_q_256(<8 x i16> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovzxw_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovzxwq %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x34,0xc0] +; CHECK-NEXT: # ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_pmovzxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovzxw_q_256: ; X86: # %bb.0: -; X86-NEXT: vpmovzxwq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x34,0xd0] -; X86-NEXT: # ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovzxwq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x34,0xc8] ; X86-NEXT: # ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-NEXT: vpmovzxwq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x34,0xc0] -; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovzxw_q_256: ; X64: # %bb.0: -; X64-NEXT: vpmovzxwq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x34,0xd0] -; X64-NEXT: # ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x34,0xc8] ; X64-NEXT: # ymm1 {%k1} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) + ret <4 x i64> %res +} + +define <4 x 
i64>@test_int_x86_avx512_maskz_pmovzxw_q_256(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovzxw_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovzxwq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x34,0xc0] +; X86-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovzxw_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovzxwq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x34,0xc0] ; X64-NEXT: # ymm0 {%k1} {z} = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovzxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmovsxb_d_128(<16 x i8> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pmovsxb_d_128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_d_128: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxbd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x21,0xc8] -; X86-NEXT: vpmovsxbd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x21,0xc0] -; X86-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_d_128: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x21,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxbd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x21,0xc8] -; X64-NEXT: vpmovsxbd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x21,0xc0] -; X64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0xfe,0xc2] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> %x1, i8 -1) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmovsxb_d_128(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxbd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x21,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxbd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x21,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxb.d.128(<16 x i8> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmovsxb_d_256(<16 x i8> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmovsxb_d_256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_d_256: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxbd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x21,0xc8] -; X86-NEXT: vpmovsxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x21,0xc0] -; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_d_256: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x21,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxbd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x21,0xc8] -; X64-NEXT: vpmovsxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x21,0xc0] -; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX 
TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> %x1, i8 -1) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmovsxb_d_256(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_d_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x21,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_d_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxbd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x21,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxb.d.256(<16 x i8> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_ask_pmovsxb_q_128(<16 x i8> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_ask_pmovsxb_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pmovsxb_q_128(<16 x i8> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_q_128: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxbq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x22,0xc8] -; X86-NEXT: vpmovsxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x22,0xc0] -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_q_128: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x22,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxbq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x22,0xc8] -; X64-NEXT: vpmovsxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x22,0xc0] -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x 
i8> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2) - %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> %x1, i8 -1) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res3, %res2 - ret <2 x i64> %res4 + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pmovsxb_q_128(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_q_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x22,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxbq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x22,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxb.q.128(<16 x i8> %x0, <2 x i64> zeroinitializer, i8 %x2) + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_pmovsxb_q_256(<16 x i8> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxb_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbq %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_pmovsxb_q_256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxb_q_256: ; X86: # %bb.0: -; X86-NEXT: vpmovsxbq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxbq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x22,0xc8] -; X86-NEXT: vpmovsxbq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x22,0xc0] -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxb_q_256: ; X64: # %bb.0: -; X64-NEXT: vpmovsxbq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x22,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxbq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x22,0xc8] -; X64-NEXT: vpmovsxbq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x22,0xc0] -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2) - %res2 = call <4 
x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> %x1, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_pmovsxb_q_256(<16 x i8> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxb_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxbq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x22,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxb_q_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxbq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x22,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxb.q.256(<16 x i8> %x0, <4 x i64> zeroinitializer, i8 %x2) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmovsxw_d_128(<8 x i16> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxw_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxwd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x23,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pmovsxw_d_128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_d_128: ; X86: # %bb.0: -; X86-NEXT: vpmovsxwd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x23,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxwd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x23,0xc8] -; X86-NEXT: vpmovsxwd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x23,0xc0] -; X86-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_d_128: ; X64: # %bb.0: -; X64-NEXT: vpmovsxwd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x23,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxwd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x23,0xc8] -; X64-NEXT: vpmovsxwd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x23,0xc0] -; X64-NEXT: vpaddd %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> %x1, i8 -1) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 
x i32> %res4 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmovsxw_d_128(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxw_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxwd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x23,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxw_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxwd %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x23,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovsxw.d.128(<8 x i16> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_pmovsxw_d_256(<8 x i16> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxw_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxwd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_pmovsxw_d_256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_d_256: ; X86: # %bb.0: -; X86-NEXT: vpmovsxwd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsxwd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x23,0xc8] -; X86-NEXT: vpmovsxwd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x23,0xc0] -; X86-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_d_256: ; X64: # %bb.0: -; X64-NEXT: vpmovsxwd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x23,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovsxwd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x23,0xc8] -; X64-NEXT: vpmovsxwd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x23,0xc0] -; X64-NEXT: vpaddd %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> %x1, i8 -1) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_pmovsxw_d_256(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: 
test_int_x86_avx512_maskz_pmovsxw_d_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxwd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x23,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxw_d_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxwd %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x23,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.pmovsxw.d.256(<8 x i16> %x0, <8 x i32> zeroinitializer, i8 %x2) + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_pmovsxw_q_128(<8 x i16> %x0, <2 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxw_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x24,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_pmovsxw_q_128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_q_128: ; X86: # %bb.0: -; X86-NEXT: vpmovsxwq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x24,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x24,0xc8] -; X86-NEXT: vpmovsxwq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x24,0xc0] -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X86-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X86-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x24,0xc8] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x24,0xc8] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) + ret <2 x i64> %res +} + +define <2 x i64>@test_int_x86_avx512_maskz_pmovsxw_q_128(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxw_q_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxwq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x24,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovsxw_q_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsxwq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x24,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2) + ret <2 x i64> %res +} + +declare <4 x i64> 
@llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16>, <4 x i64>, i8) + +define <4 x i64>@test_int_x86_avx512_pmovsxw_q_256(<8 x i16> %x0, <4 x i64> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovsxw_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxwq %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x24,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1) + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_mask_pmovsxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x24,0xc8] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_q_128: +; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256: ; X64: # %bb.0: -; X64-NEXT: vpmovsxwq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x24,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpmovsxwq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x24,0xc8] -; X64-NEXT: vpmovsxwq %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x24,0xc0] -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc2] -; X64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; X64-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x24,0xc8] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 %x2) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> zeroinitializer, i8 %x2) - %res2 = call <2 x i64> @llvm.x86.avx512.mask.pmovsxw.q.128(<8 x i16> %x0, <2 x i64> %x1, i8 -1) - %res3 = add <2 x i64> %res, %res1 - %res4 = add <2 x i64> %res3, %res2 - ret <2 x i64> %res4 + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) + ret <4 x i64> %res } -declare <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16>, <4 x i64>, i8) - -define <4 x i64>@test_int_x86_avx512_mask_pmovsxw_q_256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) { -; X86-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256: +define <4 x i64>@test_int_x86_avx512_maskz_pmovsxw_q_256(<8 x i16> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovsxw_q_256: ; X86: # %bb.0: -; X86-NEXT: vpmovsxwq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x24,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x24,0xc8] ; X86-NEXT: vpmovsxwq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x24,0xc0] -; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X86-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_pmovsxw_q_256: +; X64-LABEL: 
test_int_x86_avx512_maskz_pmovsxw_q_256: ; X64: # %bb.0: -; X64-NEXT: vpmovsxwq %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x24,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpmovsxwq %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x24,0xc8] ; X64-NEXT: vpmovsxwq %xmm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x24,0xc0] -; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc2] -; X64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 %x2) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> %x1, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + %res = call <4 x i64> @llvm.x86.avx512.mask.pmovsxw.q.256(<8 x i16> %x0, <4 x i64> zeroinitializer, i8 %x2) + ret <4 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.psra.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) @@ -7426,445 +8344,607 @@ define <4 x i64>@test_int_x86_avx512_mask_psrav_q_256(<4 x i64> %x0, <4 x i64> % declare <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_cvt_dq2pd_128(<4 x i32> %x0, <2 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_dq2pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2pd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_cvt_dq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_128: ; X86: # %bb.0: -; X86-NEXT: vcvtdq2pd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0xe6,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtdq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0xe6,0xc8] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_128: ; X64: # %bb.0: -; X64-NEXT: vcvtdq2pd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0xe6,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0xe6,0xc8] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) - %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtdq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32>, <4 x double>, i8) +define <4 x 
double>@test_int_x86_avx512_cvt_dq2pd_256(<4 x i32> %x0, <4 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_dq2pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2pd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_cvt_dq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_256: ; X86: # %bb.0: -; X86-NEXT: vcvtdq2pd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0xe6,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtdq2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0xe6,0xc8] -; X86-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2pd_256: ; X64: # %bb.0: -; X64-NEXT: vcvtdq2pd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0xe6,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0xe6,0xc8] -; X64-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) - %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtdq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_cvt_udq2pd_128(<4 x i32> %x0, <2 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_udq2pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2pd %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7e,0x08,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_cvt_udq2pd_128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_128: ; X86: # %bb.0: -; X86-NEXT: vcvtudq2pd %xmm0, %xmm2 # encoding: [0x62,0xf1,0x7e,0x08,0x7a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x7a,0xc8] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_128: ; X64: # %bb.0: -; X64-NEXT: vcvtudq2pd %xmm0, %xmm2 # encoding: [0x62,0xf1,0x7e,0x08,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtudq2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x7a,0xc8] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX 
Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 %x2) - %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtudq2pd.128(<4 x i32> %x0, <2 x double> %x1, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_cvt_udq2pd_256(<4 x i32> %x0, <4 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_udq2pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2pd %xmm0, %ymm0 # encoding: [0x62,0xf1,0x7e,0x28,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_cvt_udq2pd_256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_256: ; X86: # %bb.0: -; X86-NEXT: vcvtudq2pd %xmm0, %ymm2 # encoding: [0x62,0xf1,0x7e,0x28,0x7a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x7a,0xc8] -; X86-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2pd_256: ; X64: # %bb.0: -; X64-NEXT: vcvtudq2pd %xmm0, %ymm2 # encoding: [0x62,0xf1,0x7e,0x28,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtudq2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x7a,0xc8] -; X64-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 %x2) - %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtudq2pd.256(<4 x i32> %x0, <4 x double> %x1, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32>, <4 x i32>, i32, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_valign_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_valign_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpalignr $8, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xc1,0x08] +; CHECK-NEXT: # xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32> %x1,<4 x i32> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_valign_d_128: ; X86: # %bb.0: -; X86-NEXT: vpalignr $8, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x08] -; X86-NEXT: # xmm3 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), 
%eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02] ; X86-NEXT: # xmm2 {%k1} = xmm1[2,3],xmm0[0,1] -; X86-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02] -; X86-NEXT: # xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1] -; X86-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_valign_d_128: ; X64: # %bb.0: -; X64-NEXT: vpalignr $8, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x08] -; X64-NEXT: # xmm3 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02] ; X64-NEXT: # xmm2 {%k1} = xmm1[2,3],xmm0[0,1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_valign_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_valign_d_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02] +; X86-NEXT: # xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_valign_d_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02] ; X64-NEXT: # xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1] -; X64-NEXT: vpaddd %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> zeroinitializer,i8 %x4) - %res3 = add <4 x i32> %res, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> zeroinitializer,i8 %x4) + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32>, <8 x i32>, i32, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_valign_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x03,0xc1,0x03] +; CHECK-NEXT: # ymm0 = ymm1[3],ymm0[0,1,2] +; CHECK-NEXT: ret{{[l|q]}} # 
encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 6, <8 x i32> %x3, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_valign_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_valign_d_256: ; X86: # %bb.0: -; X86-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03] -; X86-NEXT: # ymm3 = ymm1[3],ymm0[0,1,2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: valignd $6, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x06] ; X86-NEXT: # ymm2 {%k1} = ymm1[6,7],ymm0[0,1,2,3,4,5] -; X86-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_valign_d_256: ; X64: # %bb.0: -; X64-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03] -; X64-NEXT: # ymm3 = ymm1[3],ymm0[0,1,2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: valignd $6, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x03,0xd1,0x06] ; X64-NEXT: # ymm2 {%k1} = ymm1[6,7],ymm0[0,1,2,3,4,5] -; X64-NEXT: vpaddd %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 6, <8 x i32> %x3, i8 %x4) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.valign.d.256(<8 x i32> %x0, <8 x i32> %x1, i32 6, <8 x i32> %x3, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64>, <2 x i64>, i32, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_valign_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_valign_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpalignr $8, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xc1,0x08] +; CHECK-NEXT: # xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 1, <2 x i64> %x3, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_valign_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_valign_q_128: ; X86: # %bb.0: -; X86-NEXT: vpalignr $8, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x08] -; X86-NEXT: # xmm3 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: valignq $1, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x03,0xd1,0x01] ; X86-NEXT: # xmm2 {%k1} = xmm1[1],xmm0[0] -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_valign_q_128: ; X64: # %bb.0: -; 
X64-NEXT: vpalignr $8, %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x08] -; X64-NEXT: # xmm3 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: valignq $1, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x09,0x03,0xd1,0x01] ; X64-NEXT: # xmm2 {%k1} = xmm1[1],xmm0[0] -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 1, <2 x i64> %x3, i8 %x4) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.valign.q.128(<2 x i64> %x0, <2 x i64> %x1, i32 1, <2 x i64> %x3, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64>, <4 x i64>, i32, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_valign_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3) { +; CHECK-LABEL: test_int_x86_avx512_valign_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: valignq $3, %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf3,0xfd,0x28,0x03,0xc1,0x03] +; CHECK-NEXT: # ymm0 = ymm1[3],ymm0[0,1,2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 3, <4 x i64> %x3, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_valign_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_valign_q_256: ; X86: # %bb.0: -; X86-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03] -; X86-NEXT: # ymm3 = ymm1[3],ymm0[0,1,2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: valignq $3, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x03,0xd1,0x03] ; X86-NEXT: # ymm2 {%k1} = ymm1[3],ymm0[0,1,2] -; X86-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_valign_q_256: ; X64: # %bb.0: -; X64-NEXT: valignq $3, %ymm1, %ymm0, %ymm3 # encoding: [0x62,0xf3,0xfd,0x28,0x03,0xd9,0x03] -; X64-NEXT: # ymm3 = ymm1[3],ymm0[0,1,2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: valignq $3, %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0xfd,0x29,0x03,0xd1,0x03] ; X64-NEXT: # ymm2 {%k1} = ymm1[3],ymm0[0,1,2] -; X64-NEXT: vpaddq %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc3] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 3, <4 x i64> %x3, i8 %x4) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.valign.q.256(<4 x i64> %x0, <4 x i64> %x1, i32 3, <4 x i64> %x3, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_vpermilvar_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { +; CHECK-LABEL: 
test_int_x86_avx512_vpermilvar_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_vpermilvar_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_256: ; X86: # %bb.0: -; X86-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0d,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilpd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x0d,0xd1] -; X86-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x0d,0xc1] -; X86-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X86-NEXT: vaddpd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] +; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_256: ; X64: # %bb.0: -; X64-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0d,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x0d,0xd1] -; X64-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x0d,0xc1] -; X64-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X64-NEXT: vaddpd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] +; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3) - %res2 = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res2, %res3 - ret <4 x double> %res4 + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_vpermilvar_pd_256(<4 x double> %x0, <4 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermilvar_pd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x0d,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermilvar_pd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x0d,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.vpermilvar.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3) + ret <4 x double> %res } declare <2 x double> 
@llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_vpermilvar_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0d,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_vpermilvar_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_128: ; X86: # %bb.0: -; X86-NEXT: vpermilpd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0d,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilpd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x0d,0xd1] -; X86-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x0d,0xc1] -; X86-NEXT: vaddpd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0] -; X86-NEXT: vaddpd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3] +; X86-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_pd_128: ; X64: # %bb.0: -; X64-NEXT: vpermilpd %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0d,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilpd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x0d,0xd1] -; X64-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x0d,0xc1] -; X64-NEXT: vaddpd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0] -; X64-NEXT: vaddpd %xmm3, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3] +; X64-NEXT: vmovapd %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> zeroinitializer, i8 %x3) - %res2 = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) - %res3 = fadd <2 x double> %res, %res1 - %res4 = fadd <2 x double> %res3, %res2 - ret <2 x double> %res4 + ret <2 x double> %res +} + +define <2 x double>@test_int_x86_avx512_maskz_vpermilvar_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermilvar_pd_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x0d,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermilvar_pd_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermilpd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x0d,0xc1] +; X64-NEXT: retq 
# encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vpermilvar.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> zeroinitializer, i8 %x3) + ret <2 x double> %res } declare <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_vpermilvar_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_vpermilvar_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_256: ; X86: # %bb.0: -; X86-NEXT: vpermilps %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0c,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x0c,0xd1] -; X86-NEXT: vpermilps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x0c,0xc1] -; X86-NEXT: vaddps %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; X86-NEXT: vaddps %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_256: ; X64: # %bb.0: -; X64-NEXT: vpermilps %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0c,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x0c,0xd1] -; X64-NEXT: vpermilps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x0c,0xc1] -; X64-NEXT: vaddps %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; X64-NEXT: vaddps %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3) - %res2 = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res3, %res2 - ret <8 x float> %res4 + ret <8 x float> %res +} + +define <8 x float>@test_int_x86_avx512_maskz_vpermilvar_ps_256(<8 x float> %x0, <8 x i32> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermilvar_ps_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x0c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermilvar_ps_256: +; X64: # %bb.0: +; X64-NEXT: kmovw 
%edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermilps %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x0c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vpermilvar.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3) + ret <8 x float> %res } declare <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_vpermilvar_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermilvar_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermilps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0xc1] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_vpermilvar_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_128: ; X86: # %bb.0: -; X86-NEXT: vpermilps %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0xd9] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermilps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x0c,0xd1] -; X86-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x0c,0xc1] -; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc0] +; X86-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermilvar_ps_128: ; X64: # %bb.0: -; X64-NEXT: vpermilps %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0c,0xd9] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermilps %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x0c,0xd1] -; X64-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x0c,0xc1] -; X64-NEXT: vaddps %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm3, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc0] +; X64-NEXT: vmovaps %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> zeroinitializer, i8 %x3) - %res2 = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) - %res3 = fadd <4 x float> %res, %res1 - %res4 = fadd <4 x float> %res2, %res3 - ret <4 x float> %res4 + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_maskz_vpermilvar_ps_128(<4 x float> %x0, <4 x i32> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vpermilvar_ps_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: 
[0x62,0xf2,0x7d,0x89,0x0c,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_vpermilvar_ps_128: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermilps %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x0c,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vpermilvar.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> zeroinitializer, i8 %x3) + ret <4 x float> %res } declare <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float>, i32, <4 x float>, i8) -define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) { -; X86-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: +define <4 x float>@test_int_x86_avx512_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vextractf32x4_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc0,0x01] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1) + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_mask_vextractf32x4_256(<8 x float> %x0, <4 x float> %x2, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3) + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_maskz_vextractf32x4_256(<8 x float> %x0, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_vextractf32x4_256: ; X86: # %bb.0: -; X86-NEXT: vextractf128 $1, %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] ; X86-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc0,0x01] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_vextractf32x4_256: +; X64-LABEL: test_int_x86_avx512_maskz_vextractf32x4_256: ; X64: # %bb.0: -; X64-NEXT: vextractf128 $1, %ymm0, 
%xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vextractf32x4 $1, %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x19,0xc1,0x01] ; X64-NEXT: vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x19,0xc0,0x01] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3) - %res2 = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 -1) - %res3 = fadd <4 x float> %res, %res1 - %res4 = fadd <4 x float> %res2, %res3 - ret <4 x float> %res4 + %res = call <4 x float> @llvm.x86.avx512.mask.vextractf32x4.256(<8 x float> %x0, i32 1, <4 x float> zeroinitializer, i8 %x3) + ret <4 x float> %res } declare <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float>, <4 x float>, i32, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3) { +; CHECK-LABEL: test_int_x86_avx512_insertf32x4_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, <8 x float> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_insertf32x4_256: ; X86: # %bb.0: -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01] -; X86-NEXT: vaddps %ymm3, %ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3] -; X86-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] -; X86-NEXT: vaddps %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_insertf32x4_256: ; X64: # %bb.0: -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01] -; X64-NEXT: vaddps %ymm3, %ymm2, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3] -; X64-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] -; X64-NEXT: vaddps %ymm2, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX 
Compression encoding: [0xc5,0xfc,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4) - %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) - %res2 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res2, %res3 - ret <8 x float> %res4 + ret <8 x float> %res +} + +define <8 x float>@test_int_x86_avx512_maskz_insertf32x4_256(<8 x float> %x0, <4 x float> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_insertf32x4_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_insertf32x4_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> zeroinitializer, i8 %x4) + ret <8 x float> %res } declare <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32>, <4 x i32>, i32, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3) { +; CHECK-LABEL: test_int_x86_avx512_inserti32x4_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x3, i8 %x4) { ; X86-LABEL: test_int_x86_avx512_mask_inserti32x4_256: ; X86: # %bb.0: -; X86-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01] -; X86-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] -; X86-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] -; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_inserti32x4_256: ; X64: # %bb.0: -; X64-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01] -; X64-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: 
[0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] -; X64-NEXT: vpaddd %ymm0, %ymm3, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0] -; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 -1) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> zeroinitializer, i8 %x4) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res2, %res3 - ret <8 x i32> %res4 + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_inserti32x4_256(<8 x i32> %x0, <4 x i32> %x1, i8 %x4) { +; X86-LABEL: test_int_x86_avx512_maskz_inserti32x4_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_inserti32x4_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> zeroinitializer, i8 %x4) + ret <8 x i32> %res } define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) { @@ -9011,35 +10091,54 @@ declare i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounw declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_broadcastf32x4_256(<4 x float> %x0, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcastf32x4_256: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256(<4 x float> %x0, <8 x float> %x2, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256: ; X86: # %bb.0: ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x18,0xc8,0x01] -; X86-NEXT: vaddps %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] -; X86-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc0,0x01] -; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression 
encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcastf32x4_256: ; X64: # %bb.0: ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x18,0xc8,0x01] -; X64-NEXT: vaddps %ymm1, %ymm2, %ymm1 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask) + ret <8 x float> %res +} + +define <8 x float>@test_int_x86_avx512_maskz_broadcastf32x4_256(<4 x float> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcastf32x4_256: +; X86: # %bb.0: +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcastf32x4_256: +; X64: # %bb.0: +; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinsertf32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc0,0x01] -; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 -1) - %res2 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> %x2, i8 %mask) - %res3 = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %mask) - %res4 = fadd <8 x float> %res1, %res2 - %res5 = fadd <8 x float> %res3, %res4 - ret <8 x float> %res5 + %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x4.256(<4 x float> %x0, <8 x float> zeroinitializer, i8 %mask) + ret <8 x float> %res } define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256_load(<4 x float>* %x0ptr, <8 x float> %x2, i8 %mask) { @@ -9065,35 +10164,54 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x4_256_load(<4 x float>* declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_broadcasti32x4_256(<4 x i32> %x0, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_broadcasti32x4_256: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xc0,0x01] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) { ; X86-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256: ; X86: # %bb.0: ; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X86-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: 
[0xc4,0xe3,0x7d,0x38,0xd0,0x01] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x38,0xc8,0x01] -; X86-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc0,0x01] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_broadcasti32x4_256: ; X64: # %bb.0: ; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; X64-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd0,0x01] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf3,0x7d,0x29,0x38,0xc8,0x01] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_maskz_broadcasti32x4_256(<4 x i32> %x0, i8 %mask) { +; X86-LABEL: test_int_x86_avx512_maskz_broadcasti32x4_256: +; X86: # %bb.0: +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc0,0x01] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_broadcasti32x4_256: +; X64: # %bb.0: +; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vinserti32x4 $1, %xmm0, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc0,0x01] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] -; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res1 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 -1) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> %x2, i8 %mask) - %res3 = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask) - %res4 = add <8 x i32> %res1, %res2 - %res5 = add <8 x i32> %res3, %res4 - ret <8 x i32> %res5 + %res = call <8 x i32> @llvm.x86.avx512.mask.broadcasti32x4.256(<4 x i32> %x0, <8 x i32> zeroinitializer, i8 %mask) + ret <8 x i32> %res } define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x4_256_load(<4 x i32>* %x0ptr, <8 x i32> %x2, i8 %mask) { @@ -10199,52 +11317,62 @@ declare < 4 x i64> @llvm.x86.avx512.mask.pmulu.dq.256(< 8 x i32>, < 8 x i32>, < declare <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_cvt_dq2ps_128(<4 x i32> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_dq2ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5b,0xc0] +; 
CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_cvt_dq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_128: ; X86: # %bb.0: -; X86-NEXT: vcvtdq2ps %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5b,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtdq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x5b,0xc8] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_128: ; X64: # %bb.0: -; X64-NEXT: vcvtdq2ps %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5b,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x5b,0xc8] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtdq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } declare <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_cvt_dq2ps_256(<8 x i32> %x0, <8 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_dq2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtdq2ps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_cvt_dq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_256: ; X86: # %bb.0: -; X86-NEXT: vcvtdq2ps %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5b,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtdq2ps %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x5b,0xc8] -; X86-NEXT: vaddps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_dq2ps_256: ; X64: # %bb.0: -; X64-NEXT: vcvtdq2ps %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5b,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtdq2ps %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x5b,0xc8] -; X64-NEXT: vaddps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> 
%x0, <8 x float> %x1, i8 %x2) - %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtdq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1) - %res2 = fadd <8 x float> %res, %res1 - ret <8 x float> %res2 + ret <8 x float> %res } define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) { @@ -10343,307 +11471,409 @@ declare <8 x float> @llvm.x86.avx512.mask.vcvtph2ps.256(<8 x i16>, <8 x float>, declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2dq %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_256: ; X86: # %bb.0: -; X86-NEXT: vcvtpd2dq %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0xe6,0xc8] -; X86-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_256: ; X64: # %bb.0: -; X64-NEXT: vcvtpd2dq %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x29,0xe6,0xc8] -; X64-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_cvt_pd2ps_256(<4 x double> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2ps %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_256(<4 x double> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_256: ; X86: # %bb.0: -; X86-NEXT: vcvtpd2ps %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: 
[0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x5a,0xc8] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_256: ; X64: # %bb.0: -; X64-NEXT: vcvtpd2ps %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2ps %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0x5a,0xc8] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps.256(<4 x double> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } declare <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_cvt_ps2pd_256(<4 x float> %x0, <4 x double> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2pd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_cvt_ps2pd_256(<4 x float> %x0, <4 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_256: ; X86: # %bb.0: -; X86-NEXT: vcvtps2pd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x5a,0xc8] -; X86-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_256: ; X64: # %bb.0: -; X64-NEXT: vcvtps2pd %xmm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2pd %xmm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x5a,0xc8] -; X64-NEXT: vaddpd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 %x2) - %res1 = call <4 x double> @llvm.x86.avx512.mask.cvtps2pd.256(<4 x float> %x0, <4 x double> %x1, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_cvt_ps2pd_128(<4 x float> %x0, <2 x double> %x1) { +; CHECK-LABEL: 
test_int_x86_avx512_cvt_ps2pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2pd %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> %x1, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_cvt_ps2pd_128(<4 x float> %x0, <2 x double> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_128: ; X86: # %bb.0: -; X86-NEXT: vcvtps2pd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x5a,0xc8] -; X86-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2pd_128: ; X64: # %bb.0: -; X64-NEXT: vcvtps2pd %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2pd %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x5a,0xc8] -; X64-NEXT: vaddpd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> %x1, i8 %x2) - %res1 = call <2 x double> @llvm.x86.avx512.mask.cvtps2pd.128(<4 x float> %x0, <2 x double> %x1, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvtt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dq %ymm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_256: ; X86: # %bb.0: -; X86-NEXT: vcvttpd2dq %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xe6,0xc8] -; X86-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_256: ; X64: # %bb.0: -; X64-NEXT: vcvttpd2dq %ymm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2dq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xe6,0xc8] -; X64-NEXT: 
vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvtt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128: ; X86: # %bb.0: -; X86-NEXT: vcvttps2dq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x5b,0xc8] -; X86-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_128: ; X64: # %bb.0: -; X64-NEXT: vcvttps2dq %xmm0, %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x5b,0xc8] -; X64-NEXT: vpaddd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_cvtt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2dq %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256: ; X86: # %bb.0: -; X86-NEXT: vcvttps2dq %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1} # 
encoding: [0x62,0xf1,0x7e,0x29,0x5b,0xc8] -; X86-NEXT: vpaddd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2dq_256: ; X64: # %bb.0: -; X64-NEXT: vcvttps2dq %ymm0, %ymm2 # EVEX TO VEX Compression encoding: [0xc5,0xfe,0x5b,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7e,0x29,0x5b,0xc8] -; X64-NEXT: vpaddd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float>, <8 x i32>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_permvar_sf_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_permvar_sf_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_permvar_sf_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_sf_256: ; X86: # %bb.0: -; X86-NEXT: vpermps %ymm0, %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x16,0xd0] +; X86-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_permvar_sf_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x16,0xd0] +; X64-NEXT: vmovaps %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) + ret <8 x float> %res +} + +define <8 x float>@test_int_x86_avx512_maskz_permvar_sf_256(<8 x float> %x0, <8 x i32> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_sf_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermps %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x16,0xc0] -; X86-NEXT: vaddps %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; X86-NEXT: vaddps %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_permvar_sf_256: +; X64-LABEL: 
test_int_x86_avx512_maskz_permvar_sf_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x16,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3) + ret <8 x float> %res +} + +declare <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) + +define <8 x i32>@test_int_x86_avx512_permvar_si_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_si_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + +define <8 x i32>@test_int_x86_avx512_mask_permvar_si_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_mask_permvar_si_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x36,0xd0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_permvar_si_256: ; X64: # %bb.0: -; X64-NEXT: vpermps %ymm0, %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xd8] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermps %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x16,0xd0] -; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x16,0xc0] -; X64-NEXT: vaddps %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0] -; X64-NEXT: vaddps %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3] +; X64-NEXT: vpermd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x36,0xd0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3) - %res2 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) - %res3 = fadd <8 x float> %res, %res1 - %res4 = fadd <8 x float> %res3, %res2 - ret <8 x float> %res4 + %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) + ret <8 x i32> %res } -declare <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) - -define <8 x i32>@test_int_x86_avx512_mask_permvar_si_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { -; X86-LABEL: test_int_x86_avx512_mask_permvar_si_256: +define <8 x i32>@test_int_x86_avx512_maskz_permvar_si_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_si_256: ; X86: # %bb.0: -; X86-NEXT: vpermd %ymm0, %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 
encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x36,0xd0] ; X86-NEXT: vpermd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x36,0xc0] -; X86-NEXT: vpaddd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] -; X86-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] ; X86-NEXT: retl # encoding: [0xc3] ; -; X64-LABEL: test_int_x86_avx512_mask_permvar_si_256: +; X64-LABEL: test_int_x86_avx512_maskz_permvar_si_256: ; X64: # %bb.0: -; X64-NEXT: vpermd %ymm0, %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xd8] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0x75,0x29,0x36,0xd0] ; X64-NEXT: vpermd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x36,0xc0] -; X64-NEXT: vpaddd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3] -; X64-NEXT: vpaddd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0] ; X64-NEXT: retq # encoding: [0xc3] - %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) - %res2 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res3 = add <8 x i32> %res, %res1 - %res4 = add <8 x i32> %res3, %res2 - ret <8 x i32> %res4 + %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3) + ret <8 x i32> %res } declare <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double>, <4 x i64>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_df_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_df_256: ; X86: # %bb.0: -; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x16,0xd0] -; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] -; X86-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X86-NEXT: vaddpd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3] +; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_df_256: ; X64: # %bb.0: -; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xd8] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermpd 
%ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x16,0xd0] -; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] -; X64-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X64-NEXT: vaddpd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3] +; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3) - %res2 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) - %res3 = fadd <4 x double> %res, %res1 - %res4 = fadd <4 x double> %res3, %res2 - ret <4 x double> %res4 + ret <4 x double> %res +} + +define <4 x double>@test_int_x86_avx512_maskz_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_df_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_df_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3) + ret <4 x double> %res } declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_di_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_di_256: ; X86: # %bb.0: -; X86-NEXT: vpermq %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x36,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermq %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x36,0xd0] -; X86-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] -; X86-NEXT: vpaddq %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_di_256: ; X64: # %bb.0: -; X64-NEXT: vpermq %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x36,0xd8] ; X64-NEXT: kmovw %edi, %k1 # encoding: 
[0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermq %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x36,0xd0] -; X64-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] -; X64-NEXT: vpaddq %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) - %res2 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) - %res3 = add <4 x i64> %res, %res1 - %res4 = add <4 x i64> %res3, %res2 - ret <4 x i64> %res4 + ret <4 x i64> %res +} + +define <4 x i64>@test_int_x86_avx512_maskz_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_di_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_di_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3) + ret <4 x i64> %res } declare <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i32, i8) @@ -10864,106 +12094,122 @@ define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i6 declare <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_cvt_udq2ps_128(<4 x i32> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_udq2ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2ps %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7f,0x08,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_cvt_udq2ps_128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_128: ; X86: # %bb.0: -; X86-NEXT: vcvtudq2ps %xmm0, %xmm2 # encoding: [0x62,0xf1,0x7f,0x08,0x7a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x09,0x7a,0xc8] -; X86-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_128: ; X64: # %bb.0: -; X64-NEXT: vcvtudq2ps %xmm0, %xmm2 # encoding: [0x62,0xf1,0x7f,0x08,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtudq2ps %xmm0, %xmm1 {%k1} # encoding: 
[0x62,0xf1,0x7f,0x09,0x7a,0xc8] -; X64-NEXT: vaddps %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc2] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtudq2ps.128(<4 x i32> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } declare <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_cvt_udq2ps_256(<8 x i32> %x0, <8 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_udq2ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtudq2ps %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7f,0x28,0x7a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_cvt_udq2ps_256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_256: ; X86: # %bb.0: -; X86-NEXT: vcvtudq2ps %ymm0, %ymm2 # encoding: [0x62,0xf1,0x7f,0x28,0x7a,0xd0] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtudq2ps %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x7a,0xc8] -; X86-NEXT: vaddps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc2] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_udq2ps_256: ; X64: # %bb.0: -; X64-NEXT: vcvtudq2ps %ymm0, %ymm2 # encoding: [0x62,0xf1,0x7f,0x28,0x7a,0xd0] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtudq2ps %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7f,0x29,0x7a,0xc8] -; X64-NEXT: vaddps %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc2] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 %x2) - %res1 = call <8 x float> @llvm.x86.avx512.mask.cvtudq2ps.256(<8 x i32> %x0, <8 x float> %x1, i8 -1) - %res2 = fadd <8 x float> %res, %res1 - ret <8 x float> %res2 + ret <8 x float> %res } declare <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2d %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: 
[0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x76,0xca] -; X86-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2d %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x76,0xca] -; X64-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xca] -; X86-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xca] -; X64-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> 
%x1, <4 x i32> %x2, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8) @@ -10971,80 +12217,78 @@ declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>, define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xca] -; X86-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X86-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xca] -; X64-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X64-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) - %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2d %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x76,0xca] -; X86-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_256: ; X64: # %bb.0: -; 
X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2d %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x76,0xca] -; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xca] -; X86-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xca] -; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8) @@ -11052,107 +12296,108 @@ declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>, define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256: ; 
X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xca] -; X86-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X86-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xca] -; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X64-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) - %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>, i8) +define <2 x double>@test_int_x86_avx512_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) + ret <2 x double> %res +} + define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128: ; X86: # %bb.0: -; X86-NEXT: vmovapd %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8] -; X86-NEXT: vpermt2pd %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x77,0xca] -; X86-NEXT: vaddpd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3] +; X86-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128: ; X64: # %bb.0: -; X64-NEXT: vmovapd %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8] -; X64-NEXT: vpermt2pd %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x77,0xca] -; X64-NEXT: vaddpd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3] +; X64-NEXT: 
vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) - %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1) - %res2 = fadd <2 x double> %res, %res1 - ret <2 x double> %res2 + ret <2 x double> %res } declare <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>, i8) +define <4 x double>@test_int_x86_avx512_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) + ret <4 x double> %res +} + define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256: ; X86: # %bb.0: -; X86-NEXT: vmovapd %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8] -; X86-NEXT: vpermt2pd %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x77,0xca] -; X86-NEXT: vaddpd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3] +; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256: ; X64: # %bb.0: -; X64-NEXT: vmovapd %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8] -; X64-NEXT: vpermt2pd %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x77,0xca] -; X64-NEXT: vaddpd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3] +; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) - %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1) - %res2 = fadd <4 x double> %res, %res1 - ret <4 x double> %res2 + ret <4 x double> %res } declare <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) { ; X86-LABEL: 
test_int_x86_avx512_mask_vpermi2var_ps_128: ; X86: # %bb.0: -; X86-NEXT: vmovaps %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8] -; X86-NEXT: vpermt2ps %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca] -; X86-NEXT: vaddps %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128: ; X64: # %bb.0: -; X64-NEXT: vmovaps %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8] -; X64-NEXT: vpermt2ps %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca] -; X64-NEXT: vaddps %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) - %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1) - %res2 = fadd <4 x float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res } define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> %x0, <2 x i64> %x1, <4 x float> %x2, i8 %x3) { @@ -11177,83 +12422,92 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> % declare <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>, i8) +define <8 x float>@test_int_x86_avx512_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) + ret <8 x float> %res +} + define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256: ; X86: # %bb.0: -; X86-NEXT: vmovaps %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8] -; X86-NEXT: vpermt2ps %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x77,0xca] -; X86-NEXT: vaddps %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256: ; X64: # %bb.0: -; X64-NEXT: vmovaps %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8] -; X64-NEXT: vpermt2ps %ymm2, %ymm1, %ymm3 # encoding: 
[0x62,0xf2,0x75,0x28,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x77,0xca] -; X64-NEXT: vaddps %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) - %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1) - %res2 = fadd <8 x float> %res, %res1 - ret <8 x float> %res2 + ret <8 x float> %res } declare <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2q %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x76,0xca] -; X86-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2q %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x76,0xca] -; X64-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) +define <2 x i64>@test_int_x86_avx512_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2q %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 
x i64> %x2, i8 -1) + ret <2 x i64> %res +} + define <2 x i64>@test_int_x86_avx512_mask_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_q_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7e,0xca] -; X86-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_q_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7e,0xca] -; X64-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <2 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) @@ -11261,80 +12515,78 @@ declare <2 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.128(<2 x i64>, <2 x i64>, define <2 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7e,0xca] -; X86-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X86-NEXT: vpermi2q %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7e,0xca] -; X64-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X64-NEXT: vpermi2q %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.128(<2 x 
i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) - %res1 = call <2 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) - %res2 = add <2 x i64> %res, %res1 - ret <2 x i64> %res2 + ret <2 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2q %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x76,0xca] -; X86-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2q %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x76,0xca] -; X64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) +define <4 x i64>@test_int_x86_avx512_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) + ret <4 x i64> %res +} + define <4 x i64>@test_int_x86_avx512_mask_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_q_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; 
X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7e,0xca] -; X86-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_q_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7e,0xca] -; X64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } declare <4 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) @@ -11342,26 +12594,18 @@ declare <4 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.256(<4 x i64>, <4 x i64>, define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7e,0xca] -; X86-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X86-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7e,0xca] -; X64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X64-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) - %res1 = call <4 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1) - %res2 = add <4 x i64> %res, %res1 - ret <4 x i64> %res2 + ret <4 x i64> %res } define void @test_mask_compress_store_pd_128(i8* %addr, <2 x double> %data, i8 %mask) { @@ -14612,35 +15856,54 @@ define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> declare <4 x i32> 
@llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmov_qd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_256: ; X86: # %bb.0: -; X86-NEXT: vpmovqd %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1] -; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_256: ; X64: # %bb.0: -; X64-NEXT: vpmovqd %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmov_qd_256(<4 x i64> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmov_qd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmov_qd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res3 = add <4 x i32> %res0, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res } define 
<2 x double> @test_mask_compress_pd_128(<2 x double> %data, <2 x double> %passthru, i8 %mask) { diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index 05124592f22d8..2d2f88db3d93c 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -1046,267 +1046,269 @@ declare <8 x float> @llvm.x86.avx512.mask.getexp.ps.256(<8 x float>, <8 x float> declare <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>) +define <4 x i32>@test_int_x86_avx512_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2d %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) + ret <4 x i32> %1 +} + define <4 x i32>@test_int_x86_avx512_mask_vpermi2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2d %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x76,0xca] -; X86-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2d %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x76,0xca] -; X64-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x1 - %4 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) - %res2 = add <4 x i32> %3, %4 - ret <4 x i32> %res2 + ret <4 x i32> %3 +} + +define <4 x i32>@test_int_x86_avx512_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_d_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) + ret <4 x i32> %1 } define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: 
[0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xca] -; X86-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xca] -; X64-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> %x1 - %4 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) - %res2 = add <4 x i32> %3, %4 - ret <4 x i32> %res2 + ret <4 x i32> %3 } define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xca] -; X86-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X86-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xca] -; X64-NEXT: vpaddd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3] +; X64-NEXT: vpermi2d %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer - %4 = call <4 x i32> @llvm.x86.avx512.vpermi2var.d.128(<4 x i32> %x1, <4 x i32> %x0, <4 x i32> %x2) - %res2 = add <4 x i32> %3, %4 - ret <4 x i32> %res2 + ret <4 x i32> %3 } declare <8 x 
i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>) +define <8 x i32>@test_int_x86_avx512_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2d %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) + ret <8 x i32> %1 +} + define <8 x i32>@test_int_x86_avx512_mask_vpermi2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_d_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2d %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x76,0xca] -; X86-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_d_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2d %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x76,0xca] -; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1 - %4 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) - %res2 = add <8 x i32> %3, %4 - ret <8 x i32> %res2 + ret <8 x i32> %3 +} + +define <8 x i32>@test_int_x86_avx512_ask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) { +; CHECK-LABEL: test_int_x86_avx512_ask_vpermt2var_d_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) + ret <8 x i32> %1 } define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xca] -; X86-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; 
X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_d_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xca] -; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1 - %4 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) - %res2 = add <8 x i32> %3, %4 - ret <8 x i32> %res2 + ret <8 x i32> %3 } define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xca] -; X86-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X86-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xca] -; X64-NEXT: vpaddd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3] +; X64-NEXT: vpermi2d %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0xa9,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) %2 = bitcast i8 %x3 to <8 x i1> %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> zeroinitializer - %4 = call <8 x i32> @llvm.x86.avx512.vpermi2var.d.256(<8 x i32> %x1, <8 x i32> %x0, <8 x i32> %x2) - %res2 = add <8 x i32> %3, %4 - ret <8 x i32> %res2 + ret <8 x i32> %3 } declare <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double>, <2 x i64>, <2 x double>) +define <2 x double>@test_int_x86_avx512_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_pd_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2pd %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) + ret <2 x double> %1 +} + define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3) { ; 
X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128: ; X86: # %bb.0: -; X86-NEXT: vmovapd %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8] -; X86-NEXT: vpermt2pd %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x77,0xca] -; X86-NEXT: vaddpd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3] +; X86-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_128: ; X64: # %bb.0: -; X64-NEXT: vmovapd %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8] -; X64-NEXT: vpermt2pd %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x77,0xca] -; X64-NEXT: vaddpd %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3] +; X64-NEXT: vmovapd %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) %2 = bitcast <2 x i64> %x1 to <2 x double> %3 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <2 x i32> %4 = select <2 x i1> %extract, <2 x double> %1, <2 x double> %2 - %5 = call <2 x double> @llvm.x86.avx512.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2) - %6 = bitcast <2 x i64> %x1 to <2 x double> - %res2 = fadd <2 x double> %4, %5 - ret <2 x double> %res2 + ret <2 x double> %4 } declare <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double>, <4 x i64>, <4 x double>) +define <4 x double>@test_int_x86_avx512_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_pd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2pd %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) + ret <4 x double> %1 +} + define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256: ; X86: # %bb.0: -; X86-NEXT: vmovapd %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8] -; X86-NEXT: vpermt2pd %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x77,0xca] -; X86-NEXT: vaddpd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3] +; X86-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_pd_256: ; X64: # %bb.0: -; X64-NEXT: vmovapd %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8] -; X64-NEXT: vpermt2pd %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7f,0xda] ; X64-NEXT: 
kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x77,0xca] -; X64-NEXT: vaddpd %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3] +; X64-NEXT: vmovapd %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) %2 = bitcast <4 x i64> %x1 to <4 x double> %3 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> %4 = select <4 x i1> %extract, <4 x double> %1, <4 x double> %2 - %5 = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) - %6 = bitcast <4 x i64> %x1 to <4 x double> - %res2 = fadd <4 x double> %4, %5 - ret <4 x double> %res2 + ret <4 x double> %4 } declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <4 x float>) +define <4 x float>@test_int_x86_avx512_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_ps_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2ps %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0x75,0x08,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) + ret <4 x float> %1 +} + define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128: ; X86: # %bb.0: -; X86-NEXT: vmovaps %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8] -; X86-NEXT: vpermt2ps %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca] -; X86-NEXT: vaddps %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_128: ; X64: # %bb.0: -; X64-NEXT: vmovaps %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8] -; X64-NEXT: vpermt2ps %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0x75,0x08,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca] -; X64-NEXT: vaddps %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) %2 = bitcast <4 x i32> %x1 to <4 x float> %3 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> %4 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %2 - %5 = call <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2) - %6 = bitcast <4 x i32> %x1 to <4 x float> - %res2 = fadd <4 x float> %4, %5 - ret <4 x float> %res2 + ret <4 x float> %4 } define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> %x0, <2 x 
i64> %x1, <4 x float> %x2, i8 %x3) { @@ -1335,205 +1337,203 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128_cast(<4 x float> % declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>) +define <8 x float>@test_int_x86_avx512_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_ps_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2ps %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0x75,0x28,0x7f,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) + ret <8 x float> %1 +} + define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256: ; X86: # %bb.0: -; X86-NEXT: vmovaps %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8] -; X86-NEXT: vpermt2ps %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7f,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x77,0xca] -; X86-NEXT: vaddps %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_ps_256: ; X64: # %bb.0: -; X64-NEXT: vmovaps %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8] -; X64-NEXT: vpermt2ps %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0x75,0x28,0x7f,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x29,0x77,0xca] -; X64-NEXT: vaddps %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) %2 = bitcast <8 x i32> %x1 to <8 x float> %3 = bitcast i8 %x3 to <8 x i1> %4 = select <8 x i1> %3, <8 x float> %1, <8 x float> %2 - %5 = call <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2) - %6 = bitcast <8 x i32> %x1 to <8 x float> - %res2 = fadd <8 x float> %4, %5 - ret <8 x float> %res2 + ret <8 x float> %4 } declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>) +define <2 x i64>@test_int_x86_avx512_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2q %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) + ret <2 x i64> %1 +} + define <2 x i64>@test_int_x86_avx512_mask_vpermi2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X86-NEXT: vpermt2q %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # 
encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x76,0xca] -; X86-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8] -; X64-NEXT: vpermt2q %xmm2, %xmm1, %xmm3 # encoding: [0x62,0xf2,0xf5,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x76,0xca] -; X64-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1 - %4 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) - %res2 = add <2 x i64> %3, %4 - ret <2 x i64> %res2 + ret <2 x i64> %3 +} + +define <2 x i64>@test_int_x86_avx512_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_q_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2q %xmm2, %xmm1, %xmm0 # encoding: [0x62,0xf2,0xf5,0x08,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) + ret <2 x i64> %1 } define <2 x i64>@test_int_x86_avx512_mask_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_q_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7e,0xca] -; X86-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_q_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x7e,0xca] -; X64-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> 
%3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1 - %4 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) - %res2 = add <2 x i64> %3, %4 - ret <2 x i64> %res2 + ret <2 x i64> %3 } define <2 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_128: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7e,0xca] -; X86-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X86-NEXT: vpermi2q %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_128: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %xmm1, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9] -; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm3 # encoding: [0x62,0xf2,0xfd,0x08,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2q %xmm2, %xmm0, %xmm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x7e,0xca] -; X64-NEXT: vpaddq %xmm3, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc3] +; X64-NEXT: vpermi2q %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> zeroinitializer - %4 = call <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64> %x1, <2 x i64> %x0, <2 x i64> %x2) - %res2 = add <2 x i64> %3, %4 - ret <2 x i64> %res2 + ret <2 x i64> %3 } declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>) +define <4 x i64>@test_int_x86_avx512_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermi2var_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2q %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x7e,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) + ret <4 x i64> %1 +} + define <4 x i64>@test_int_x86_avx512_mask_vpermi2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermi2var_q_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X86-NEXT: vpermt2q %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x76,0xca] -; X86-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermi2var_q_256: ; 
X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8] -; X64-NEXT: vpermt2q %ymm2, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermi2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x76,0xca] -; X64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1 - %4 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) - %res2 = add <4 x i64> %3, %4 - ret <4 x i64> %res2 + ret <4 x i64> %3 +} + +define <4 x i64>@test_int_x86_avx512_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_vpermt2var_q_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x76,0xc2] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) + ret <4 x i64> %1 } define <4 x i64>@test_int_x86_avx512_mask_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_q_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7e,0xca] -; X86-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpermt2var_q_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x7e,0xca] -; X64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1 - %4 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) - %res2 = add <4 x i64> %3, %4 - ret <4 x i64> %res2 + ret <4 x i64> %3 } define <4 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_256: ; X86: # %bb.0: -; X86-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX 
Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7e,0xca] -; X86-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X86-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x76,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_256: ; X64: # %bb.0: -; X64-NEXT: vmovdqa %ymm1, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9] -; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm3 # encoding: [0x62,0xf2,0xfd,0x28,0x7e,0xda] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpermt2q %ymm2, %ymm0, %ymm1 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x7e,0xca] -; X64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc3] +; X64-NEXT: vpermi2q %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x76,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) %2 = bitcast i8 %x3 to <8 x i1> %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> zeroinitializer - %4 = call <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64> %x1, <4 x i64> %x0, <4 x i64> %x2) - %res2 = add <4 x i64> %3, %4 - ret <4 x i64> %res2 + ret <4 x i64> %3 } declare <2 x double> @llvm.x86.avx512.mask.scalef.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) @@ -2470,41 +2470,60 @@ define void @test_int_x86_avx512_mask_pmovus_qd_mem_128(i8* %ptr, <2 x i64> %x1, ret void } +define <4 x i32>@test_int_x86_avx512_pmov_qd_256(<4 x i64> %x0) { +; CHECK-LABEL: test_int_x86_avx512_pmov_qd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = trunc <4 x i64> %x0 to <4 x i32> + ret <4 x i32> %1 +} + define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmov_qd_256: ; X86: # %bb.0: -; X86-NEXT: vpmovqd %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc2] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1] -; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmov_qd_256: ; X64: # %bb.0: -; X64-NEXT: vpmovqd %ymm0, %xmm2 # encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc2] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX 
Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %1 = trunc <4 x i64> %x0 to <4 x i32> + %2 = bitcast i8 %x2 to <8 x i1> + %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract1, <4 x i32> %1, <4 x i32> %x1 + ret <4 x i32> %3 +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmov_qd_256(<4 x i64> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmov_qd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmov_qd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpmovqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %1 = trunc <4 x i64> %x0 to <4 x i32> - %2 = trunc <4 x i64> %x0 to <4 x i32> - %3 = bitcast i8 %x2 to <8 x i1> - %extract1 = shufflevector <8 x i1> %3, <8 x i1> %3, <4 x i32> - %4 = select <4 x i1> %extract1, <4 x i32> %2, <4 x i32> %x1 - %5 = trunc <4 x i64> %x0 to <4 x i32> - %6 = bitcast i8 %x2 to <8 x i1> - %extract = shufflevector <8 x i1> %6, <8 x i1> %6, <4 x i32> - %7 = select <4 x i1> %extract, <4 x i32> %5, <4 x i32> zeroinitializer - %res3 = add <4 x i32> %1, %4 - %res4 = add <4 x i32> %res3, %7 - ret <4 x i32> %res4 + %2 = bitcast i8 %x2 to <8 x i1> + %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract, <4 x i32> %1, <4 x i32> zeroinitializer + ret <4 x i32> %3 } declare void @llvm.x86.avx512.mask.pmov.qd.mem.256(i8* %ptr, <4 x i64>, i8) @@ -2534,35 +2553,54 @@ define void @test_int_x86_avx512_mask_pmov_qd_mem_256(i8* %ptr, <4 x i64> %x1, i declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovs_qd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x25,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovs_qd_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovsqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x25,0xc1] -; X86-NEXT: vpmovsqd %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x25,0xc2] -; X86-NEXT: vpaddd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] -; X86-NEXT: vpmovsqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x25,0xc0] -; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] +; X86-NEXT: 
vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovs_qd_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpmovsqd %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x25,0xc2] ; X64-NEXT: vpmovsqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x25,0xc1] -; X64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] -; X64-NEXT: vpmovsqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x25,0xc0] -; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res3 = add <4 x i32> %res0, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmovs_qd_256(<4 x i64> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovs_qd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovsqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x25,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovs_qd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovsqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x25,0xc0] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res } declare void @llvm.x86.avx512.mask.pmovs.qd.mem.256(i8* %ptr, <4 x i64>, i8) @@ -2592,35 +2630,54 @@ define void @test_int_x86_avx512_mask_pmovs_qd_mem_256(i8* %ptr, <4 x i64> %x1, declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_pmovus_qd_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovusqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x15,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_pmovus_qd_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpmovusqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x15,0xc1] -; X86-NEXT: vpmovusqd %ymm0, %xmm2 {%k1} {z} # encoding: 
[0x62,0xf2,0x7e,0xa9,0x15,0xc2] -; X86-NEXT: vpaddd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] -; X86-NEXT: vpmovusqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x15,0xc0] -; X86-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_pmovus_qd_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; X64-NEXT: vpmovusqd %ymm0, %xmm2 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x15,0xc2] ; X64-NEXT: vpmovusqd %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0x7e,0x29,0x15,0xc1] -; X64-NEXT: vpaddd %xmm2, %xmm1, %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] -; X64-NEXT: vpmovusqd %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x15,0xc0] -; X64-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] - %res0 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 -1) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) - %res2 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) - %res3 = add <4 x i32> %res0, %res1 - %res4 = add <4 x i32> %res3, %res2 - ret <4 x i32> %res4 + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> %x1, i8 %x2) + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_maskz_pmovus_qd_256(<4 x i64> %x0, i8 %x2) { +; X86-LABEL: test_int_x86_avx512_maskz_pmovus_qd_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpmovusqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x15,0xc0] +; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_pmovus_qd_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpmovusqd %ymm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7e,0xa9,0x15,0xc0] +; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; X64-NEXT: retq # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %x0, <4 x i32> zeroinitializer, i8 %x2) + ret <4 x i32> %res } declare void @llvm.x86.avx512.mask.pmovus.qd.mem.256(i8* %ptr, <4 x i64>, i8) @@ -3345,54 +3402,74 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128(<2 x double> %x0, <4 x i ret <4 x i32> %res2 } +define <4 x i32>@test_int_x86_avx512_cvt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2dq_128_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %res3 +} + define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: 
test_int_x86_avx512_mask_cvt_pd2dq_128_zext: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0xe6,0xc8] -; X86-NEXT: vcvtpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2dq_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0xe6,0xc8] -; X64-NEXT: vcvtpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> - %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) - %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> - %res4 = add <4 x i32> %res1, %res3 - ret <4 x i32> %res4 + ret <4 x i32> %res1 } declare <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double>, <4 x float>, i8) +define <4 x float>@test_int_x86_avx512_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2ps: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1) + ret <4 x float> %res +} + define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2ps: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8] -; X86-NEXT: vcvtpd2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2ps: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8] -; X64-NEXT: vcvtpd2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) - %res1 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1) - %res2 = fadd <4 x 
float> %res, %res1 - ret <4 x float> %res2 + ret <4 x float> %res +} + +define <4 x float>@test_int_x86_avx512_cvt_pd2ps_zext(<2 x double> %x0, <4 x float> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2ps_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1) + %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> + ret <4 x float> %res3 } define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_zext(<2 x double> %x0, <4 x float> %x1, i8 %x2) { @@ -3401,48 +3478,58 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_pd2ps_zext(<2 x double> %x0, <4 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8] -; X86-NEXT: vcvtpd2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0] -; X86-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2ps_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2ps %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0x5a,0xc8] -; X64-NEXT: vcvtpd2ps %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0] -; X64-NEXT: vaddps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 %x2) %res1 = shufflevector <4 x float> %res, <4 x float> zeroinitializer, <4 x i32> - %res2 = call <4 x float> @llvm.x86.avx512.mask.cvtpd2ps(<2 x double> %x0, <4 x float> %x1, i8 -1) - %res3 = shufflevector <4 x float> %res2, <4 x float> zeroinitializer, <4 x i32> - %res4 = fadd <4 x float> %res1, %res3 - ret <4 x float> %res4 + ret <4 x float> %res1 } declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_128: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] -; X86-NEXT: vcvtpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: 
test_int_x86_avx512_mask_cvt_pd2udq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] -; X64-NEXT: vcvtpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2udq_128_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %res3 } define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) { @@ -3451,35 +3538,39 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_128_zext(<2 x double> %x0, ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] -; X86-NEXT: vcvtpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x79,0xc8] -; X64-NEXT: vcvtpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x79,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> - %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) - %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> - %res4 = add <4 x i32> %res1, %res3 - ret <4 x i32> %res4 + ret <4 x i32> %res1 } declare <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_pd2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtpd2udq %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x79,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> 
%res +} + define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_pd2udq_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtpd2udq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x79,0xc8] -; X86-NEXT: vcvtpd2udq %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x79,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3487,139 +3578,171 @@ define <4 x i32>@test_int_x86_avx512_mask_cvt_pd2udq_256(<4 x double> %x0, <4 x ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtpd2udq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x79,0xc8] -; X64-NEXT: vcvtpd2udq %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x79,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_128: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x5b,0xc8] -; X86-NEXT: vcvtps2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5b,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0x5b,0xc8] -; X64-NEXT: vcvtps2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5b,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x 
i32> @llvm.x86.avx512.mask.cvtps2dq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2dq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2dq %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5b,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2dq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x5b,0xc8] -; X86-NEXT: vcvtps2dq %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5b,0xc0] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2dq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2dq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0x5b,0xc8] -; X64-NEXT: vcvtps2dq %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5b,0xc0] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtps2dq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x08,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_128: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x79,0xc8] -; X86-NEXT: vcvtps2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x08,0x79,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2udq 
%xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x79,0xc8] -; X64-NEXT: vcvtps2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x08,0x79,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvtps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvt_ps2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvtps2udq %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x28,0x79,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_cvt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvtps2udq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x79,0xc8] -; X86-NEXT: vcvtps2udq %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x28,0x79,0xc0] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvt_ps2udq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvtps2udq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x79,0xc8] -; X64-NEXT: vcvtps2udq %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x28,0x79,0xc0] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvtps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_ask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_ask_cvtt_pd2dq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} # encoding: 
[0x62,0xf1,0xfd,0x09,0xe6,0xc8] -; X86-NEXT: vcvttpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8] -; X64-NEXT: vcvttpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_cvtt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2dq_128_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %res3 } define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) { @@ -3628,48 +3751,58 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2dq_128_zext(<2 x double> %x0, ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8] -; X86-NEXT: vcvttpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2dq_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2dq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xe6,0xc8] -; X64-NEXT: vcvttpd2dq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> - %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2dq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) - %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> - %res4 = add <4 x i32> %res1, %res3 - ret <4 x i32> %res4 + ret <4 x i32> %res1 } declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double>, <4 x i32>, i8) +define <4 x 
i32>@test_int_x86_avx512_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8] -; X86-NEXT: vcvttpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8] -; X64-NEXT: vcvttpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res +} + +define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2udq_128_zext: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) + %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> + ret <4 x i32> %res3 } define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128_zext(<2 x double> %x0, <4 x i32> %x1, i8 %x2) { @@ -3678,35 +3811,39 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_128_zext(<2 x double> %x0, ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8] -; X86-NEXT: vcvttpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_128_zext: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x09,0x78,0xc8] -; X64-NEXT: vcvttpd2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x08,0x78,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX 
Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 %x2) %res1 = shufflevector <4 x i32> %res, <4 x i32> zeroinitializer, <4 x i32> - %res2 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.128(<2 x double> %x0, <4 x i32> %x1, i8 -1) - %res3 = shufflevector <4 x i32> %res2, <4 x i32> zeroinitializer, <4 x i32> - %res4 = add <4 x i32> %res1, %res3 - ret <4 x i32> %res4 + ret <4 x i32> %res1 } declare <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_pd2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttpd2udq %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x78,0xc0] +; CHECK-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_pd2udq_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x78,0xc8] -; X86-NEXT: vcvttpd2udq %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x78,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3714,64 +3851,71 @@ define <4 x i32>@test_int_x86_avx512_mask_cvtt_pd2udq_256(<4 x double> %x0, <4 x ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttpd2udq %ymm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfc,0x29,0x78,0xc8] -; X64-NEXT: vcvttpd2udq %ymm0, %xmm0 # encoding: [0x62,0xf1,0xfc,0x28,0x78,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttpd2udq.256(<4 x double> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float>, <4 x i32>, i8) +define <4 x i32>@test_int_x86_avx512_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2udq_128: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x08,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) + ret <4 x i32> %res +} + define <4 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_128: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), 
%eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x78,0xc8] -; X86-NEXT: vcvttps2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x08,0x78,0xc0] -; X86-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X86-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_128: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2udq %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x09,0x78,0xc8] -; X64-NEXT: vcvttps2udq %xmm0, %xmm0 # encoding: [0x62,0xf1,0x7c,0x08,0x78,0xc0] -; X64-NEXT: vpaddd %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; X64-NEXT: vmovaps %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 %x2) - %res1 = call <4 x i32> @llvm.x86.avx512.mask.cvttps2udq.128(<4 x float> %x0, <4 x i32> %x1, i8 -1) - %res2 = add <4 x i32> %res, %res1 - ret <4 x i32> %res2 + ret <4 x i32> %res } declare <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float>, <8 x i32>, i8) +define <8 x i32>@test_int_x86_avx512_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1) { +; CHECK-LABEL: test_int_x86_avx512_cvtt_ps2udq_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttps2udq %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x28,0x78,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) + ret <8 x i32> %res +} + define <8 x i32>@test_int_x86_avx512_mask_cvtt_ps2udq_256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) { ; X86-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_256: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vcvttps2udq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x78,0xc8] -; X86-NEXT: vcvttps2udq %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x28,0x78,0xc0] -; X86-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X86-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_cvtt_ps2udq_256: ; X64: # %bb.0: ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vcvttps2udq %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7c,0x29,0x78,0xc8] -; X64-NEXT: vcvttps2udq %ymm0, %ymm0 # encoding: [0x62,0xf1,0x7c,0x28,0x78,0xc0] -; X64-NEXT: vpaddd %ymm0, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] +; X64-NEXT: vmovaps %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1] ; X64-NEXT: retq # encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 %x2) - %res1 = call <8 x i32> @llvm.x86.avx512.mask.cvttps2udq.256(<8 x float> %x0, <8 x i32> %x1, i8 -1) - %res2 = add <8 x i32> %res, %res1 - ret <8 x i32> %res2 + ret <8 x i32> %res } declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2 x double>, i8) @@ -4652,76 +4796,108 @@ declare <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double>, <2 x double>, i declare <4 x double> 
@llvm.x86.avx512.permvar.df.256(<4 x double>, <4 x i64>) +define <4 x double>@test_int_x86_avx512_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_df_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) + ret <4 x double> %1 +} + define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_df_256: ; X86: # %bb.0: -; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x16,0xd0] -; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] -; X86-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X86-NEXT: vaddpd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3] +; X86-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_df_256: ; X64: # %bb.0: -; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xd8] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x16,0xd0] -; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] -; X64-NEXT: vaddpd %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0] -; X64-NEXT: vaddpd %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3] +; X64-NEXT: vmovapd %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) %2 = bitcast i8 %x3 to <8 x i1> %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract1, <4 x double> %1, <4 x double> %x2 - %4 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) - %5 = bitcast i8 %x3 to <8 x i1> - %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> - %6 = select <4 x i1> %extract, <4 x double> %4, <4 x double> zeroinitializer - %7 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) - %res3 = fadd <4 x double> %3, %6 - %res4 = fadd <4 x double> %res3, %7 - ret <4 x double> %res4 + ret <4 x double> %3 +} + +define <4 x double>@test_int_x86_avx512_maskz_permvar_df_256(<4 x double> %x0, <4 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_df_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_df_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: 
[0x62,0xf2,0xf5,0xa9,0x16,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <4 x double> @llvm.x86.avx512.permvar.df.256(<4 x double> %x0, <4 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract1, <4 x double> %1, <4 x double> zeroinitializer + ret <4 x double> %3 } declare <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64>, <4 x i64>) +define <4 x i64>@test_int_x86_avx512_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) { +; CHECK-LABEL: test_int_x86_avx512_permvar_di_256: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermpd %ymm0, %ymm1, %ymm0 # encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1) + ret <4 x i64> %1 +} + define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_permvar_di_256: ; X86: # %bb.0: -; X86-NEXT: vpermq %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x36,0xd8] ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] ; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] ; X86-NEXT: vpermq %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x36,0xd0] -; X86-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] -; X86-NEXT: vpaddq %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; X86-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_permvar_di_256: ; X64: # %bb.0: -; X64-NEXT: vpermq %ymm0, %ymm1, %ymm3 # encoding: [0x62,0xf2,0xf5,0x28,0x36,0xd8] ; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] ; X64-NEXT: vpermq %ymm0, %ymm1, %ymm2 {%k1} # encoding: [0x62,0xf2,0xf5,0x29,0x36,0xd0] -; X64-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] -; X64-NEXT: vpaddq %ymm3, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3] -; X64-NEXT: vpaddq %ymm0, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1) %2 = bitcast i8 %x3 to <8 x i1> %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> %3 = select <4 x i1> %extract1, <4 x i64> %1, <4 x i64> %x2 - %4 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1) - %5 = bitcast i8 %x3 to <8 x i1> - %extract = shufflevector <8 x i1> %5, <8 x i1> %5, <4 x i32> - %6 = select <4 x i1> %extract, <4 x i64> %4, <4 x i64> zeroinitializer - %7 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1) - %res3 = add <4 x i64> %3, %6 - %res4 = add <4 x i64> %res3, %7 - ret <4 x i64> %res4 + ret <4 x i64> %3 +} + +define <4 x i64>@test_int_x86_avx512_maskz_permvar_di_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x3) { +; X86-LABEL: test_int_x86_avx512_maskz_permvar_di_256: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: 
[0x62,0xf2,0xf5,0xa9,0x36,0xc0] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_maskz_permvar_di_256: +; X64: # %bb.0: +; X64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X64-NEXT: vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <4 x i64> @llvm.x86.avx512.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> + %3 = select <4 x i1> %extract1, <4 x i64> %1, <4 x i64> zeroinitializer + ret <4 x i64> %3 } declare <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double>, <2 x double>, <2 x i64>, i32, i8) diff --git a/llvm/test/CodeGen/X86/block-placement-2.ll b/llvm/test/CodeGen/X86/block-placement-2.ll deleted file mode 100644 index 2006dec266d52..0000000000000 --- a/llvm/test/CodeGen/X86/block-placement-2.ll +++ /dev/null @@ -1,162 +0,0 @@ - -; RUN: llc -mtriple=i686-linux -pre-RA-sched=source < %s | FileCheck %s -; RUN: opt -disable-output -debugify < %s - -; This was derived from the Linux kernel. The __builtin_expect was ignored -; which pushed the hot block "if.else" out of the critical path choosing -; instead the cold block "if.then23". The cold block should be moved towards -; the bottom. - -; CHECK-LABEL: test1: -; CHECK: %for.inc -; CHECK: %if.end18 -; CHECK: %if.else -; CHECK: %if.end.i.i -; CHECK: %if.end8.i.i -; CHECK: %if.then23 -; CHECK: ret - -%struct.hlist_bl_node = type { %struct.hlist_bl_node*, %struct.hlist_bl_node** } -%struct.dentry = type { i32, %struct.inode, %struct.hlist_bl_node, %struct.dentry*, %struct.inode, %struct.inode*, [32 x i8], %struct.inode, %struct.dentry_operations* } -%struct.inode = type { i32 } -%struct.dentry_operations = type { i32 (%struct.dentry*, i32)*, i32 (%struct.dentry*, i32)*, i32 (%struct.dentry*, %struct.inode*)*, i32 (%struct.dentry*, i32, i8*)* } -%struct.anon.2 = type { i32, i32 } - -define %struct.dentry* @test1(%struct.dentry* readonly %parent, i8* %name, i32* nocapture %seqp, i64 %param1) { -entry: - %tobool135 = icmp eq i64 %param1, 0 - br i1 %tobool135, label %cleanup63, label %do.body4.lr.ph - -do.body4.lr.ph: ; preds = %entry - %d_op = getelementptr inbounds %struct.dentry, %struct.dentry* %parent, i64 0, i32 8 - %shr = lshr i64 %param1, 32 - %conv49 = trunc i64 %shr to i32 - br label %do.body4 - -do.body4: ; preds = %for.inc, %do.body4.lr.ph - %node.0.in136 = phi i64 [ %param1, %do.body4.lr.ph ], [ %tmp35, %for.inc ] - %node.0 = inttoptr i64 %node.0.in136 to %struct.hlist_bl_node* - %add.ptr = getelementptr %struct.hlist_bl_node, %struct.hlist_bl_node* %node.0, i64 -1, i32 1 - %tmp6 = bitcast %struct.hlist_bl_node*** %add.ptr to %struct.dentry* - %tmp7 = getelementptr inbounds %struct.dentry, %struct.dentry* %tmp6, i64 0, i32 1, i32 0 - %tmp8 = load volatile i32, i32* %tmp7, align 4 - call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() - %d_parent = getelementptr inbounds %struct.hlist_bl_node**, %struct.hlist_bl_node*** %add.ptr, i64 3 - %tmp9 = bitcast %struct.hlist_bl_node*** %d_parent to %struct.dentry** - %tmp10 = load %struct.dentry*, %struct.dentry** %tmp9, align 8 - %cmp133 = icmp eq %struct.dentry* %tmp10, %parent - br i1 %cmp133, label %if.end14.lr.ph, label %for.inc - -if.end14.lr.ph: ; preds = %do.body4 - %tmp11 = getelementptr inbounds %struct.hlist_bl_node**, %struct.hlist_bl_node*** %add.ptr, i64 2 - %d_name43 = getelementptr inbounds %struct.hlist_bl_node**, 
%struct.hlist_bl_node*** %add.ptr, i64 4 - %hash = bitcast %struct.hlist_bl_node*** %d_name43 to i32* - %tmp12 = bitcast %struct.hlist_bl_node*** %d_name43 to %struct.anon.2* - %len = getelementptr inbounds %struct.anon.2, %struct.anon.2* %tmp12, i64 0, i32 1 - %name31 = getelementptr inbounds %struct.hlist_bl_node**, %struct.hlist_bl_node*** %add.ptr, i64 5 - %tmp13 = bitcast %struct.hlist_bl_node*** %name31 to i8** - br label %if.end14 - -if.end14: ; preds = %cleanup, %if.end14.lr.ph - %and.i100134.in = phi i32 [ %tmp8, %if.end14.lr.ph ], [ undef, %cleanup ] - %and.i100134 = and i32 %and.i100134.in, -2 - %tmp14 = load %struct.hlist_bl_node**, %struct.hlist_bl_node*** %tmp11, align 8 - %tobool.i.i = icmp eq %struct.hlist_bl_node** %tmp14, null - br i1 %tobool.i.i, label %for.inc, label %if.end18 - -if.end18: ; preds = %if.end14 - %tmp15 = load i32, i32* %seqp, align 8 - %tmp16 = and i32 %tmp15, 2 - %tobool22 = icmp eq i32 %tmp16, 0 - br i1 %tobool22, label %if.else, label %if.then23, !prof !0, !misexpect !1 - -if.then23: ; preds = %if.end18 - %tmp17 = load i32, i32* %hash, align 8 - %cmp25 = icmp eq i32 %tmp17, 42 - br i1 %cmp25, label %if.end28, label %for.inc - -if.end28: ; preds = %if.then23 - %tmp18 = load i32, i32* %len, align 4 - %tmp19 = load i8*, i8** %tmp13, align 8 - call void asm sideeffect "", "~{memory},~{dirflag},~{fpsr},~{flags}"() - %tmp20 = load i32, i32* %tmp7, align 4 - %cmp.i.i101 = icmp eq i32 %tmp20, %and.i100134 - br i1 %cmp.i.i101, label %if.end36, label %cleanup - -if.end36: ; preds = %if.end28 - %tmp21 = load %struct.dentry_operations*, %struct.dentry_operations** %d_op, align 8 - %d_compare = getelementptr inbounds %struct.dentry_operations, %struct.dentry_operations* %tmp21, i64 0, i32 3 - %tmp22 = load i32 (%struct.dentry*, i32, i8*)*, i32 (%struct.dentry*, i32, i8*)** %d_compare, align 8 - %call37 = call i32 %tmp22(%struct.dentry* %tmp6, i32 %tmp18, i8* %name) - %cmp38 = icmp eq i32 %call37, 0 - br i1 %cmp38, label %cleanup56, label %for.inc - -cleanup: ; preds = %if.end28 - %tmp24 = load %struct.dentry*, %struct.dentry** %tmp9, align 8 - %cmp = icmp eq %struct.dentry* null, %parent - br i1 %cmp, label %if.end14, label %for.inc - -if.else: ; preds = %if.end18 - %hash_len44 = bitcast %struct.hlist_bl_node*** %d_name43 to i64* - %tmp25 = load i64, i64* %hash_len44, align 8 - %cmp45 = icmp eq i64 %tmp25, %param1 - br i1 %cmp45, label %if.end48, label %for.inc - -if.end48: ; preds = %if.else - %tmp26 = bitcast %struct.hlist_bl_node*** %name31 to i64* - %tmp27 = load volatile i64, i64* %tmp26, align 8 - %tmp28 = inttoptr i64 %tmp27 to i8* - br label %for.cond.i.i - -for.cond.i.i: ; preds = %if.end8.i.i, %if.end48 - %tcount.addr.0.i.i = phi i32 [ %conv49, %if.end48 ], [ %sub.i.i, %if.end8.i.i ] - %ct.addr.0.i.i = phi i8* [ %name, %if.end48 ], [ %add.ptr9.i.i, %if.end8.i.i ] - %cs.addr.0.i.i = phi i8* [ %tmp28, %if.end48 ], [ %add.ptr.i.i, %if.end8.i.i ] - %tmp29 = bitcast i8* %cs.addr.0.i.i to i64* - %tmp30 = load i64, i64* %tmp29, align 8 - %tmp31 = bitcast i8* %ct.addr.0.i.i to i64* - %tmp32 = call { i64, i64 } asm "1:\09mov $2,$0\0A2:\0A.section .fixup,\22ax\22\0A3:\09lea $2,$1\0A\09and $3,$1\0A\09mov ($1),$0\0A\09leal $2,%ecx\0A\09andl $4,%ecx\0A\09shll $$3,%ecx\0A\09shr %cl,$0\0A\09jmp 2b\0A.previous\0A .pushsection \22__ex_table\22,\22a\22\0A .balign 4\0A .long (1b) - .\0A .long (3b) - .\0A .long (ex_handler_default) - .\0A .popsection\0A", "=&r,=&{cx},*m,i,i,~{dirflag},~{fpsr},~{flags}"(i64* %tmp31, i64 -8, i64 7) - %cmp.i.i = icmp ult i32 
%tcount.addr.0.i.i, 8 - %asmresult.i.le.i.le.i.le = extractvalue { i64, i64 } %tmp32, 0 - br i1 %cmp.i.i, label %dentry_cmp.exit, label %if.end.i.i - -if.end.i.i: ; preds = %for.cond.i.i - %cmp3.i.i = icmp eq i64 %tmp30, %asmresult.i.le.i.le.i.le - br i1 %cmp3.i.i, label %if.end8.i.i, label %for.inc, !prof !0, !misexpect !1 - -if.end8.i.i: ; preds = %if.end.i.i - %add.ptr.i.i = getelementptr i8, i8* %cs.addr.0.i.i, i64 8 - %add.ptr9.i.i = getelementptr i8, i8* %ct.addr.0.i.i, i64 8 - %sub.i.i = add i32 %tcount.addr.0.i.i, -8 - %tobool12.i.i = icmp eq i32 %sub.i.i, 0 - br i1 %tobool12.i.i, label %cleanup56, label %for.cond.i.i - -dentry_cmp.exit: ; preds = %for.cond.i.i - %asmresult.i.le.i.le.i.le.le = extractvalue { i64, i64 } %tmp32, 0 - %mul.i.i = shl nuw nsw i32 %tcount.addr.0.i.i, 3 - %sh_prom.i.i = zext i32 %mul.i.i to i64 - %shl.i.i = shl nsw i64 -1, %sh_prom.i.i - %neg.i.i = xor i64 %shl.i.i, -1 - %xor.i.i = xor i64 %asmresult.i.le.i.le.i.le.le, %tmp30 - %and.i.i = and i64 %xor.i.i, %neg.i.i - %tobool15.i.i = icmp eq i64 %and.i.i, 0 - br i1 %tobool15.i.i, label %cleanup56, label %for.inc - -cleanup56: ; preds = %dentry_cmp.exit, %if.end8.i.i, %if.end36 - %tmp33 = bitcast %struct.hlist_bl_node*** %add.ptr to %struct.dentry* - store i32 %and.i100134, i32* %seqp, align 4 - br label %cleanup63 - -for.inc: ; preds = %dentry_cmp.exit, %if.end.i.i, %if.else, %cleanup, %if.end36, %if.then23, %if.end14, %do.body4 - %tmp34 = inttoptr i64 %node.0.in136 to i64* - %tmp35 = load volatile i64, i64* %tmp34, align 8 - %tobool = icmp eq i64 %tmp35, 0 - br i1 %tobool, label %cleanup63, label %do.body4 - -cleanup63: ; preds = %for.inc, %cleanup56, %entry - %retval.2 = phi %struct.dentry* [ %tmp33, %cleanup56 ], [ null, %entry ], [ null, %for.inc ] - ret %struct.dentry* %retval.2 -} - -!0 = !{!"branch_weights", i32 2000, i32 1} -!1 = !{!"misexpect", i64 1, i64 2000, i64 1} diff --git a/llvm/test/CodeGen/X86/block-placement.ll b/llvm/test/CodeGen/X86/block-placement.ll index 6c93bcbdff2a9..258cc2031ae8b 100644 --- a/llvm/test/CodeGen/X86/block-placement.ll +++ b/llvm/test/CodeGen/X86/block-placement.ll @@ -1502,9 +1502,9 @@ define i32 @not_rotate_if_extra_branch(i32 %count) { ; CHECK: %.header ; CHECK: %.middle ; CHECK: %.backedge +; CHECK: %.slow ; CHECK: %.bailout ; CHECK: %.stop -; CHECK: %.slow .entry: %sum.0 = shl nsw i32 %count, 1 br label %.header diff --git a/llvm/test/CodeGen/X86/call-site-info-output.ll b/llvm/test/CodeGen/X86/call-site-info-output.ll index a0438f0c2b985..0686f184b5262 100644 --- a/llvm/test/CodeGen/X86/call-site-info-output.ll +++ b/llvm/test/CodeGen/X86/call-site-info-output.ll @@ -1,6 +1,6 @@ ; Test call site info MIR printer and parser.Parser assertions and machine ; verifier will check the rest; -; RUN: llc -emit-call-site-info -debug-entry-values %s -stop-before=finalize-isel -o %t.mir +; RUN: llc -emit-call-site-info %s -stop-before=finalize-isel -o %t.mir ; RUN: cat %t.mir | FileCheck %s ; CHECK: name: fn2 ; CHECK: callSites: @@ -10,7 +10,7 @@ ; CHECK-NEXT: arg: 0, reg: '$edi' ; CHECK-NEXT: arg: 1, reg: '$esi' ; CHECK-NEXT: arg: 2, reg: '$edx' -; RUN: llc -emit-call-site-info -debug-entry-values %t.mir -run-pass=finalize-isel -o -| FileCheck %s --check-prefix=PARSER +; RUN: llc -emit-call-site-info %t.mir -run-pass=finalize-isel -o -| FileCheck %s --check-prefix=PARSER ; Verify that we are able to parse output mir and that we are getting the same result. 
; PARSER: name: fn2 ; PARSER: callSites: diff --git a/llvm/test/CodeGen/X86/cfi-epilogue-with-return.mir b/llvm/test/CodeGen/X86/cfi-epilogue-with-return.mir deleted file mode 100644 index 583e54b097faf..0000000000000 --- a/llvm/test/CodeGen/X86/cfi-epilogue-with-return.mir +++ /dev/null @@ -1,48 +0,0 @@ -# RUN: llc -o - %s -mtriple=x86_64-- -run-pass=prologepilog 2>&1 | FileCheck %s ---- | - define i64 @_Z3foob(i1 zeroext %cond) #0 { - ret i64 0 - } - attributes #0 = {"frame-pointer"="all"} -... ---- -# If the epilogue bb.1 is a return block, no .cfi_restore is -# needed in it. -# CHECK: bb.1: -# CHECK-NOT: CFI_INSTRUCTION restore -# CHECK: RET 0 -# CHECK: bb.2: -# CHECK: RET 0 -name: _Z3foob -alignment: 16 -tracksRegLiveness: true -liveins: - - { reg: '$edi' } -frameInfo: - maxAlignment: 1 - hasCalls: true - savePoint: '%bb.1' - restorePoint: '%bb.1' -machineFunctionInfo: {} -body: | - bb.0: - liveins: $edi - - TEST8rr renamable $dil, renamable $dil, implicit-def $eflags, implicit killed $edi - JCC_1 %bb.2, 4, implicit killed $eflags - JMP_1 %bb.1 - - bb.1: - renamable $rbx = IMPLICIT_DEF - renamable $r14 = IMPLICIT_DEF - renamable $r15 = IMPLICIT_DEF - renamable $r12 = IMPLICIT_DEF - renamable $r13 = IMPLICIT_DEF - dead $eax = MOV32r0 implicit-def dead $eflags, implicit-def $rax - RET 0, killed $rax - - bb.2: - dead $eax = MOV32r0 implicit-def dead $eflags, implicit-def $rax - RET 0, killed $rax - -... diff --git a/llvm/test/CodeGen/X86/cfi-epilogue-without-return.mir b/llvm/test/CodeGen/X86/cfi-epilogue-without-return.mir deleted file mode 100644 index 8f04721489608..0000000000000 --- a/llvm/test/CodeGen/X86/cfi-epilogue-without-return.mir +++ /dev/null @@ -1,53 +0,0 @@ -# RUN: llc -o - %s -mtriple=x86_64-- -run-pass=prologepilog 2>&1 | FileCheck %s ---- | - declare dso_local void @_Z3goov() - define i64 @_Z3foob(i1 zeroext %cond) #0 { - ret i64 0 - } - attributes #0 = {"frame-pointer"="all"} -... ---- -# If the epilogue bb.1.if.then is not a return block, .cfi_restore is -# needed in it, otherwise bb.2.return will see different outgoing CFI -# information from its predecessors. -# CHECK: bb.1: -# CHECK: CFI_INSTRUCTION restore $rbx -# CHECK-NEXT: CFI_INSTRUCTION restore $r12 -# CHECK-NEXT: CFI_INSTRUCTION restore $r13 -# CHECK-NEXT: CFI_INSTRUCTION restore $r14 -# CHECK-NEXT: CFI_INSTRUCTION restore $r15 -# CHECK-NEXT: CFI_INSTRUCTION restore $rbp -# CHECK-NOT: RET 0 -# CHECK: bb.2: -# CHECK: RET 0 -name: _Z3foob -alignment: 16 -tracksRegLiveness: true -liveins: - - { reg: '$edi' } -frameInfo: - maxAlignment: 1 - hasCalls: true - savePoint: '%bb.1' - restorePoint: '%bb.1' -machineFunctionInfo: {} -body: | - bb.0: - liveins: $edi - - TEST8rr renamable $dil, renamable $dil, implicit-def $eflags, implicit killed $edi - JCC_1 %bb.2, 4, implicit killed $eflags - JMP_1 %bb.1 - - bb.1: - renamable $rbx = IMPLICIT_DEF - renamable $r14 = IMPLICIT_DEF - renamable $r15 = IMPLICIT_DEF - renamable $r12 = IMPLICIT_DEF - renamable $r13 = IMPLICIT_DEF - - bb.2: - dead $eax = MOV32r0 implicit-def dead $eflags, implicit-def $rax - RET 0, killed $rax - -... 
diff --git a/llvm/test/CodeGen/X86/cfi-inserter-callee-save-register.mir b/llvm/test/CodeGen/X86/cfi-inserter-callee-save-register.mir deleted file mode 100644 index b17c9a67abb18..0000000000000 --- a/llvm/test/CodeGen/X86/cfi-inserter-callee-save-register.mir +++ /dev/null @@ -1,34 +0,0 @@ -# RUN: llc -o - %s -mtriple=x86_64-- -verify-cfiinstrs \ -# RUN: -run-pass=cfi-instr-inserter 2>&1 | FileCheck %s -# Test that CFI inserter inserts .cfi_restore properly for -# callee saved registers. ---- | - define void @foo() { - ret void - } -... ---- -# CHECK: bb.3: -# CHECK: CFI_INSTRUCTION restore $rbx -# CHECK-NEXT: CFI_INSTRUCTION restore $rbp -name: foo -body: | - bb.0: - TEST8rr renamable $dil, renamable $dil, implicit-def $eflags, implicit killed $edi - JCC_1 %bb.2, 5, implicit killed $eflags - - bb.1: - JMP_1 %bb.3 - - bb.2: - CFI_INSTRUCTION def_cfa_offset 16 - CFI_INSTRUCTION offset $rbp, -16 - CFI_INSTRUCTION def_cfa_register $rbp - CFI_INSTRUCTION offset $rbx, -24 - CFI_INSTRUCTION def_cfa $rsp, 8 - RET 0, $rax - - bb.3: - RET 0, $rax - -... diff --git a/llvm/test/CodeGen/X86/cfi-inserter-verify-inconsistent-csr.mir b/llvm/test/CodeGen/X86/cfi-inserter-verify-inconsistent-csr.mir deleted file mode 100644 index 63957ae5229fa..0000000000000 --- a/llvm/test/CodeGen/X86/cfi-inserter-verify-inconsistent-csr.mir +++ /dev/null @@ -1,28 +0,0 @@ -# RUN: not --crash llc -o - %s -mtriple=x86_64-- -verify-cfiinstrs \ -# RUN: -run-pass=cfi-instr-inserter 2>&1 | FileCheck %s -# Test that CFI verifier finds inconsistent csr saved set between bb.end and -# one of its precedessors. ---- | - define void @inconsistentCSR() { - entry: - br label %then - then: - br label %end - end: - ret void - } -... ---- -# CHECK: *** Inconsistent CSR Saved between pred and succ in function inconsistentCSR *** -# CHECK: LLVM ERROR: Found 1 in/out CFI information errors. -name: inconsistentCSR -body: | - bb.0.entry: - JCC_1 %bb.2, 5, implicit undef $eflags - - bb.1.then: - CFI_INSTRUCTION offset $rbp, -16 - - bb.2.end: - RET 0 -... 
diff --git a/llvm/test/CodeGen/X86/constructor.ll b/llvm/test/CodeGen/X86/constructor.ll index d4518f19b7e60..534c452d65483 100644 --- a/llvm/test/CodeGen/X86/constructor.ll +++ b/llvm/test/CodeGen/X86/constructor.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple x86_64 < %s | FileCheck --check-prefix=INIT-ARRAY %s ; RUN: llc -mtriple x86_64-pc-linux -use-ctors < %s | FileCheck --check-prefix=CTOR %s ; RUN: llc -mtriple x86_64-unknown-freebsd -use-ctors < %s | FileCheck --check-prefix=CTOR %s ; RUN: llc -mtriple x86_64-pc-solaris2.11 -use-ctors < %s | FileCheck --check-prefix=CTOR %s diff --git a/llvm/test/CodeGen/X86/fast-isel-freeze.ll b/llvm/test/CodeGen/X86/fast-isel-freeze.ll new file mode 100644 index 0000000000000..fee53ca93f542 --- /dev/null +++ b/llvm/test/CodeGen/X86/fast-isel-freeze.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux | FileCheck %s --check-prefix=SDAG +; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-unknown-linux | FileCheck %s --check-prefix=FAST + +define i32 @freeze(i32 %t) { +; SDAG-LABEL: freeze: +; SDAG: # %bb.0: +; SDAG-NEXT: movl $10, %eax +; SDAG-NEXT: xorl %edi, %eax +; SDAG-NEXT: retq +; +; FAST-LABEL: freeze: +; FAST: # %bb.0: +; FAST-NEXT: movl $10, %eax +; FAST-NEXT: xorl %edi, %eax +; FAST-NEXT: retq + %1 = freeze i32 %t + %2 = freeze i32 10 + %3 = xor i32 %1, %2 + ret i32 %3 +} diff --git a/llvm/test/CodeGen/X86/fast-isel.ll b/llvm/test/CodeGen/X86/fast-isel.ll index dbc13ba7ed780..e9a8a6b539500 100644 --- a/llvm/test/CodeGen/X86/fast-isel.ll +++ b/llvm/test/CodeGen/X86/fast-isel.ll @@ -99,6 +99,11 @@ define void @load_store_i1(i1* %p, i1* %q) nounwind { ret void } +define void @freeze_i32(i32 %x) { + %t = freeze i32 %x + ret void +} + @crash_test1x = external global <2 x i32>, align 8 define void @crash_test1() nounwind ssp { diff --git a/llvm/test/CodeGen/X86/fdiv.ll b/llvm/test/CodeGen/X86/fdiv.ll index 259cd91cca528..c361ab0f5aed7 100644 --- a/llvm/test/CodeGen/X86/fdiv.ll +++ b/llvm/test/CodeGen/X86/fdiv.ll @@ -76,5 +76,29 @@ define <4 x float> @double_negative_vector(<4 x float> %x, <4 x float> %y) #0 { ret <4 x float> %div } +; This test used to fail, depending on how llc was built (e.g. using +; clang/gcc), due to order of argument evaluation not being well defined. We +; ended up hitting llvm_unreachable in getNegatedExpression when building with +; gcc. Just make sure that we get a deterministic result. +define float @fdiv_fneg_combine(float %a0, float %a1, float %a2) #0 { +; CHECK-LABEL: fdiv_fneg_combine: +; CHECK: # %bb.0: +; CHECK-NEXT: movaps %xmm0, %xmm3 +; CHECK-NEXT: subss %xmm1, %xmm3 +; CHECK-NEXT: subss %xmm0, %xmm1 +; CHECK-NEXT: mulss %xmm2, %xmm1 +; CHECK-NEXT: subss %xmm2, %xmm3 +; CHECK-NEXT: divss %xmm3, %xmm1 +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %sub1 = fsub fast float %a0, %a1 + %mul2 = fmul fast float %sub1, %a2 + %neg = fneg fast float %a0 + %add3 = fadd fast float %a1, %neg + %sub4 = fadd fast float %add3, %a2 + %div5 = fdiv fast float %mul2, %sub4 + ret float %div5 +} + attributes #0 = { "unsafe-fp-math"="false" } diff --git a/llvm/test/CodeGen/X86/freeze-legalize.ll b/llvm/test/CodeGen/X86/freeze-legalize.ll new file mode 100644 index 0000000000000..6bbd0b8e59493 --- /dev/null +++ b/llvm/test/CodeGen/X86/freeze-legalize.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; Make sure that seldag legalization works correctly for freeze instruction. 
+; RUN: llc -mtriple=i386-apple-darwin < %s 2>&1 | FileCheck %s + +define i64 @expand(i32 %x) { +; CHECK-LABEL: expand: +; CHECK: ## %bb.0: +; CHECK-NEXT: movl $303174162, %eax ## imm = 0x12121212 +; CHECK-NEXT: movl $875836468, %ecx ## imm = 0x34343434 +; CHECK-NEXT: movl $1448498774, %edx ## imm = 0x56565656 +; CHECK-NEXT: xorl %eax, %edx +; CHECK-NEXT: movl $2021161080, %eax ## imm = 0x78787878 +; CHECK-NEXT: xorl %ecx, %eax +; CHECK-NEXT: retl + %y1 = freeze i64 1302123111658042420 ; 0x1212121234343434 + %y2 = freeze i64 6221254864647256184 ; 0x5656565678787878 + %t2 = xor i64 %y1, %y2 + ret i64 %t2 +} + + +define <2 x i64> @expand_vec(i32 %x) nounwind { +; CHECK-LABEL: expand_vec: +; CHECK: ## %bb.0: +; CHECK-NEXT: pushl %ebx +; CHECK-NEXT: pushl %edi +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl $16843009, %ecx ## imm = 0x1010101 +; CHECK-NEXT: movl $589505315, %edx ## imm = 0x23232323 +; CHECK-NEXT: movl $303174162, %esi ## imm = 0x12121212 +; CHECK-NEXT: movl $875836468, %edi ## imm = 0x34343434 +; CHECK-NEXT: movl $1162167621, %ebx ## imm = 0x45454545 +; CHECK-NEXT: xorl %ecx, %ebx +; CHECK-NEXT: movl $1734829927, %ecx ## imm = 0x67676767 +; CHECK-NEXT: xorl %edx, %ecx +; CHECK-NEXT: movl $1448498774, %edx ## imm = 0x56565656 +; CHECK-NEXT: xorl %esi, %edx +; CHECK-NEXT: movl $2021161080, %esi ## imm = 0x78787878 +; CHECK-NEXT: xorl %edi, %esi +; CHECK-NEXT: movl %ebx, 12(%eax) +; CHECK-NEXT: movl %ecx, 8(%eax) +; CHECK-NEXT: movl %edx, 4(%eax) +; CHECK-NEXT: movl %esi, (%eax) +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi +; CHECK-NEXT: popl %ebx +; CHECK-NEXT: retl $4 + ; <0x1212121234343434, 0x101010123232323> + %y1 = freeze <2 x i64> + ; <0x5656565678787878, 0x4545454567676767> + %y2 = freeze <2 x i64> + %t2 = xor <2 x i64> %y1, %y2 + ret <2 x i64> %t2 +} + +define i10 @promote() { +; CHECK-LABEL: promote: +; CHECK: ## %bb.0: +; CHECK-NEXT: movw $682, %cx ## imm = 0x2AA +; CHECK-NEXT: movw $992, %ax ## imm = 0x3E0 +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retl + %a = freeze i10 682 + %b = freeze i10 992 + %res = add i10 %a, %b + ret i10 %res +} + +define <2 x i10> @promote_vec() { +; CHECK-LABEL: promote_vec: +; CHECK: ## %bb.0: +; CHECK-NEXT: movw $125, %ax +; CHECK-NEXT: movw $682, %cx ## imm = 0x2AA +; CHECK-NEXT: movw $393, %dx ## imm = 0x189 +; CHECK-NEXT: addl %eax, %edx +; CHECK-NEXT: movw $992, %ax ## imm = 0x3E0 +; CHECK-NEXT: addl %ecx, %eax +; CHECK-NEXT: ## kill: def $ax killed $ax killed $eax +; CHECK-NEXT: ## kill: def $dx killed $dx killed $edx +; CHECK-NEXT: retl + %a = freeze <2 x i10> + %b = freeze <2 x i10> + %res = add <2 x i10> %a, %b + ret <2 x i10> %res +} diff --git a/llvm/test/CodeGen/X86/freeze.ll b/llvm/test/CodeGen/X86/freeze.ll new file mode 100644 index 0000000000000..07f9faabf68c0 --- /dev/null +++ b/llvm/test/CodeGen/X86/freeze.ll @@ -0,0 +1,110 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-unknown-linux-gnu < %s 2>&1 | FileCheck %s --check-prefix=X86ASM + +%struct.T = type { i32, i32 } + +define i32 @freeze_int() { +; X86ASM-LABEL: freeze_int: +; X86ASM: # %bb.0: +; X86ASM-NEXT: imull %eax, %eax +; X86ASM-NEXT: retq + %y1 = freeze i32 undef + %t1 = mul i32 %y1, %y1 + ret i32 %t1 +} + +define i5 @freeze_int2() { +; X86ASM-LABEL: freeze_int2: +; X86ASM: # %bb.0: +; X86ASM-NEXT: mulb %al +; X86ASM-NEXT: retq + %y1 = freeze i5 undef + %t1 = mul i5 %y1, %y1 + ret i5 %t1 +} + +define 
float @freeze_float() { +; X86ASM-LABEL: freeze_float: +; X86ASM: # %bb.0: +; X86ASM-NEXT: addss %xmm0, %xmm0 +; X86ASM-NEXT: retq + %y1 = freeze float undef + %t1 = fadd float %y1, %y1 + ret float %t1 +} + +define half @freeze_half() { +; X86ASM-LABEL: freeze_half: +; X86ASM: # %bb.0: +; X86ASM-NEXT: pushq %rax +; X86ASM-NEXT: .cfi_def_cfa_offset 16 +; X86ASM-NEXT: xorl %edi, %edi +; X86ASM-NEXT: callq __gnu_h2f_ieee +; X86ASM-NEXT: callq __gnu_f2h_ieee +; X86ASM-NEXT: movzwl %ax, %edi +; X86ASM-NEXT: callq __gnu_h2f_ieee +; X86ASM-NEXT: addss %xmm0, %xmm0 +; X86ASM-NEXT: callq __gnu_f2h_ieee +; X86ASM-NEXT: popq %rcx +; X86ASM-NEXT: .cfi_def_cfa_offset 8 +; X86ASM-NEXT: retq + %y1 = freeze half undef + %t1 = fadd half %y1, %y1 + ret half %t1 +} + +define <2 x i32> @freeze_ivec() { +; X86ASM-LABEL: freeze_ivec: +; X86ASM: # %bb.0: +; X86ASM-NEXT: paddd %xmm0, %xmm0 +; X86ASM-NEXT: retq + %y1 = freeze <2 x i32> undef + %t1 = add <2 x i32> %y1, %y1 + ret <2 x i32> %t1 +} + +define i8* @freeze_ptr() { +; X86ASM-LABEL: freeze_ptr: +; X86ASM: # %bb.0: +; X86ASM-NEXT: addq $4, %rax +; X86ASM-NEXT: retq + %y1 = freeze i8* undef + %t1 = getelementptr i8, i8* %y1, i64 4 + ret i8* %t1 +} + +define i32 @freeze_struct() { +; X86ASM-LABEL: freeze_struct: +; X86ASM: # %bb.0: +; X86ASM-NEXT: addl %eax, %eax +; X86ASM-NEXT: retq + %y1 = freeze %struct.T undef + %v1 = extractvalue %struct.T %y1, 0 + %v2 = extractvalue %struct.T %y1, 1 + %t1 = add i32 %v1, %v2 + ret i32 %t1 +} + +define i32 @freeze_anonstruct() { +; X86ASM-LABEL: freeze_anonstruct: +; X86ASM: # %bb.0: +; X86ASM-NEXT: addl %eax, %eax +; X86ASM-NEXT: retq + %y1 = freeze {i32, i32} undef + %v1 = extractvalue {i32, i32} %y1, 0 + %v2 = extractvalue {i32, i32} %y1, 1 + %t1 = add i32 %v1, %v2 + ret i32 %t1 +} + +define i64 @freeze_array() { +; X86ASM-LABEL: freeze_array: +; X86ASM: # %bb.0: +; X86ASM-NEXT: addq %rax, %rax +; X86ASM-NEXT: retq + %y1 = freeze [2 x i64] undef + %v1 = extractvalue [2 x i64] %y1, 0 + %v2 = extractvalue [2 x i64] %y1, 1 + %t1 = add i64 %v1, %v2 + ret i64 %t1 +} diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll index 517880fb88e57..f78fe2c00eb3a 100644 --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -918,3 +918,67 @@ define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) nounw ret <4 x i32> %f } +%struct.S = type { [11 x i8], i8 } +define void @PR45265(i32 %0, %struct.S* nocapture readonly %1) nounwind { +; X32-SSE2-LABEL: PR45265: +; X32-SSE2: # %bb.0: +; X32-SSE2-NEXT: pushl %edi +; X32-SSE2-NEXT: pushl %esi +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X32-SSE2-NEXT: leal (%eax,%eax,2), %edx +; X32-SSE2-NEXT: movzwl 8(%ecx,%edx,4), %esi +; X32-SSE2-NEXT: movsbl 10(%ecx,%edx,4), %edi +; X32-SSE2-NEXT: shll $16, %edi +; X32-SSE2-NEXT: orl %edi, %esi +; X32-SSE2-NEXT: movl 4(%ecx,%edx,4), %ecx +; X32-SSE2-NEXT: shrdl $8, %esi, %ecx +; X32-SSE2-NEXT: xorl %eax, %ecx +; X32-SSE2-NEXT: sarl $31, %eax +; X32-SSE2-NEXT: sarl $31, %edi +; X32-SSE2-NEXT: shldl $24, %esi, %edi +; X32-SSE2-NEXT: xorl %eax, %edi +; X32-SSE2-NEXT: orl %edi, %ecx +; X32-SSE2-NEXT: jne .LBB44_1 +; X32-SSE2-NEXT: # %bb.2: +; X32-SSE2-NEXT: popl %esi +; X32-SSE2-NEXT: popl %edi +; X32-SSE2-NEXT: jmp _Z3foov # TAILCALL +; X32-SSE2-NEXT: .LBB44_1: +; X32-SSE2-NEXT: popl %esi +; X32-SSE2-NEXT: popl %edi +; X32-SSE2-NEXT: retl +; +; X64-AVX2-LABEL: PR45265: +; X64-AVX2: # %bb.0: +; X64-AVX2-NEXT: movslq 
%edi, %rax +; X64-AVX2-NEXT: leaq (%rax,%rax,2), %rcx +; X64-AVX2-NEXT: movsbq 10(%rsi,%rcx,4), %rdx +; X64-AVX2-NEXT: shlq $16, %rdx +; X64-AVX2-NEXT: movzwl 8(%rsi,%rcx,4), %edi +; X64-AVX2-NEXT: orq %rdx, %rdi +; X64-AVX2-NEXT: movq (%rsi,%rcx,4), %rcx +; X64-AVX2-NEXT: shrdq $40, %rdi, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: jne .LBB44_1 +; X64-AVX2-NEXT: # %bb.2: +; X64-AVX2-NEXT: jmp _Z3foov # TAILCALL +; X64-AVX2-NEXT: .LBB44_1: +; X64-AVX2-NEXT: retq + %3 = sext i32 %0 to i64 + %4 = getelementptr inbounds %struct.S, %struct.S* %1, i64 %3 + %5 = bitcast %struct.S* %4 to i88* + %6 = load i88, i88* %5, align 1 + %7 = ashr i88 %6, 40 + %8 = trunc i88 %7 to i64 + %9 = icmp eq i64 %8, %3 + br i1 %9, label %10, label %11 + +10: + tail call void @_Z3foov() + br label %11 + +11: + ret void +} +declare dso_local void @_Z3foov() diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll index 261cce7f59512..83e7eb4c27b72 100644 --- a/llvm/test/CodeGen/X86/haddsub-undef.ll +++ b/llvm/test/CodeGen/X86/haddsub-undef.ll @@ -802,3 +802,31 @@ define <8 x float> @PR40243(<8 x float> %a, <8 x float> %b) { ret <8 x float> %r } +define <4 x double> @PR44694(<4 x double> %0, <4 x double> %1) { +; SSE-SLOW-LABEL: PR44694: +; SSE-SLOW: # %bb.0: +; SSE-SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0] +; SSE-SLOW-NEXT: haddpd %xmm3, %xmm2 +; SSE-SLOW-NEXT: addpd %xmm1, %xmm0 +; SSE-SLOW-NEXT: movapd %xmm2, %xmm1 +; SSE-SLOW-NEXT: retq +; +; SSE-FAST-LABEL: PR44694: +; SSE-FAST: # %bb.0: +; SSE-FAST-NEXT: movapd %xmm1, %xmm0 +; SSE-FAST-NEXT: haddpd %xmm3, %xmm2 +; SSE-FAST-NEXT: haddpd %xmm1, %xmm0 +; SSE-FAST-NEXT: movapd %xmm2, %xmm1 +; SSE-FAST-NEXT: retq +; +; AVX-LABEL: PR44694: +; AVX: # %bb.0: +; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 +; AVX-NEXT: vhaddpd %ymm0, %ymm1, %ymm0 +; AVX-NEXT: retq + %3 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %4 = shufflevector <4 x double> %0, <4 x double> %1, <4 x i32> + %5 = fadd <4 x double> %3, %4 + ret <4 x double> %5 +} diff --git a/llvm/test/CodeGen/X86/init-priority.ll b/llvm/test/CodeGen/X86/init-priority.ll index 30e94841f7939..47f548cf02389 100644 --- a/llvm/test/CodeGen/X86/init-priority.ll +++ b/llvm/test/CodeGen/X86/init-priority.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-netbsd | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-netbsd -use-ctors | FileCheck %s ; Check that our compiler never emits global constructors ; inside the .init_array section when building for a non supported target. 
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index 5c8c40ed4bad5..2feb0382d8c9e 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -163,11 +163,9 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6] ; AVX1-NEXT: vmaskmovps %ymm0, %ymm2, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -178,11 +176,9 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6] ; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -457,11 +453,9 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, <8 x i16>* %p, <8 x i32> %mask ; AVX2-LABEL: truncstore_v8i64_v8i16: ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm4[0,2],ymm0[4,6],ymm4[4,6] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index b79fb5c35f3d5..bde5ea6e9bde5 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -300,26 +300,26 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2147483647,2147483647] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm9 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm4, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744071562067968,18446744071562067968] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm10 -; AVX1-NEXT: vblendvpd %xmm2, %xmm7, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7 
-; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm6 +; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm2 +; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm4, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm7 ; AVX1-NEXT: vblendvpd %xmm9, %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm5, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3 +; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vblendvpd %xmm10, %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6] ; AVX1-NEXT: vmaskmovps %ymm0, %ymm8, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -331,20 +331,18 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm3, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm4 ; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm4 +; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6] ; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index ab22ba3b8f927..84958fef6a325 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -240,22 +240,22 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm4 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm9 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 -; AVX1-NEXT: vpxor %xmm3, %xmm6, %xmm7 -; AVX1-NEXT: vpcmpgtq %xmm7, %xmm5, %xmm7 -; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm2 +; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm6 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 +; AVX1-NEXT: vpxor %xmm3, %xmm7, %xmm2 ; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 ; AVX1-NEXT: vpxor %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm3 ; AVX1-NEXT: vmovapd 
{{.*#+}} xmm5 = [4294967295,4294967295] ; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm5, %xmm3 -; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm5, %xmm2 +; AVX1-NEXT: vblendvpd %xmm2, %xmm7, %xmm5, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm5, %xmm1 ; AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX1-NEXT: vmaskmovps %ymm0, %ymm8, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -268,18 +268,16 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, <8 x i32>* %p, <8 x i32> %mask ; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [4294967295,4294967295,4294967295,4294967295] ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm5 +; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm5 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] ; AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm3, %ymm0 -; AVX2-NEXT: vpxor %ymm4, %ymm1, %ymm4 +; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm3, %ymm1 +; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4 ; AVX2-NEXT: vpcmpgtq %ymm4, %ymm6, %ymm4 -; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1 -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2] +; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6] ; AVX2-NEXT: vpmaskmovd %ymm0, %ymm2, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/move_latch_to_loop_top.ll b/llvm/test/CodeGen/X86/move_latch_to_loop_top.ll index 0141593ce0f1f..d86ec9c8129d7 100644 --- a/llvm/test/CodeGen/X86/move_latch_to_loop_top.ll +++ b/llvm/test/CodeGen/X86/move_latch_to_loop_top.ll @@ -173,8 +173,8 @@ exit: ;CHECK: %header ;CHECK: %true ;CHECK: %latch -;CHECK: %exit ;CHECK: %false +;CHECK: %exit define i32 @test4(i32 %t, i32* %p) { entry: br label %header diff --git a/llvm/test/CodeGen/X86/noreturn-call-win64.ll b/llvm/test/CodeGen/X86/noreturn-call-win64.ll index 6289eef6bb48f..ee9b587978af7 100644 --- a/llvm/test/CodeGen/X86/noreturn-call-win64.ll +++ b/llvm/test/CodeGen/X86/noreturn-call-win64.ll @@ -1,5 +1,8 @@ ; RUN: llc < %s -mtriple=x86_64-windows-msvc | FileCheck %s +%struct.MakeCleanup = type { i8 } +%eh.ThrowInfo = type { i32, i32, i32, i32 } + ; Function Attrs: noinline nounwind optnone uwtable define dso_local i32 @foo() { entry: @@ -51,3 +54,60 @@ declare dso_local i32 @cond() declare dso_local void @abort1() noreturn declare dso_local void @abort2() noreturn declare dso_local void @abort3() noreturn + +define dso_local void @throw_exception() uwtable personality i32 (...)* @__CxxFrameHandler3 { +entry: + %o = alloca %struct.MakeCleanup, align 1 + %call = invoke i32 @cond() + to label %invoke.cont unwind label %ehcleanup + +invoke.cont: ; preds = %entry + %cmp1 = icmp eq i32 0, %call + br i1 %cmp1, label 
%if.then, label %if.end + +if.then: ; preds = %invoke.cont + invoke void @_CxxThrowException(i8* null, %eh.ThrowInfo* null) + to label %unreachable unwind label %ehcleanup + +if.end: ; preds = %invoke.cont + %call2 = invoke i32 @cond() + to label %invoke.cont1 unwind label %ehcleanup + +invoke.cont1: ; preds = %if.end + %cmp2 = icmp eq i32 0, %call2 + br i1 %cmp2, label %if.then3, label %if.end4 + +if.then3: ; preds = %invoke.cont1 + invoke void @_CxxThrowException(i8* null, %eh.ThrowInfo* null) + to label %unreachable unwind label %ehcleanup + +if.end4: ; preds = %invoke.cont1 + call void @"??1MakeCleanup@@QEAA@XZ"(%struct.MakeCleanup* nonnull %o) + ret void + +ehcleanup: ; preds = %if.then3, %if.end, %if.then, %entry + %cp = cleanuppad within none [] + call void @"??1MakeCleanup@@QEAA@XZ"(%struct.MakeCleanup* nonnull %o) [ "funclet"(token %cp) ] + cleanupret from %cp unwind to caller + +unreachable: ; preds = %if.then3, %if.then + unreachable +} + +declare dso_local i32 @__CxxFrameHandler3(...) +declare dso_local void @_CxxThrowException(i8*, %eh.ThrowInfo*) +declare dso_local void @"??1MakeCleanup@@QEAA@XZ"(%struct.MakeCleanup*) + +; CHECK-LABEL: throw_exception: +; CHECK: callq cond +; CHECK: je +; CHECK: callq cond +; CHECK: je +; CHECK: retq +; CHECK: callq _CxxThrowException +; CHECK-NOT: {{(addq|subq) .*, %rsp}} +; CHECK: callq _CxxThrowException +; CHECK-NOT: {{(addq|subq) .*, %rsp}} +; CHECK: # %unreachable +; CHECK: int3 +; CHECK: .seh_handlerdata diff --git a/llvm/test/CodeGen/X86/pr40891.ll b/llvm/test/CodeGen/X86/pr40891.ll index 817e5e8fd29aa..d67739767b218 100644 --- a/llvm/test/CodeGen/X86/pr40891.ll +++ b/llvm/test/CodeGen/X86/pr40891.ll @@ -8,11 +8,9 @@ define <8 x i32> @foo(<8 x i64> %x, <4 x i64> %y) { ; CHECK: # %bb.0: ; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0 ; CHECK-NEXT: vandps {{\.LCPI.*}}, %ymm1, %ymm1 -; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 -; CHECK-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; CHECK-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; CHECK-NEXT: retl %a = shufflevector <4 x i64> %y, <4 x i64> , <8 x i32> %b = and <8 x i64> %x, %a diff --git a/llvm/test/CodeGen/X86/pr42870.ll b/llvm/test/CodeGen/X86/pr42870.ll index 575a2653a33f8..c42cb7cb8b286 100644 --- a/llvm/test/CodeGen/X86/pr42870.ll +++ b/llvm/test/CodeGen/X86/pr42870.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=sse | FileCheck %s -define i32 @foo(<4 x float>* %a) { -; CHECK-LABEL: foo: +define i32 @test_load(<4 x float>* %a) { +; CHECK-LABEL: test_load: ; CHECK: ## %bb.0: ## %start ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movaps (%eax), %xmm0 @@ -17,8 +17,8 @@ start: ret i32 %4 } -define i32 @bar(<4 x float> %a) { -; CHECK-LABEL: bar: +define i32 @test_bitcast(<4 x float> %a) { +; CHECK-LABEL: test_bitcast: ; CHECK: ## %bb.0: ## %start ; CHECK-NEXT: movmskps %xmm0, %eax ; CHECK-NEXT: retl @@ -29,3 +29,54 @@ start: %3 = zext i4 %2 to i32 ret i32 %3 } + +define i32 @test_and(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_and: +; CHECK: ## %bb.0: ## %start +; CHECK-NEXT: andps %xmm1, %xmm0 +; CHECK-NEXT: movmskps %xmm0, %eax +; CHECK-NEXT: retl +start: + %0 = bitcast <4 x float> %a to <4 x i32> + %1 = bitcast <4 x float> %b 
to <4 x i32> + %2 = icmp slt <4 x i32> %0, zeroinitializer + %3 = icmp slt <4 x i32> %1, zeroinitializer + %4 = and <4 x i1> %2, %3 + %5 = bitcast <4 x i1> %4 to i4 + %6 = zext i4 %5 to i32 + ret i32 %6 +} + +define i32 @test_or(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_or: +; CHECK: ## %bb.0: ## %start +; CHECK-NEXT: orps %xmm1, %xmm0 +; CHECK-NEXT: movmskps %xmm0, %eax +; CHECK-NEXT: retl +start: + %0 = bitcast <4 x float> %a to <4 x i32> + %1 = bitcast <4 x float> %b to <4 x i32> + %2 = icmp slt <4 x i32> %0, zeroinitializer + %3 = icmp slt <4 x i32> %1, zeroinitializer + %4 = or <4 x i1> %2, %3 + %5 = bitcast <4 x i1> %4 to i4 + %6 = zext i4 %5 to i32 + ret i32 %6 +} + +define i32 @test_xor(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: test_xor: +; CHECK: ## %bb.0: ## %start +; CHECK-NEXT: xorps %xmm1, %xmm0 +; CHECK-NEXT: movmskps %xmm0, %eax +; CHECK-NEXT: retl +start: + %0 = bitcast <4 x float> %a to <4 x i32> + %1 = bitcast <4 x float> %b to <4 x i32> + %2 = icmp slt <4 x i32> %0, zeroinitializer + %3 = icmp slt <4 x i32> %1, zeroinitializer + %4 = xor <4 x i1> %2, %3 + %5 = bitcast <4 x i1> %4 to i4 + %6 = zext i4 %5 to i32 + ret i32 %6 +} diff --git a/llvm/test/CodeGen/X86/ragreedy-bug.ll b/llvm/test/CodeGen/X86/ragreedy-bug.ll index 7a82459db00f6..7a7c98fba4eb0 100644 --- a/llvm/test/CodeGen/X86/ragreedy-bug.ll +++ b/llvm/test/CodeGen/X86/ragreedy-bug.ll @@ -10,8 +10,6 @@ ; Mem-move ; CHECK-NEXT: movl ; CHECK-NEXT: andl -; CHECK-NEXT: LBB0 -; CHECK-NEXT: in Loop ; CHECK-NEXT: testl ; CHECK-NEXT: jne ; CHECK: cond.true.i.i217 @@ -19,20 +17,20 @@ ; Mem-move ; CHECK-NEXT: movl ; CHECK-NEXT: andl -; CHECK-NEXT: LBB0 -; CHECK-NEXT: in Loop ; CHECK-NEXT: testl ; CHECK-NEXT: je ; CHECK: cond.false.i.i ; CHECK: maskrune ; CHECK-NEXT: movzbl ; CHECK-NEXT: movzbl -; CHECK-NEXT: jmp +; CHECK-NEXT: testl +; CHECK-NEXT: je ; CHECK: cond.false.i.i219 ; CHECK: maskrune ; CHECK-NEXT: movzbl ; CHECK-NEXT: movzbl -; CHECK-NEXT: jmp +; CHECK-NEXT: testl +; CHECK-NEXT: jne %struct.List_o_links_struct = type { i32, i32, i32, %struct.List_o_links_struct* } %struct.Connector_struct = type { i16, i16, i8, i8, %struct.Connector_struct*, i8* } diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll index 72b24929cf7de..64845c847dff9 100644 --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -151,135 +151,30 @@ middle.block: define i32 @sad_32i8() nounwind { ; SSE2-LABEL: sad_32i8: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: pxor %xmm12, %xmm12 -; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm13, %xmm13 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm15, %xmm15 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa a+1040(%rax), %xmm8 +; SSE2-NEXT: movdqa a+1040(%rax), %xmm3 +; SSE2-NEXT: psadbw b+1040(%rax), %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm1 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 -; SSE2-NEXT: movdqa 
%xmm3, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm4, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] -; SSE2-NEXT: movdqa b+1024(%rax), %xmm11 -; SSE2-NEXT: movdqa %xmm11, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: psubd %xmm2, %xmm7 -; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] -; SSE2-NEXT: psubd %xmm10, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] -; SSE2-NEXT: movdqa %xmm11, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: psubd %xmm2, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] -; SSE2-NEXT: psubd %xmm11, %xmm3 -; SSE2-NEXT: movdqa %xmm6, %xmm10 -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: psubd %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm8, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] -; SSE2-NEXT: psubd %xmm6, %xmm0 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = 
xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; SSE2-NEXT: movdqa %xmm9, %xmm6 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] -; SSE2-NEXT: psubd %xmm6, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] -; SSE2-NEXT: psubd %xmm9, %xmm8 -; SSE2-NEXT: movdqa %xmm7, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: pxor %xmm6, %xmm7 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm6 -; SSE2-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: psrad $31, %xmm6 -; SSE2-NEXT: paddd %xmm6, %xmm4 -; SSE2-NEXT: pxor %xmm6, %xmm4 -; SSE2-NEXT: movdqa %xmm10, %xmm6 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: psrad $31, %xmm4 -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: pxor %xmm4, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm13 -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm8, %xmm14 +; SSE2-NEXT: psadbw b+1024(%rax), %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm2 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm15, %xmm0 -; SSE2-NEXT: paddd %xmm14, %xmm13 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm13, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,3,0,1] -; SSE2-NEXT: paddd %xmm6, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; 
SSE2-NEXT: movd %xmm1, %eax @@ -416,452 +311,99 @@ middle.block: define i32 @sad_avx64i8() nounwind { ; SSE2-LABEL: sad_avx64i8: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: subq $200, %rsp -; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: pxor %xmm4, %xmm4 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movaps a+1040(%rax), %xmm0 -; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa a+1024(%rax), %xmm12 -; SSE2-NEXT: movdqa a+1056(%rax), %xmm15 -; SSE2-NEXT: movdqa a+1072(%rax), %xmm4 -; SSE2-NEXT: movdqa %xmm4, %xmm6 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; SSE2-NEXT: movdqa %xmm15, %xmm11 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm11, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = 
xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm15, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] -; SSE2-NEXT: movdqa %xmm12, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: movdqa %xmm0, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] -; SSE2-NEXT: movdqa b+1072(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: psubd %xmm0, %xmm1 -; SSE2-NEXT: movdqa b+1056(%rax), %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSE2-NEXT: psubd %xmm7, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] -; SSE2-NEXT: psubd %xmm7, %xmm5 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE2-NEXT: psubd %xmm3, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] -; SSE2-NEXT: psubd %xmm7, %xmm8 -; SSE2-NEXT: movdqa b+1024(%rax), %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE2-NEXT: psubd %xmm3, %xmm11 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm0, 
%xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE2-NEXT: psubd %xmm3, %xmm2 -; SSE2-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: psubd %xmm0, %xmm15 -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE2-NEXT: psubd %xmm3, %xmm9 -; SSE2-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; SSE2-NEXT: psubd %xmm0, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: psubd %xmm0, %xmm13 -; SSE2-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] -; SSE2-NEXT: psubd %xmm7, %xmm12 -; SSE2-NEXT: movdqa b+1040(%rax), %xmm13 -; SSE2-NEXT: movdqa %xmm13, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSE2-NEXT: psubd %xmm7, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] -; SSE2-NEXT: psubd %xmm3, %xmm9 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm2, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] -; SSE2-NEXT: movdqa %xmm13, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] -; SSE2-NEXT: psubd %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] 
-; SSE2-NEXT: psubd %xmm13, %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm13 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm1 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm6, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm5 -; SSE2-NEXT: pxor %xmm1, %xmm5 -; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: movdqa a+1056(%rax), %xmm5 +; SSE2-NEXT: psadbw b+1056(%rax), %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm2 +; SSE2-NEXT: movdqa a+1040(%rax), %xmm5 +; SSE2-NEXT: psadbw b+1040(%rax), %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm3 +; SSE2-NEXT: movdqa a+1024(%rax), %xmm5 +; SSE2-NEXT: psadbw b+1024(%rax), %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm0 +; SSE2-NEXT: movdqa a+1072(%rax), %xmm5 +; SSE2-NEXT: psadbw b+1072(%rax), %xmm5 ; SSE2-NEXT: paddd %xmm5, %xmm1 -; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm8 -; SSE2-NEXT: pxor %xmm1, %xmm8 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm8, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm11 -; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm15, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm15 -; SSE2-NEXT: pxor %xmm1, %xmm15 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm15, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm10, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm10 -; SSE2-NEXT: pxor %xmm1, %xmm10 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd 
%xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm2 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm12 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm12, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: pxor %xmm0, %xmm9 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm7, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm13, %xmm1 -; SSE2-NEXT: movdqa %xmm13, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # %bb.2: # %middle.block -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd (%rsp), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm4, %xmm2 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: 
paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: paddd %xmm5, %xmm2 +; SSE2-NEXT: paddd %xmm5, %xmm2 +; SSE2-NEXT: paddd %xmm5, %xmm1 ; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: paddd %xmm2, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: addq $200, %rsp ; SSE2-NEXT: retq ; ; AVX1-LABEL: sad_avx64i8: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: subq $24, %rsp -; AVX1-NEXT: vpxor %xmm14, %xmm14, %xmm14 -; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX1-NEXT: vpxor %xmm15, %xmm15, %xmm15 -; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7 -; AVX1-NEXT: vpxor %xmm13, %xmm13, %xmm13 ; AVX1-NEXT: vpxor %xmm8, %xmm8, %xmm8 -; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9 -; AVX1-NEXT: vpxor %xmm10, %xmm10, %xmm10 -; AVX1-NEXT: vpxor %xmm12, %xmm12, %xmm12 +; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB2_1: # %vector.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vmovdqa %ymm7, %ymm11 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm7, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm0, %xmm4, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm4 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm3 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm0 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm6, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm7, %xmm6, %xmm6 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm1, %xmm7, %xmm1 -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpmovzxbd {{.*#+}} xmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero -; AVX1-NEXT: vpsubd %xmm2, %xmm7, %xmm2 -; AVX1-NEXT: vpabsd %xmm2, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm11, %xmm7 -; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 -; AVX1-NEXT: vpabsd %xmm1, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm11, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm7 -; AVX1-NEXT: vpabsd %xmm6, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpabsd %xmm5, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm15, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm15 -; 
AVX1-NEXT: vpabsd %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpabsd %xmm3, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm14, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm14 -; AVX1-NEXT: vpabsd %xmm4, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: vpaddd %xmm0, %xmm13, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm13 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm1 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm8 -; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm9 -; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: vpaddd %xmm1, %xmm10, %xmm1 -; AVX1-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm10 -; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpabsd (%rsp), %xmm1 # 16-byte Folded Reload -; AVX1-NEXT: vpaddd %xmm1, %xmm12, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm12 +; AVX1-NEXT: vmovdqa a+1072(%rax), %xmm3 +; AVX1-NEXT: vpsadbw b+1072(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa a+1056(%rax), %xmm4 +; AVX1-NEXT: vpsadbw b+1056(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX1-NEXT: vmovdqa a+1040(%rax), %xmm3 +; AVX1-NEXT: vpsadbw b+1040(%rax), %xmm3, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vmovdqa a+1024(%rax), %xmm4 +; AVX1-NEXT: vpsadbw b+1024(%rax), %xmm4, %xmm4 +; AVX1-NEXT: vpaddd %xmm0, %xmm4, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: addq $4, %rax ; AVX1-NEXT: jne .LBB2_1 ; AVX1-NEXT: # %bb.2: # %middle.block -; AVX1-NEXT: vextractf128 $1, %ymm15, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm9, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm13, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm12, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vextractf128 $1, %ymm8, %xmm4 +; AVX1-NEXT: vpaddd %xmm4, %xmm4, %xmm5 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm7 +; AVX1-NEXT: vpaddd %xmm8, %xmm8, %xmm1 +; AVX1-NEXT: vpaddd %xmm1, %xmm8, %xmm1 +; AVX1-NEXT: vpaddd %xmm7, %xmm8, %xmm7 +; AVX1-NEXT: vpaddd %xmm7, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpaddd %xmm5, %xmm4, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm14, %xmm1 -; AVX1-NEXT: 
vextractf128 $1, %ymm8, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm3 -; AVX1-NEXT: vextractf128 $1, %ymm10, %xmm4 -; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddd %xmm12, %xmm13, %xmm1 -; AVX1-NEXT: vpaddd %xmm7, %xmm10, %xmm2 -; AVX1-NEXT: vpaddd %xmm2, %xmm8, %xmm2 -; AVX1-NEXT: vpaddd %xmm1, %xmm9, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm15, %xmm1 -; AVX1-NEXT: vpaddd %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddd %xmm0, %xmm14, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: addq $24, %rsp ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -869,68 +411,25 @@ define i32 @sad_avx64i8() nounwind { ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-NEXT: vpxor %xmm6, %xmm6, %xmm6 -; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 -; AVX2-NEXT: vpxor %xmm7, %xmm7, %xmm7 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB2_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd 
%ymm15, %ymm8, %ymm8 -; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload -; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15 -; AVX2-NEXT: vpabsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vpabsd %ymm9, %ymm8 -; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 -; AVX2-NEXT: vpabsd %ymm10, %ymm8 -; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 -; AVX2-NEXT: vpabsd %ymm11, %ymm8 -; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3 -; AVX2-NEXT: vpabsd %ymm12, %ymm8 -; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpabsd %ymm13, %ymm8 -; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 -; AVX2-NEXT: vpabsd %ymm14, %ymm8 -; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vpabsd %ymm15, %ymm8 -; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 +; AVX2-NEXT: vmovdqa a+1056(%rax), %ymm3 +; AVX2-NEXT: vpsadbw b+1056(%rax), %ymm3, %ymm3 +; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vmovdqa a+1024(%rax), %ymm3 +; AVX2-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3 +; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: addq $4, %rax ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # %bb.2: # %middle.block -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpaddd %ymm0, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm0, %ymm0, %ymm3 +; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpaddd %xmm1, 
%xmm0, %xmm0 @@ -947,37 +446,21 @@ define i32 @sad_avx64i8() nounwind { ; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX512F-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX512F-NEXT: .p2align 4, 0x90 ; AVX512F-NEXT: .LBB2_1: # %vector.body ; AVX512F-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm8, %zmm5, %zmm5 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm8, %zmm6, %zmm6 -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm8, %zmm7, %zmm7 -; AVX512F-NEXT: vpabsd %zmm4, %zmm4 -; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0 -; AVX512F-NEXT: vpabsd %zmm5, %zmm4 -; AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1 -; AVX512F-NEXT: vpabsd %zmm6, %zmm4 -; AVX512F-NEXT: vpaddd %zmm2, %zmm4, %zmm2 -; AVX512F-NEXT: vpabsd %zmm7, %zmm4 -; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3 +; AVX512F-NEXT: vmovdqa a+1056(%rax), %ymm2 +; AVX512F-NEXT: vpsadbw b+1056(%rax), %ymm2, %ymm2 +; AVX512F-NEXT: vmovdqa a+1024(%rax), %ymm3 +; AVX512F-NEXT: vpsadbw b+1024(%rax), %ymm3, %ymm3 +; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 +; AVX512F-NEXT: vpaddd %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: addq $4, %rax ; AVX512F-NEXT: jne .LBB2_1 ; AVX512F-NEXT: # %bb.2: # %middle.block -; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 -; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 -; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm1 +; AVX512F-NEXT: vpaddd %zmm0, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -1135,6 +618,81 @@ middle.block: ret i32 %12 } +define i32 @sad_4i8() nounwind { +; SSE2-LABEL: sad_4i8: +; SSE2: # %bb.0: # %entry +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 +; SSE2-NEXT: .p2align 4, 0x90 +; SSE2-NEXT: .LBB4_1: # %vector.body +; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 +; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: psadbw %xmm1, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm0 +; SSE2-NEXT: addq $4, %rax +; SSE2-NEXT: jne .LBB4_1 +; SSE2-NEXT: # %bb.2: # %middle.block +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: retq +; +; AVX-LABEL: sad_4i8: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX-NEXT: movq $-1024, %rax # imm = 0xFC00 +; AVX-NEXT: .p2align 4, 0x90 +; AVX-NEXT: .LBB4_1: # %vector.body +; AVX-NEXT: # =>This Inner Loop Header: Depth=1 +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: addq $4, %rax +; AVX-NEXT: jne .LBB4_1 +; AVX-NEXT: # %bb.2: # %middle.block +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: retq +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ] + %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @a, i64 0, i64 %index + %1 = bitcast i8* %0 to <4 x i8>* + %wide.load = load <4 x i8>, <4 x i8>* %1, align 4 + %2 = zext <4 x i8> %wide.load to <4 x i32> + %3 = getelementptr inbounds [1024 x i8], 
[1024 x i8]* @b, i64 0, i64 %index + %4 = bitcast i8* %3 to <4 x i8>* + %wide.load1 = load <4 x i8>, <4 x i8>* %4, align 4 + %5 = zext <4 x i8> %wide.load1 to <4 x i32> + %6 = sub nsw <4 x i32> %2, %5 + %7 = icmp sgt <4 x i32> %6, + %8 = sub nsw <4 x i32> zeroinitializer, %6 + %9 = select <4 x i1> %7, <4 x i32> %6, <4 x i32> %8 + %10 = add nsw <4 x i32> %9, %vec.phi + %index.next = add i64 %index, 4 + %11 = icmp eq i64 %index.next, 1024 + br i1 %11, label %middle.block, label %vector.body + +middle.block: + %h2 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> + %sum2 = add <4 x i32> %10, %h2 + %h3 = shufflevector <4 x i32> %sum2, <4 x i32> undef, <4 x i32> + %sum3 = add <4 x i32> %sum2, %h3 + %sum = extractelement <4 x i32> %sum3, i32 0 + ret i32 %sum +} + + define i32 @sad_nonloop_4i8(<4 x i8>* nocapture readonly %p, i64, <4 x i8>* nocapture readonly %q) local_unnamed_addr #0 { ; SSE2-LABEL: sad_nonloop_4i8: ; SSE2: # %bb.0: @@ -1243,99 +801,16 @@ define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* n define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* nocapture readonly %q) local_unnamed_addr #0 { ; SSE2-LABEL: sad_nonloop_32i8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu 16(%rdi), %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: movdqa %xmm12, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm0, %xmm9 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm9, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] -; SSE2-NEXT: movdqa %xmm12, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] -; SSE2-NEXT: movdqa %xmm0, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: movdqu (%rdx), %xmm7 -; SSE2-NEXT: movdqu 16(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm6 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm6, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE2-NEXT: psubd %xmm5, %xmm10 -; 
SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE2-NEXT: psubd %xmm5, %xmm11 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE2-NEXT: psubd %xmm5, %xmm13 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] -; SSE2-NEXT: movdqa %xmm7, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; SSE2-NEXT: psubd %xmm5, %xmm4 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] -; SSE2-NEXT: psubd %xmm6, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: psubd %xmm2, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; SSE2-NEXT: psubd %xmm3, %xmm12 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] -; SSE2-NEXT: psubd %xmm7, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm10 -; SSE2-NEXT: pxor %xmm1, %xmm10 -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm11 -; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa %xmm13, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm13 -; SSE2-NEXT: pxor %xmm1, %xmm13 -; SSE2-NEXT: movdqa %xmm4, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm4 -; SSE2-NEXT: pxor %xmm1, %xmm4 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm10, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm8 -; SSE2-NEXT: pxor %xmm1, %xmm8 -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm1, %xmm9 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm12 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm9, %xmm0 +; SSE2-NEXT: movdqu (%rdx), %xmm0 +; SSE2-NEXT: movdqu 16(%rdx), %xmm1 +; SSE2-NEXT: movdqu (%rdi), %xmm2 +; SSE2-NEXT: psadbw %xmm0, %xmm2 +; SSE2-NEXT: movdqu 16(%rdi), %xmm0 +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: paddq %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: paddq %xmm0, %xmm1 +; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: retq ; ; AVX1-LABEL: sad_nonloop_32i8: @@ -1395,6 +870,115 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, 
i64, <32 x i8>* n ret i32 %sum } +define i32 @sad_nonloop_64i8(<64 x i8>* nocapture readonly %p, i64, <64 x i8>* nocapture readonly %q) local_unnamed_addr #0 { +; SSE2-LABEL: sad_nonloop_64i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movdqu (%rdx), %xmm0 +; SSE2-NEXT: movdqu 16(%rdx), %xmm1 +; SSE2-NEXT: movdqu 32(%rdx), %xmm2 +; SSE2-NEXT: movdqu 48(%rdx), %xmm3 +; SSE2-NEXT: movdqu (%rdi), %xmm4 +; SSE2-NEXT: psadbw %xmm0, %xmm4 +; SSE2-NEXT: movdqu 16(%rdi), %xmm0 +; SSE2-NEXT: psadbw %xmm1, %xmm0 +; SSE2-NEXT: movdqu 32(%rdi), %xmm1 +; SSE2-NEXT: psadbw %xmm2, %xmm1 +; SSE2-NEXT: movdqu 48(%rdi), %xmm2 +; SSE2-NEXT: psadbw %xmm3, %xmm2 +; SSE2-NEXT: paddq %xmm0, %xmm2 +; SSE2-NEXT: paddq %xmm1, %xmm2 +; SSE2-NEXT: paddq %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] +; SSE2-NEXT: paddq %xmm2, %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: retq +; +; AVX1-LABEL: sad_nonloop_64i8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovdqu (%rdi), %xmm0 +; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1 +; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2 +; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3 +; AVX1-NEXT: vpsadbw 48(%rdx), %xmm3, %xmm3 +; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1 +; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw 32(%rdx), %xmm2, %xmm2 +; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vmovd %xmm0, %eax +; AVX1-NEXT: retq +; +; AVX2-LABEL: sad_nonloop_64i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqu (%rdi), %ymm0 +; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX2-NEXT: vpsadbw 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512F-LABEL: sad_nonloop_64i8: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqu (%rdi), %ymm0 +; AVX512F-NEXT: vmovdqu 32(%rdi), %ymm1 +; AVX512F-NEXT: vpsadbw 32(%rdx), %ymm1, %ymm1 +; AVX512F-NEXT: vpsadbw (%rdx), %ymm0, %ymm0 +; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512F-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vmovd %xmm0, %eax +; AVX512F-NEXT: vzeroupper +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: sad_nonloop_64i8: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0 +; AVX512BW-NEXT: vpsadbw (%rdx), %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 +; AVX512BW-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] +; AVX512BW-NEXT: vpaddq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vmovd %xmm0, %eax +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq + %v1 = load <64 x i8>, <64 x i8>* %p, align 1 + %z1 = zext <64 x i8> %v1 to <64 x i32> + %v2 = load <64 x i8>, <64 x i8>* %q, align 1 + %z2 = zext <64 x i8> %v2 to <64 x i32> + %sub = sub nsw <64 x i32> %z1, %z2 + %isneg = icmp sgt <64 x i32> %sub, + %neg = sub nsw <64 x i32> zeroinitializer, %sub + %abs = select <64 x i1> %isneg, <64 x i32> %sub, <64 x i32> %neg + %h64 = 
shufflevector <64 x i32> %abs, <64 x i32> undef, <64 x i32> + %sum64 = add <64 x i32> %abs, %h64 + %h32 = shufflevector <64 x i32> %sum64, <64 x i32> undef, <64 x i32> + %sum32 = add <64 x i32> %sum64, %h32 + %h0 = shufflevector <64 x i32> %sum32, <64 x i32> undef, <64 x i32> + %sum0 = add <64 x i32> %sum32, %h0 + %h1 = shufflevector <64 x i32> %sum0, <64 x i32> undef, <64 x i32> + %sum1 = add <64 x i32> %sum0, %h1 + %h2 = shufflevector <64 x i32> %sum1, <64 x i32> undef, <64 x i32> + %sum2 = add <64 x i32> %sum1, %h2 + %h3 = shufflevector <64 x i32> %sum2, <64 x i32> undef, <64 x i32> + %sum3 = add <64 x i32> %sum2, %h3 + %sum = extractelement <64 x i32> %sum3, i32 0 + ret i32 %sum +} + ; This contains an unrolled sad loop with a non-zero initial value. ; DAGCombiner reassociation previously rewrote the adds to move the constant vector further down the tree. This resulted in the vector-reduction flag being lost. define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %arg2, <16 x i8>* %arg3) { diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index 5f9dbdf888a34..85400656e2e54 100644 --- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -2791,7 +2791,7 @@ define void @test_mm_storeh_pi(x86_mmx *%a0, <4 x float> %a1) nounwind { ; ; X64-SSE2-LABEL: test_mm_storeh_pi: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: shufps $78, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x4e] +; X64-SSE2-NEXT: pshufd $78, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x4e] ; X64-SSE2-NEXT: # xmm0 = xmm0[2,3,0,1] ; X64-SSE2-NEXT: movq %xmm0, %rax # encoding: [0x66,0x48,0x0f,0x7e,0xc0] ; X64-SSE2-NEXT: movq %rax, (%rdi) # encoding: [0x48,0x89,0x07] diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll index d1594eddf4444..4c04219b3883c 100644 --- a/llvm/test/CodeGen/X86/var-permute-256.ll +++ b/llvm/test/CodeGen/X86/var-permute-256.ll @@ -558,29 +558,27 @@ define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices) ; XOP-LABEL: var_shuffle_v4i64_from_v2i64: ; XOP: # %bb.0: ; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3 +; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm2 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0 +; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm0, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX1-LABEL: var_shuffle_v4i64_from_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4 -; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 +; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), 
%xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm2 ; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -590,8 +588,7 @@ define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices) ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2] ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 @@ -984,29 +981,27 @@ define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %in ; XOP-LABEL: var_shuffle_v4f64_from_v2f64: ; XOP: # %bb.0: ; XOP-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; XOP-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] ; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm3 +; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm2 ; XOP-NEXT: vextractf128 $1, %ymm1, %xmm1 ; XOP-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm2, %ymm0, %ymm0 +; XOP-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; XOP-NEXT: vpermil2pd $0, %ymm1, %ymm0, %ymm0, %ymm0 ; XOP-NEXT: retq ; ; AVX1-LABEL: var_shuffle_v4f64_from_v2f64: ; AVX1: # %bb.0: ; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3,2,3] -; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vpaddq %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm4 -; AVX1-NEXT: vpermilpd %ymm4, %ymm2, %ymm2 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX1-NEXT: vpermilpd %ymm4, %ymm0, %ymm0 -; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm3 +; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm0 +; AVX1-NEXT: vpcmpgtq {{.*}}(%rip), %xmm2, %xmm2 ; AVX1-NEXT: vpcmpgtq {{\.LCPI.*}}+{{.*}}(%rip), %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 +; AVX1-NEXT: vpermilpd %ymm3, %ymm0, %ymm2 ; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -1016,8 +1011,7 @@ define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %in ; AVX2-NEXT: vpaddq %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2,2,2,2] ; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[2,3,2,3] -; AVX2-NEXT: vpermilpd %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm3 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-NEXT: vpermilpd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vblendvpd %ymm2, %ymm3, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll index a18e47cd99845..3457450a3ee02 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-and-bool.ll @@ -566,11 +566,9 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) { ; ; AVX2-LABEL: trunc_v8i64_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-NEXT: 
vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-mul.ll b/llvm/test/CodeGen/X86/vector-reduce-mul.ll index 39b189c157ce0..9bdd722f64021 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-mul.ll @@ -2011,9 +2011,9 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX2-NEXT: vpmullw %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX2-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX2-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm3, %xmm0, %xmm1 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -2099,9 +2099,9 @@ define i8 @test_v32i8(<32 x i8> %a0) { ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm3, %xmm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpand %xmm3, %xmm2, %xmm2 +; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero ; AVX512DQ-NEXT: vpmullw %xmm2, %xmm1, %xmm1 -; AVX512DQ-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512DQ-NEXT: vpmullw %xmm1, %xmm0, %xmm0 ; AVX512DQ-NEXT: vpand %xmm3, %xmm0, %xmm1 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll index a9d4f50502a7e..5a0deab79ae98 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll @@ -560,11 +560,9 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) { ; ; AVX2-LABEL: trunc_v8i64_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll index fb2b99543e36e..8014f1f415162 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll +++ 
b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll @@ -611,11 +611,9 @@ define i1 @trunc_v8i64_v8i1(<8 x i64>) { ; ; AVX2-LABEL: trunc_v8i64_v8i1: ; AVX2: # %bb.0: -; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpsllw $15, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index 1d7711cc53efd..343fd75044e76 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -4749,6 +4749,127 @@ define <32 x i8> @shuffle_v32i8_02_03_04_05_06_07_00_01_10_11_12_13_14_15_08_09_ ret <32 x i8> %shuffle } +define <32 x i8> @shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_16_18_20_22_24_26_28_30_48_50_52_54_56_58_60_62(<32 x i8> %a0, <32 x i8> %a1) { +; AVX1-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_16_18_20_22_24_26_28_30_48_50_52_54_56_58_60_62: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm4 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_16_18_20_22_24_26_28_30_48_50_52_54_56_58_60_62: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: retq +; +; AVX512VLBW-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_16_18_20_22_24_26_28_30_48_50_52_54_56_58_60_62: +; AVX512VLBW: # %bb.0: +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512VLBW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512VLBW-NEXT: retq +; +; AVX512VLVBMI-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_16_18_20_22_24_26_28_30_48_50_52_54_56_58_60_62: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62] +; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_16_18_20_22_24_26_28_30_48_50_52_54_56_58_60_62: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vextractf128 $1, 
%ymm1, %xmm2 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; XOPAVX1-NEXT: vpperm %xmm4, %xmm2, %xmm3, %xmm2 +; XOPAVX1-NEXT: vpperm %xmm4, %xmm1, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_32_34_36_38_40_42_44_46_16_18_20_22_24_26_28_30_48_50_52_54_56_58_60_62: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; XOPAVX2-NEXT: retq + %1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62> + ret <32 x i8> %1 +} + +define <32 x i8> @shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62(<32 x i8> %a0, <32 x i8> %a1) { +; AVX1-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: +; AVX1: # %bb.0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512VLBW-SLOW-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: +; AVX512VLBW-SLOW: # %bb.0: +; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512VLBW-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512VLBW-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX512VLBW-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VLBW-SLOW-NEXT: retq +; +; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: +; AVX512VLBW-FAST: # %bb.0: +; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; AVX512VLBW-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; AVX512VLBW-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,5,7] +; AVX512VLBW-FAST-NEXT: vpermi2q %ymm1, %ymm2, %ymm0 +; AVX512VLBW-FAST-NEXT: retq +; +; AVX512VLVBMI-LABEL: 
shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: +; AVX512VLVBMI: # %bb.0: +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62] +; AVX512VLVBMI-NEXT: vpermt2b %ymm1, %ymm2, %ymm0 +; AVX512VLVBMI-NEXT: retq +; +; XOPAVX1-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: +; XOPAVX1: # %bb.0: +; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30] +; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; XOPAVX1-NEXT: vpperm %xmm3, %xmm2, %xmm0, %xmm0 +; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; XOPAVX1-NEXT: retq +; +; XOPAVX2-LABEL: shuffle_v32i8_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: +; XOPAVX2: # %bb.0: +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30] +; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u] +; XOPAVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; XOPAVX2-NEXT: retq + %1 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62> + ret <32 x i8> %1 +} + define <32 x i8> @shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62(<16 x i16> %a0, <16 x i16> %a1) { ; AVX1-LABEL: shuffle_v32i8_shift_00_02_04_06_08_10_12_14_16_18_20_22_24_26_28_30_32_34_36_38_40_42_44_46_48_50_52_54_56_58_60_62: ; AVX1: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-trunc-math.ll b/llvm/test/CodeGen/X86/vector-trunc-math.ll index 38cd2a3ae968d..24418ece07a51 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-math.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-math.ll @@ -101,13 +101,11 @@ define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; ; AVX2-SLOW-LABEL: trunc_add_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddq %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -253,25 +251,21 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX2-SLOW-LABEL: trunc_add_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddq %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpaddq %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddq %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; 
AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpaddq %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -574,11 +568,9 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX2-SLOW-LABEL: trunc_add_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpaddw {{.*}}(%rip), %xmm0, %xmm0 @@ -696,21 +688,17 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX2-SLOW-LABEL: trunc_add_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -959,13 +947,11 @@ define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; ; AVX2-SLOW-LABEL: trunc_sub_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpsubq %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1111,25 +1097,21 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX2-SLOW-LABEL: trunc_sub_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpsubq %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpsubq %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpsubq %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vpsubq %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -1400,11 +1382,9 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX2-SLOW-LABEL: trunc_sub_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; 
AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 @@ -1522,21 +1502,17 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX2-SLOW-LABEL: trunc_sub_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -1852,19 +1828,15 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; ; AVX2-SLOW-LABEL: trunc_mul_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm4[0,2],ymm0[4,6],ymm4[4,6] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpmullw %xmm2, %xmm0, %xmm0 @@ -2426,11 +2398,9 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX2-SLOW-LABEL: 
trunc_mul_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpmullw {{.*}}(%rip), %xmm0, %xmm0 @@ -2869,13 +2839,11 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; ; AVX2-SLOW-LABEL: trunc_and_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vandps %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -3007,25 +2975,21 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX2-SLOW-LABEL: trunc_and_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vandps %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vandps %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vandps %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vandps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -3269,11 +3233,9 @@ define <8 x i16> 
@trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX2-SLOW-LABEL: trunc_and_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0 @@ -3391,21 +3353,17 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX2-SLOW-LABEL: trunc_and_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -3646,13 +3604,11 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; ; AVX2-SLOW-LABEL: trunc_xor_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vxorps %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vxorps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -3784,25 +3740,21 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin ; ; AVX2-SLOW-LABEL: trunc_xor_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vxorps 
%ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vxorps %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vxorps %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vxorps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -4046,11 +3998,9 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX2-SLOW-LABEL: trunc_xor_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0 @@ -4168,21 +4118,17 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX2-SLOW-LABEL: trunc_xor_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = 
xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -4423,13 +4369,11 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { ; ; AVX2-SLOW-LABEL: trunc_or_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vorps %ymm3, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vorps %ymm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -4561,25 +4505,21 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind ; ; AVX2-SLOW-LABEL: trunc_or_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vorps %ymm5, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vorps %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vorps %ymm7, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vorps %ymm6, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 @@ -4823,11 +4763,9 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind { ; ; AVX2-SLOW-LABEL: trunc_or_const_v8i64_v8i16: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: 
vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0 @@ -4945,21 +4883,17 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind { ; ; AVX2-SLOW-LABEL: trunc_or_const_v16i64_v16i8: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm3, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm2, %xmm4 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm4[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm2[2,3],ymm3[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm4[0,2],ymm2[4,6],ymm4[4,6] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm5[0,2],ymm0[4,6],ymm5[4,6] ; AVX2-SLOW-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: vpand %xmm4, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll index 80ce22ca93a7d..c74b87921c442 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll @@ -899,25 +899,25 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64>* %p0) "min-legal-vector-wid ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [4294967295,4294967295] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm7 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm9 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm7 -; AVX1-NEXT: vblendvpd %xmm6, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm6 +; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7 +; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm6 ; AVX1-NEXT: vblendvpd %xmm8, %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm4 ; AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; AVX1-NEXT: vpand %xmm1, %xmm7, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vpand %xmm2, %xmm7, 
%xmm2 ; AVX1-NEXT: vpand %xmm0, %xmm9, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc_packus_v8i64_v8i32: @@ -925,20 +925,18 @@ define <8 x i32> @trunc_packus_v8i64_v8i32(<8 x i64>* %p0) "min-legal-vector-wid ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vpand %ymm0, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2 -; AVX2-SLOW-NEXT: vpand %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vpand %ymm1, %ymm3, %ymm1 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm2 +; AVX2-SLOW-NEXT: vpand %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_packus_v8i64_v8i32: diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll index 543708caaea82..35a50b59fbf35 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll @@ -905,25 +905,25 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(<8 x i64>* %p0) "min-legal-vector-width ; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [2147483647,2147483647] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm8 -; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm6 -; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm7 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6 +; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm7 ; AVX1-NEXT: vpcmpgtq %xmm0, %xmm4, %xmm5 ; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [18446744071562067968,18446744071562067968] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm0, %xmm9 -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm4, %xmm1 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm7 -; AVX1-NEXT: vblendvpd %xmm6, %xmm2, %xmm4, %xmm2 -; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm6 +; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm7 +; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm6 ; AVX1-NEXT: vblendvpd %xmm8, %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm4 ; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm5, %xmm3 -; AVX1-NEXT: vblendvpd %xmm6, %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm5, %xmm2 ; 
AVX1-NEXT: vblendvpd %xmm9, %xmm0, %xmm5, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc_ssat_v8i64_v8i32: @@ -931,20 +931,18 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(<8 x i64>* %p0) "min-legal-vector-width ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3 ; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3 ; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_ssat_v8i64_v8i32: diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll index 7afdb04f6b8fe..8434bac380df7 100644 --- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll +++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll @@ -611,20 +611,20 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(<8 x i64>* %p0) { ; AVX1-NEXT: vpxor %xmm4, %xmm0, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [9223372041149743103,9223372041149743103] ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm8 -; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm7 +; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm7 ; AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm7 -; AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpxor %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpxor %xmm4, %xmm3, %xmm4 ; AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 ; AVX1-NEXT: vmovapd {{.*#+}} xmm6 = [4294967295,4294967295] ; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; AVX1-NEXT: vblendvpd %xmm7, %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vblendvpd %xmm7, %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vblendvpd %xmm8, %xmm0, %xmm6, %xmm0 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc_usat_v8i64_v8i32: @@ -633,18 +633,16 @@ define <8 x i32> @trunc_usat_v8i64_v8i32(<8 x i64>* %p0) { ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = 
[4294967295,4294967295,4294967295,4294967295] ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm4 +; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm4 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372041149743103,9223372041149743103,9223372041149743103,9223372041149743103] ; AVX2-SLOW-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm1, %ymm3 +; AVX2-SLOW-NEXT: vblendvpd %ymm4, %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpxor %ymm3, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vpcmpgtq %ymm3, %ymm5, %ymm3 -; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc_usat_v8i64_v8i32: diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index 36f7d46cd3329..94b08c9abb820 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -20,20 +20,16 @@ define <8 x i32> @trunc8i64_8i32(<8 x i64> %a) { ; ; AVX1-LABEL: trunc8i64_8i32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc8i64_8i32: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc8i64_8i32: @@ -63,20 +59,16 @@ define <8 x i32> @trunc8i64_8i32_ashr(<8 x i64> %a) { ; ; AVX1-LABEL: trunc8i64_8i32_ashr: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc8i64_8i32_ashr: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, 
%xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc8i64_8i32_ashr: @@ -108,22 +100,18 @@ define <8 x i32> @trunc8i64_8i32_lshr(<8 x i64> %a) { ; ; AVX1-LABEL: trunc8i64_8i32_lshr: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3] -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc8i64_8i32_lshr: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpsrlq $32, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc8i64_8i32_lshr: @@ -205,11 +193,9 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) { ; ; AVX2-SLOW-LABEL: trunc8i64_8i16: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 @@ -1361,20 +1347,16 @@ define <8 x i32> @trunc2x4i64_8i32(<4 x i64> %a, <4 x i64> %b) { ; ; AVX1-LABEL: trunc2x4i64_8i32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: trunc2x4i64_8i32: ; AVX2-SLOW: # %bb.0: # %entry -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] -; AVX2-SLOW-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2] +; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: trunc2x4i64_8i32: diff --git a/llvm/test/CodeGen/X86/win64-eh-empty-block-2.mir b/llvm/test/CodeGen/X86/win64-eh-empty-block-2.mir new file mode 100644 index 
0000000000000..df63ba4308c8e --- /dev/null +++ b/llvm/test/CodeGen/X86/win64-eh-empty-block-2.mir @@ -0,0 +1,223 @@ +# RUN: llc -start-before=x86-avoid-trailing-call %s -o - | FileCheck %s + +# If there is a trailing unreachable block, make sure it is non-empty. + +# Manually modified the IR of the following C++ to share one unreachable block, +# as clang does for the real C++ throw: +# void __declspec(noreturn) mythrow(); +# int multi_throw(bool c1, bool c2, bool c3) { +# try { +# if (c1) +# mythrow(); +# if (c2) +# mythrow(); +# if (c3) +# mythrow(); +# } catch (...) { +# return 1; +# } +# return 0; +# } + +# CHECK-LABEL: "?multi_throw@@YAH_N00@Z": # @"?multi_throw@@YAH_N00@Z" +# CHECK: retq +# CHECK: .LBB{{.*}} # %if.then +# CHECK: callq mythrow +# CHECK: .LBB{{.*}} # %if.then4 +# CHECK: callq mythrow +# CHECK: .LBB{{.*}} # %if.then8 +# CHECK: callq mythrow +# CHECK: .LBB{{.*}} # %unreachable +# CHECK-NEXT: int3 +# CHECK: .seh_endproc +# CHECK: # %catch + +--- | + ; ModuleID = '../llvm/test/CodeGen/X86/win64-eh-empty-block-2.ll' + source_filename = "t.cpp" + target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-unknown-windows-msvc19.11.0" + + ; Function Attrs: uwtable + define dso_local i32 @"?multi_throw@@YAH_N00@Z"(i1 zeroext %c1, i1 zeroext %c2, i1 zeroext %c3) local_unnamed_addr #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) { + entry: + br i1 %c1, label %if.then, label %if.end + + if.then: ; preds = %entry + invoke void @mythrow() + to label %unreachable unwind label %catch.dispatch + + unreachable: ; preds = %if.then8, %if.then4, %if.then + unreachable + + if.end: ; preds = %entry + br i1 %c2, label %if.then4, label %if.end6 + + if.then4: ; preds = %if.end + invoke void @mythrow() + to label %unreachable unwind label %catch.dispatch + + if.end6: ; preds = %if.end + br i1 %c3, label %if.then8, label %return + + if.then8: ; preds = %if.end6 + invoke void @mythrow() + to label %unreachable unwind label %catch.dispatch + + catch.dispatch: ; preds = %if.then8, %if.then4, %if.then + %0 = catchswitch within none [label %catch] unwind to caller + + catch: ; preds = %catch.dispatch + %1 = catchpad within %0 [i8* null, i32 64, i8* null] + catchret from %1 to label %return + + return: ; preds = %catch, %if.end6 + %retval.0 = phi i32 [ 1, %catch ], [ 0, %if.end6 ] + ret i32 %retval.0 + } + + declare dso_local void @mythrow() + + declare dso_local i32 @__CxxFrameHandler3(...) + + attributes #0 = { uwtable } + + !llvm.module.flags = !{!0, !1} + + !0 = !{i32 1, !"wchar_size", i32 2} + !1 = !{i32 7, !"PIC Level", i32 2} + +... 
+--- +name: '?multi_throw@@YAH_N00@Z' +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: true +registers: [] +liveins: + - { reg: '$cl', virtual-reg: '' } + - { reg: '$dl', virtual-reg: '' } + - { reg: '$r8b', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 56 + offsetAdjustment: -56 + maxAlignment: 8 + adjustsStack: true + hasCalls: true + stackProtector: '' + maxCallFrameSize: 32 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: true + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: + - { id: 0, type: default, offset: -24, size: 8, alignment: 8, stack-id: default, + isImmutable: false, isAliased: false, callee-saved-register: '', + callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', + debug-info-location: '' } + - { id: 1, type: spill-slot, offset: -16, size: 8, alignment: 16, stack-id: default, + callee-saved-register: '', callee-saved-restored: true, debug-info-variable: '', + debug-info-expression: '', debug-info-location: '' } +stack: + - { id: 0, name: '', type: spill-slot, offset: -28, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x00000001), %bb.3(0x7fffffff) + liveins: $cl, $dl, $r8b + + frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp + frame-setup SEH_PushReg 50 + $rsp = frame-setup SUB64ri8 $rsp, 48, implicit-def dead $eflags + frame-setup SEH_StackAlloc 48 + $rbp = LEA64r $rsp, 1, $noreg, 48, $noreg + frame-setup SEH_SetFrame 50, 48 + frame-setup SEH_EndPrologue + MOV64mi32 $rbp, 1, $noreg, -8, $noreg, -2 :: (store 8 into %fixed-stack.0) + TEST8rr killed renamable $cl, renamable $cl, implicit-def $eflags + JCC_1 %bb.1, 5, implicit $eflags + + bb.3.if.end: + successors: %bb.4(0x00000001), %bb.5(0x7fffffff) + liveins: $dl, $r8b + + TEST8rr killed renamable $dl, renamable $dl, implicit-def $eflags + JCC_1 %bb.4, 5, implicit $eflags + + bb.5.if.end6: + successors: %bb.6(0x00000001), %bb.8(0x7fffffff) + liveins: $r8b + + MOV32mi $rbp, 1, $noreg, -12, $noreg, 0 :: (store 4 into %stack.0) + TEST8rr killed renamable $r8b, renamable $r8b, implicit-def $eflags + JCC_1 %bb.6, 5, implicit $eflags + + bb.8.return (address-taken): + $eax = MOV32rm $rbp, 1, $noreg, -12, $noreg :: (load 4 from %stack.0) + SEH_Epilogue + $rsp = frame-destroy ADD64ri8 $rsp, 48, implicit-def dead $eflags + $rbp = frame-destroy POP64r implicit-def $rsp, implicit $rsp + RETQ $eax + + bb.1.if.then: + successors: %bb.2(0x7ffff800), %bb.7(0x00000800) + + EH_LABEL + CALL64pcrel32 @mythrow, csr_win64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp + EH_LABEL + JMP_1 %bb.2 + + bb.4.if.then4: + successors: %bb.2(0x7ffff800), %bb.7(0x00000800) + + EH_LABEL + CALL64pcrel32 @mythrow, csr_win64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp + EH_LABEL + JMP_1 %bb.2 + + bb.6.if.then8: + successors: %bb.2(0x7ffff800), %bb.7(0x00000800) + + EH_LABEL + CALL64pcrel32 @mythrow, csr_win64, implicit $rsp, implicit $ssp, implicit-def $rsp, implicit-def $ssp + EH_LABEL + + bb.2.unreachable: + successors: + + + bb.7.catch (landing-pad, 
ehfunclet-entry): + successors: %bb.8(0x80000000) + liveins: $rdx + + frame-setup MOV64mr killed $rsp, 1, $noreg, 16, $noreg, $rdx + frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp + frame-setup SEH_PushReg 50 + $rsp = frame-setup SUB64ri8 $rsp, 32, implicit-def dead $eflags + frame-setup SEH_StackAlloc 32 + $rbp = LEA64r $rdx, 1, $noreg, 48, $noreg + frame-setup SEH_EndPrologue + MOV32mi $rbp, 1, $noreg, -12, $noreg, 1 :: (store 4 into %stack.0) + $rax = LEA64r $rip, 0, $noreg, %bb.8, $noreg + SEH_Epilogue + $rsp = frame-destroy ADD64ri8 $rsp, 32, implicit-def dead $eflags + $rbp = frame-destroy POP64r implicit-def $rsp, implicit $rsp + CATCHRET %bb.8, %bb.0 + +... diff --git a/llvm/test/CodeGen/X86/win64-eh-empty-block.ll b/llvm/test/CodeGen/X86/win64-eh-empty-block.ll index c93c53b6b68e0..0ec7adf6f6825 100644 --- a/llvm/test/CodeGen/X86/win64-eh-empty-block.ll +++ b/llvm/test/CodeGen/X86/win64-eh-empty-block.ll @@ -20,8 +20,8 @@ ; CHECK: callq __cxa_throw ; CHECK: # %eh.resume ; CHECK: callq _Unwind_Resume -; CHECK-NEXT: int3 ; CHECK-NEXT: # %unreachable +; CHECK-NEXT: int3 ; CHECK-NEXT: .Lfunc_end0: %struct.as = type { i32* } diff --git a/llvm/test/CodeGen/X86/wineh-coreclr.ll b/llvm/test/CodeGen/X86/wineh-coreclr.ll index fd2569e4bfb03..16daa1fa97180 100644 --- a/llvm/test/CodeGen/X86/wineh-coreclr.ll +++ b/llvm/test/CodeGen/X86/wineh-coreclr.ll @@ -320,8 +320,8 @@ unreachable: ; CHECK: [[test2_before_f2:.+]]: ; CHECK-NEXT: movl $2, %ecx ; CHECK-NEXT: callq f -; CHECK-NEXT: int3 ; CHECK-NEXT: [[test2_after_f2:.+]]: +; CHECK: int3 ; CHECK: [[test2_end:.*func_end.*]]: @@ -512,24 +512,24 @@ unreachable: ; CHECK: [[test3_before_f4:.+]]: ; CHECK-NEXT: movl $4, %ecx ; CHECK-NEXT: callq f -; CHECK-NEXT: int3 ; CHECK-NEXT: [[test3_after_f4:.+]]: +; CHECK: int3 ; CHECK: .seh_proc [[test3_fault2:[^ ]+]] ; CHECK: # %fault2 ; CHECK: .seh_endprologue ; CHECK: [[test3_before_f3:.+]]: ; CHECK-NEXT: movl $3, %ecx ; CHECK-NEXT: callq f -; CHECK-NEXT: int3 ; CHECK-NEXT: [[test3_after_f3:.+]]: +; CHECK: int3 ; CHECK: .seh_proc [[test3_fault1:[^ ]+]] ; CHECK: # %fault1 ; CHECK: .seh_endprologue ; CHECK: [[test3_before_f2:.+]]: ; CHECK-NEXT: movl $2, %ecx ; CHECK-NEXT: callq f -; CHECK-NEXT: int3 ; CHECK-NEXT: [[test3_after_f2:.+]]: +; CHECK: int3 ; CHECK: [[test3_end:.*func_end.*]]: } diff --git a/llvm/test/DebugInfo/AArch64/dbgcall-site-float-entry-value.ll b/llvm/test/DebugInfo/AArch64/dbgcall-site-float-entry-value.ll index a925cd0c1452d..3b91d17dc6284 100644 --- a/llvm/test/DebugInfo/AArch64/dbgcall-site-float-entry-value.ll +++ b/llvm/test/DebugInfo/AArch64/dbgcall-site-float-entry-value.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple aarch64-linux-gnu -emit-call-site-info -debug-entry-values -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s +; RUN: llc -mtriple aarch64-linux-gnu -emit-call-site-info -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s ; Based on the following C reproducer: ; diff --git a/llvm/test/DebugInfo/AMDGPU/variable-locations.ll b/llvm/test/DebugInfo/AMDGPU/variable-locations.ll index cb8ebb6a6997b..365dd9dfea78b 100644 --- a/llvm/test/DebugInfo/AMDGPU/variable-locations.ll +++ b/llvm/test/DebugInfo/AMDGPU/variable-locations.ll @@ -32,7 +32,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) @GlobB = common addrspace(1) global i32 0, align 4, !dbg !6 ; CHECK: {{.*}}DW_TAG_subprogram -; CHECK: DW_AT_frame_base [DW_FORM_block1] (DW_OP_reg{{.*}} SGPR9) +; CHECK-NOT: DW_AT_frame_base define amdgpu_kernel void @kernel1( ; CHECK: 
{{.*}}DW_TAG_formal_parameter diff --git a/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-orr-moves.mir b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-orr-moves.mir index ed2a7d903885b..e3ee2cac4fa98 100644 --- a/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-orr-moves.mir +++ b/llvm/test/DebugInfo/MIR/AArch64/dbgcall-site-orr-moves.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-after=livedebugvalues -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s +# RUN: llc -emit-call-site-info -start-after=livedebugvalues -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s # Based on the following C reproducer: # diff --git a/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovd.mir b/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovd.mir index bd42c229d29c9..8f7f789eb8fb7 100644 --- a/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovd.mir +++ b/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovd.mir @@ -1,4 +1,4 @@ -# RUN: llc -O1 -emit-call-site-info -debug-entry-values -filetype=obj -mtriple thumbv7em-apple-unknown-macho -start-after=machineverifier %s -o %t.o +# RUN: llc -O1 -emit-call-site-info -filetype=obj -mtriple thumbv7em-apple-unknown-macho -start-after=machineverifier %s -o %t.o # RUN: llvm-dwarfdump %t.o | FileCheck %s # Crash test, reduced from: diff --git a/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovs.mir b/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovs.mir index 2cf7e4d1c87fc..e3f1031796a04 100644 --- a/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovs.mir +++ b/llvm/test/DebugInfo/MIR/ARM/call-site-info-vmovs.mir @@ -1,4 +1,4 @@ -# RUN: llc -O1 -emit-call-site-info -debug-entry-values -filetype=obj -mtriple thumbv7em-apple-unknown-macho -start-after=machineverifier %s -o %t.o +# RUN: llc -O1 -emit-call-site-info -filetype=obj -mtriple thumbv7em-apple-unknown-macho -start-after=machineverifier %s -o %t.o # RUN: llvm-dwarfdump %t.o | FileCheck %s # Crash test, reduced from: diff --git a/llvm/test/DebugInfo/MIR/ARM/dbgcall-site-propagated-value.mir b/llvm/test/DebugInfo/MIR/ARM/dbgcall-site-propagated-value.mir index 5b84d9e9627f1..b25b1c90d6348 100644 --- a/llvm/test/DebugInfo/MIR/ARM/dbgcall-site-propagated-value.mir +++ b/llvm/test/DebugInfo/MIR/ARM/dbgcall-site-propagated-value.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -run-pass=livedebugvalues -o - %s | FileCheck %s +# RUN: llc -run-pass=livedebugvalues -o - %s | FileCheck %s # Based on the following C reproducer: # @@ -106,10 +106,6 @@ name: caller alignment: 4 tracksRegLiveness: true -callSites: - - { bb: 0, offset: 6 } - - { bb: 0, offset: 9, fwdArgRegs: - - { arg: 0, reg: '$r0' } } body: | bb.0: liveins: $lr diff --git a/llvm/test/DebugInfo/MIR/Hexagon/dbgcall-site-instr-before-bundled-call.mir b/llvm/test/DebugInfo/MIR/Hexagon/dbgcall-site-instr-before-bundled-call.mir index 3ae23d4189bf1..9baa815a0458b 100644 --- a/llvm/test/DebugInfo/MIR/Hexagon/dbgcall-site-instr-before-bundled-call.mir +++ b/llvm/test/DebugInfo/MIR/Hexagon/dbgcall-site-instr-before-bundled-call.mir @@ -1,4 +1,6 @@ -# RUN: llc -mtriple hexagon -emit-call-site-info -debug-entry-values -start-after=machineverifier -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s +# We do not support call site info for this target yet, so we use the experimental options (-emit-call-site-info -debug-entry-values).
+ +# RUN: llc -emit-call-site-info -debug-entry-values -mtriple hexagon -start-after=machineverifier -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s # Based on the following C reproducer: # diff --git a/llvm/test/DebugInfo/MIR/Hexagon/live-debug-values-bundled-entry-values.mir b/llvm/test/DebugInfo/MIR/Hexagon/live-debug-values-bundled-entry-values.mir index 8bb0b3202acd3..2ed3672c2ec30 100644 --- a/llvm/test/DebugInfo/MIR/Hexagon/live-debug-values-bundled-entry-values.mir +++ b/llvm/test/DebugInfo/MIR/Hexagon/live-debug-values-bundled-entry-values.mir @@ -1,3 +1,5 @@ +# We do not support call site info for this target yet, so we use the experimental options (-emit-call-site-info -debug-entry-values). + # RUN: llc -emit-call-site-info -debug-entry-values -run-pass=livedebugvalues -o - %s | FileCheck %s # Verify that the entry values for the input parameters are inserted after the diff --git a/llvm/test/DebugInfo/MIR/SystemZ/call-site-lzer.mir b/llvm/test/DebugInfo/MIR/SystemZ/call-site-lzer.mir index 3cf41467f7f9f..f173c9d780fa9 100644 --- a/llvm/test/DebugInfo/MIR/SystemZ/call-site-lzer.mir +++ b/llvm/test/DebugInfo/MIR/SystemZ/call-site-lzer.mir @@ -1,3 +1,5 @@ +# We do not support call site info for this target yet, so we use the experimental options (-emit-call-site-info -debug-entry-values). + # RUN: llc -emit-call-site-info -debug-entry-values -start-after=livedebugvalues -o - %s | FileCheck %s # This test would previously trigger an assertion when trying to describe the diff --git a/llvm/test/DebugInfo/MIR/X86/DW_OP_entry_value.mir b/llvm/test/DebugInfo/MIR/X86/DW_OP_entry_value.mir index 4e5a07321d428..f7f74b628d166 100644 --- a/llvm/test/DebugInfo/MIR/X86/DW_OP_entry_value.mir +++ b/llvm/test/DebugInfo/MIR/X86/DW_OP_entry_value.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-before=livedebugvalues -mtriple=x86_64-apple-darwin -o %t %s -filetype=obj +# RUN: llc -start-before=livedebugvalues -mtriple=x86_64-apple-darwin -o %t %s -filetype=obj # RUN: llvm-dwarfdump %t | FileCheck %s # # int global; diff --git a/llvm/test/DebugInfo/MIR/X86/call-site-gnu-vs-dwarf5-attrs.mir b/llvm/test/DebugInfo/MIR/X86/call-site-gnu-vs-dwarf5-attrs.mir index 891fbb60b36e6..b60c10a04e596 100644 --- a/llvm/test/DebugInfo/MIR/X86/call-site-gnu-vs-dwarf5-attrs.mir +++ b/llvm/test/DebugInfo/MIR/X86/call-site-gnu-vs-dwarf5-attrs.mir @@ -1,16 +1,28 @@ # Test the call site encoding in DWARF5 vs GNU extensions.
# -# RUN: llc -dwarf-version 4 -debugger-tune=gdb -emit-call-site-info -debug-entry-values -filetype=obj \ +# === DWARF4, tune for gdb === +# RUN: llc -emit-call-site-info -dwarf-version 4 -debugger-tune=gdb -filetype=obj \ # RUN: -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s \ -# RUN: | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-GNU +# RUN: | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-GNU -implicit-check-not=DW_AT_call # -# RUN: llc -dwarf-version 5 -debugger-tune=lldb -emit-call-site-info -debug-entry-values -filetype=obj \ +# === DWARF5, tune for gdb === +# RUN: llc -dwarf-version 5 -debugger-tune=gdb -emit-call-site-info -filetype=obj \ +# RUN: -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s \ +# RUN: | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 -implicit-check-not=DW_AT_call +# +# === DWARF4, tune for lldb === +# RUN: llc -dwarf-version 4 -debugger-tune=lldb -emit-call-site-info -filetype=obj \ # RUN: -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s \ -# RUN: | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 +# RUN: | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 -implicit-check-not=DW_AT_call # -# RUN: llc -dwarf-version 5 -emit-call-site-info -debug-entry-values -filetype=obj \ +# === DWARF5, tune for lldb === +# RUN: llc -dwarf-version 5 -debugger-tune=lldb -emit-call-site-info -filetype=obj \ # RUN: -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s \ -# RUN: | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 +# RUN: | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 -implicit-check-not=DW_AT_call +# +# RUN: llc -emit-call-site-info -dwarf-version 5 -filetype=obj -debugger-tune=sce \ +# RUN: -emit-debug-entry-values -debug-entry-values -mtriple=x86_64-unknown-unknown \ +# RUN: -start-after=machineverifier -o - %s | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 # # This is based on the following reproducer: # @@ -45,6 +57,7 @@ # CHECK-GNU: DW_TAG_GNU_call_site # CHECK-GNU-NEXT: DW_AT_abstract_origin # CHECK-GNU-NEXT: DW_AT_GNU_tail_call +# CHECK-GNU-NEXT: DW_AT_low_pc # # # Check DWARF 5: @@ -54,6 +67,9 @@ # CHECK-DWARF5: DW_TAG_call_site # CHECK-DWARF5-NEXT: DW_AT_call_origin # CHECK-DWARF5-NEXT: DW_AT_call_return_pc +# CHECK-DWARF5: DW_TAG_call_site +# CHECK-DWARF5-NEXT: DW_AT_call_origin +# CHECK-DWARF5-NEXT: DW_AT_call_return_pc # CHECK-DWARF5: DW_TAG_call_site_parameter # CHECK-DWARF5-NEXT: DW_AT_location # CHECK-DWARF5-NEXT: DW_AT_call_value @@ -63,6 +79,7 @@ # CHECK-DWARF5: DW_TAG_call_site # CHECK-DWARF5-NEXT: DW_AT_call_origin # CHECK-DWARF5-NEXT: DW_AT_call_tail_call +# CHECK-DWARF5-NEXT: DW_AT_call_pc # --- | ; ModuleID = 'call-site-attrs.c' diff --git a/llvm/test/DebugInfo/MIR/X86/callsite-stack-value.mir b/llvm/test/DebugInfo/MIR/X86/callsite-stack-value.mir index 6d69f06302029..5b9ecf08150be 100644 --- a/llvm/test/DebugInfo/MIR/X86/callsite-stack-value.mir +++ b/llvm/test/DebugInfo/MIR/X86/callsite-stack-value.mir @@ -1,5 +1,5 @@ # RUN: llc -start-after=livedebugvalues -mtriple=x86_64-apple-darwin -o - %s -filetype=obj \ -# RUN: -emit-call-site-info -debug-entry-values | llvm-dwarfdump - | FileCheck %s -implicit-check-not=call_site_parameter +# RUN: -emit-call-site-info | llvm-dwarfdump - | FileCheck %s -implicit-check-not=call_site_parameter # CHECK: DW_TAG_formal_parameter # CHECK-NEXT: DW_AT_location diff --git a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-copy-super-sub.mir 
b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-copy-super-sub.mir index 01a2b887a60b6..347a0ec09bb24 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-copy-super-sub.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-copy-super-sub.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-after=livedebugvalues -filetype=obj %s -o -| llvm-dwarfdump -| FileCheck %s +# RUN: llc -emit-call-site-info -start-after=livedebugvalues -filetype=obj %s -o -| llvm-dwarfdump -| FileCheck %s # Based on the following reproducer: # diff --git a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-interpretation.mir b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-interpretation.mir index d6c6b30184622..b142313871eb1 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-interpretation.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-interpretation.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-after=machineverifier -filetype=obj %s -o -| llvm-dwarfdump -| FileCheck %s +# RUN: llc -emit-call-site-info -start-after=machineverifier -filetype=obj %s -o -| llvm-dwarfdump -| FileCheck %s # # CHECK: DW_TAG_GNU_call_site # CHECK-NEXT: DW_AT_abstract_origin {{.*}} "foo" diff --git a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-lea-interpretation.mir b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-lea-interpretation.mir index 4d88fa9aab74d..79e40b65c4208 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-lea-interpretation.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-lea-interpretation.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-after=machineverifier -filetype=obj %s -o -| llvm-dwarfdump -| FileCheck %s +# RUN: llc -emit-call-site-info -start-after=machineverifier -filetype=obj %s -o -| llvm-dwarfdump -| FileCheck %s # CHECK: DW_TAG_GNU_call_site # CHECK-NEXT: DW_AT_abstract_origin {{.*}} "foo") # CHECK-NEXT: DW_AT_low_pc {{.*}} diff --git a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-partial-describe.mir b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-partial-describe.mir index b97785d650c94..f0902bbe41d66 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-partial-describe.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-partial-describe.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-before=livedebugvalues -filetype=obj -o - %s \ +# RUN: llc -emit-call-site-info -start-before=livedebugvalues -filetype=obj -o - %s \ # RUN: | llvm-dwarfdump - | FileCheck %s --implicit-check-not=DW_TAG_GNU_call_site_parameter --- | diff --git a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reference.mir b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reference.mir index 81af598ba1942..73927772ca085 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reference.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reference.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-before=livedebugvalues -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s +# RUN: llc -start-before=livedebugvalues -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s # Based on the following C++ code: # struct A { A(A &) {} }; diff --git a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reg-shuffle.mir b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reg-shuffle.mir index 1baf66393c49d..27a03193e8161 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reg-shuffle.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-reg-shuffle.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -start-before=livedebugvalues -filetype=obj -o - %s 
\ +# RUN: llc -emit-call-site-info -start-before=livedebugvalues -filetype=obj -o - %s \ # RUN: | llvm-dwarfdump - | FileCheck %s --implicit-check-not=DW_TAG_GNU_call_site_parameter --- | diff --git a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-two-fwd-reg-defs.mir b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-two-fwd-reg-defs.mir index ac97da66a397c..c1bdbd0783acd 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbgcall-site-two-fwd-reg-defs.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbgcall-site-two-fwd-reg-defs.mir @@ -1,4 +1,4 @@ -# RUN: llc -O1 -emit-call-site-info -debug-entry-values -start-after=livedebugvalues -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s +# RUN: llc -O1 -emit-call-site-info -start-after=livedebugvalues -filetype=obj %s -o - | llvm-dwarfdump - | FileCheck %s # Based on the following C reproducer: # diff --git a/llvm/test/DebugInfo/MIR/X86/dbginfo-entryvals.mir b/llvm/test/DebugInfo/MIR/X86/dbginfo-entryvals.mir index 9346b513cf481..302cce20a15ac 100644 --- a/llvm/test/DebugInfo/MIR/X86/dbginfo-entryvals.mir +++ b/llvm/test/DebugInfo/MIR/X86/dbginfo-entryvals.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -run-pass=livedebugvalues -verify-machineinstrs -march=x86-64 -o - %s | FileCheck %s +# RUN: llc -run-pass=livedebugvalues -verify-machineinstrs -march=x86-64 -o - %s | FileCheck %s # #extern void fn2(int); # diff --git a/llvm/test/DebugInfo/MIR/X86/debug-call-site-param.mir b/llvm/test/DebugInfo/MIR/X86/debug-call-site-param.mir index 2a78919afd438..c5ca4f1b2a147 100644 --- a/llvm/test/DebugInfo/MIR/X86/debug-call-site-param.mir +++ b/llvm/test/DebugInfo/MIR/X86/debug-call-site-param.mir @@ -2,8 +2,8 @@ # When the debugger tuning is set to gdb, use GNU opcodes. # For lldb, use the standard DWARF5 opcodes. -# RUN: llc -emit-call-site-info -debug-entry-values -debugger-tune=gdb -filetype=obj -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-GNU -# RUN: llc -emit-call-site-info -debug-entry-values -debugger-tune=lldb -filetype=obj -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 +# RUN: llc -emit-call-site-info -debugger-tune=gdb -filetype=obj -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-GNU +# RUN: llc -emit-call-site-info -debugger-tune=lldb -filetype=obj -mtriple=x86_64-unknown-unknown -start-after=machineverifier -o - %s | llvm-dwarfdump - | FileCheck %s -check-prefixes=CHECK-DWARF5 # # extern void foo(int *a, int b, int c, int d, int e, int f); # extern int getVal(); diff --git a/llvm/test/DebugInfo/MIR/X86/entry-value-of-modified-param.mir b/llvm/test/DebugInfo/MIR/X86/entry-value-of-modified-param.mir index 541a2155578ec..0dd63ae98009c 100644 --- a/llvm/test/DebugInfo/MIR/X86/entry-value-of-modified-param.mir +++ b/llvm/test/DebugInfo/MIR/X86/entry-value-of-modified-param.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s +# RUN: llc -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s # #extern void fn1 (int, int, int); # @@ -85,11 +85,6 @@ --- name: fn2 alignment: 16 -callSites: - - { bb: 0, offset: 14, fwdArgRegs: - - { arg: 0, reg: '$edi' } - - { arg: 1, reg: '$esi' } - - { arg: 2, reg: '$edx' } } body: | bb.0.entry: liveins: $edi, $esi, $rbx diff --git a/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir 
b/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir index 042d76058a228..fc7bd93d0223c 100644 --- a/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir +++ b/llvm/test/DebugInfo/MIR/X86/entry-values-diamond-bbs.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s +# RUN: llc -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s # # The test case was artificially adjusted, in order to make proper diamond basic # block structure relevant to the debug entry values propagation. diff --git a/llvm/test/DebugInfo/MIR/X86/propagate-entry-value-cross-bbs.mir b/llvm/test/DebugInfo/MIR/X86/propagate-entry-value-cross-bbs.mir index c5af863954bfb..34f80f5ca2a32 100644 --- a/llvm/test/DebugInfo/MIR/X86/propagate-entry-value-cross-bbs.mir +++ b/llvm/test/DebugInfo/MIR/X86/propagate-entry-value-cross-bbs.mir @@ -1,4 +1,4 @@ -# RUN: llc -emit-call-site-info -debug-entry-values -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s +# RUN: llc -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck %s # #extern void fn1 (int, int, int); #__attribute__((noinline)) @@ -110,15 +110,6 @@ --- name: fn2 alignment: 16 -callSites: - - { bb: 0, offset: 20, fwdArgRegs: - - { arg: 0, reg: '$edi' } - - { arg: 1, reg: '$esi' } - - { arg: 2, reg: '$edx' } } - - { bb: 3, offset: 2, fwdArgRegs: - - { arg: 0, reg: '$edi' } - - { arg: 1, reg: '$esi' } - - { arg: 2, reg: '$edx' } } body: | bb.0.entry: successors: %bb.1(0x40000000), %bb.2(0x40000000) diff --git a/llvm/test/DebugInfo/MIR/X86/unreachable-block-call-site.mir b/llvm/test/DebugInfo/MIR/X86/unreachable-block-call-site.mir index ea9c12b5a192a..bfc5c2be127e7 100644 --- a/llvm/test/DebugInfo/MIR/X86/unreachable-block-call-site.mir +++ b/llvm/test/DebugInfo/MIR/X86/unreachable-block-call-site.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=x86_64-pc-linux -emit-call-site-info -debug-entry-values -run-pass=unreachable-mbb-elimination -o - %s | FileCheck %s +# RUN: llc -emit-call-site-info -mtriple=x86_64-pc-linux -run-pass=unreachable-mbb-elimination -o - %s | FileCheck %s # Verify that the call site information for the call residing in the eliminated # block is removed. 
This test case would previously trigger an assertion when diff --git a/llvm/test/DebugInfo/X86/arange.ll b/llvm/test/DebugInfo/X86/arange.ll index f9facc795f541..49090bfc61075 100644 --- a/llvm/test/DebugInfo/X86/arange.ll +++ b/llvm/test/DebugInfo/X86/arange.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj -generate-arange-section < %s | llvm-dwarfdump -debug-aranges - | FileCheck %s ; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj -generate-arange-section < %s | llvm-readobj --relocations - | FileCheck --check-prefix=OBJ %s diff --git a/llvm/test/DebugInfo/X86/arguments.ll b/llvm/test/DebugInfo/X86/arguments.ll index 05b2981439bc9..8f030867c3d98 100644 --- a/llvm/test/DebugInfo/X86/arguments.ll +++ b/llvm/test/DebugInfo/X86/arguments.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-unknown-unknown -O0 -filetype=obj < %s > %t ; RUN: llvm-dwarfdump %t | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/c-type-units.ll b/llvm/test/DebugInfo/X86/c-type-units.ll index 844823b2cda89..889bd79d27dec 100644 --- a/llvm/test/DebugInfo/X86/c-type-units.ll +++ b/llvm/test/DebugInfo/X86/c-type-units.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -o - %s -filetype=obj -O0 -debugger-tune=lldb -generate-type-units -mtriple=x86_64-unknown-linux-gnu | llvm-dwarfdump -debug-types - | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/dbg-value-range.ll b/llvm/test/DebugInfo/X86/dbg-value-range.ll index e0cfe5f15ee95..9159d2aac780c 100644 --- a/llvm/test/DebugInfo/X86/dbg-value-range.ll +++ b/llvm/test/DebugInfo/X86/dbg-value-range.ll @@ -56,6 +56,6 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone ;CHECK-NEXT: .quad [[CLOBBER_OFF]] ;CHECK-NEXT: .short 1 ## Loc expr size ;CHECK-NEXT: .byte 85 ## DW_OP_reg -;CHECK-NEXT: .quad 0 +;CHECK: .quad 0 ;CHECK-NEXT: .quad 0 !24 = !{i32 1, !"Debug Info Version", i32 3} diff --git a/llvm/test/DebugInfo/X86/dbg-value-regmask-clobber.ll b/llvm/test/DebugInfo/X86/dbg-value-regmask-clobber.ll index 440498a9d8dd4..425a6cb38c410 100644 --- a/llvm/test/DebugInfo/X86/dbg-value-regmask-clobber.ll +++ b/llvm/test/DebugInfo/X86/dbg-value-regmask-clobber.ll @@ -9,8 +9,7 @@ ; ASM: movl $1, x(%rip) ; ASM: callq clobber ; ASM-NEXT: [[argc_range_end:.Ltmp[0-9]+]]: -; Previously LiveDebugValues would claim argc was still in ecx after the call. -; ASM-NOT: #DEBUG_VALUE: main:argc +; ASM: #DEBUG_VALUE: main:argc <- [DW_OP_LLVM_entry_value 1] $ecx ; argc is the first debug location. ; ASM: .Ldebug_loc1: @@ -23,7 +22,8 @@ ; DWARF: .debug_info contents: ; DWARF: DW_TAG_formal_parameter ; DWARF-NEXT: DW_AT_location ({{0x.*}} -; DWARF-NEXT: [0x0000000000000000, 0x0000000000000013): DW_OP_reg2 RCX) +; DWARF-NEXT: [0x0000000000000000, 0x0000000000000013): DW_OP_reg2 RCX +; DWARF-NEXT: [0x0000000000000013, 0x0000000000000043): DW_OP_GNU_entry_value(DW_OP_reg2 RCX), DW_OP_stack_value ; DWARF-NEXT: DW_AT_name ("argc") ; ModuleID = 't.cpp' diff --git a/llvm/test/DebugInfo/X86/dbgcall-site-64-bit-imms.ll b/llvm/test/DebugInfo/X86/dbgcall-site-64-bit-imms.ll index b8cd9574cc63d..f12dfa6196c1f 100644 --- a/llvm/test/DebugInfo/X86/dbgcall-site-64-bit-imms.ll +++ b/llvm/test/DebugInfo/X86/dbgcall-site-64-bit-imms.ll @@ -1,4 +1,4 @@ -; RUN: llc -O1 -emit-call-site-info -debug-entry-values -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s +; RUN: llc -emit-call-site-info -O1 -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s ; Verify that the 64-bit call site immediates are not truncated. 
; diff --git a/llvm/test/DebugInfo/X86/dbgcall-site-zero-valued-imms.ll b/llvm/test/DebugInfo/X86/dbgcall-site-zero-valued-imms.ll index 5d37774f55d6f..dc8c418117c75 100644 --- a/llvm/test/DebugInfo/X86/dbgcall-site-zero-valued-imms.ll +++ b/llvm/test/DebugInfo/X86/dbgcall-site-zero-valued-imms.ll @@ -1,4 +1,4 @@ -; RUN: llc -O3 -emit-call-site-info -debug-entry-values -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s +; RUN: llc -emit-call-site-info -O3 -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/DebugInfo/X86/debug-loc-frame.ll b/llvm/test/DebugInfo/X86/debug-loc-frame.ll index 653ba1f3eb688..83c678ccedf97 100644 --- a/llvm/test/DebugInfo/X86/debug-loc-frame.ll +++ b/llvm/test/DebugInfo/X86/debug-loc-frame.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; Check that when variables are allocated on the stack we generate debug locations ; for the stack location directly instead of generating a register+offset indirection. diff --git a/llvm/test/DebugInfo/X86/debug-names-ir-disabled.ll b/llvm/test/DebugInfo/X86/debug-names-ir-disabled.ll index c1f1dc99ede56..6502c86a20b8e 100644 --- a/llvm/test/DebugInfo/X86/debug-names-ir-disabled.ll +++ b/llvm/test/DebugInfo/X86/debug-names-ir-disabled.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; Verify that no DWARF v5 names section is emitted when all CUs disable name tables. ; RUN: llc -mtriple x86_64-pc-linux -filetype=obj < %s \ diff --git a/llvm/test/DebugInfo/X86/debug-names-partial.ll b/llvm/test/DebugInfo/X86/debug-names-partial.ll index 28ee59ea71623..50a21d6b9cdc8 100644 --- a/llvm/test/DebugInfo/X86/debug-names-partial.ll +++ b/llvm/test/DebugInfo/X86/debug-names-partial.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; Verify that DWARF v5 debug_names omit names from CUs that opt-out. ; RUN: llc -mtriple x86_64-pc-linux -filetype=obj < %s \ diff --git a/llvm/test/DebugInfo/X86/debug-names-split-dwarf.ll b/llvm/test/DebugInfo/X86/debug-names-split-dwarf.ll index 26687e8143cea..66520395dadf0 100644 --- a/llvm/test/DebugInfo/X86/debug-names-split-dwarf.ll +++ b/llvm/test/DebugInfo/X86/debug-names-split-dwarf.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; Verify that DWARF v5 accelerator tables work with split-dwarf. ; RUN: llc -mtriple x86_64-pc-linux -split-dwarf-file=foo.dwo \ diff --git a/llvm/test/DebugInfo/X86/decl-derived-member.ll b/llvm/test/DebugInfo/X86/decl-derived-member.ll index acb39f4e8f6cb..2d5ca1a87f9b5 100644 --- a/llvm/test/DebugInfo/X86/decl-derived-member.ll +++ b/llvm/test/DebugInfo/X86/decl-derived-member.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple x86_64-pc-linux -O0 -filetype=obj %s -o %t ; RUN: llvm-dwarfdump %t | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/dwarf-callsite-related-attrs.ll b/llvm/test/DebugInfo/X86/dwarf-callsite-related-attrs.ll index c019d75c0dbfd..3a34ff38809f9 100644 --- a/llvm/test/DebugInfo/X86/dwarf-callsite-related-attrs.ll +++ b/llvm/test/DebugInfo/X86/dwarf-callsite-related-attrs.ll @@ -12,10 +12,9 @@ ; and fail with "failed to compute relocation: IMAGE_REL_AMD64_ADDR32". 
; UNSUPPORTED: cygwin,windows-gnu,windows-msvc -; REQUIRES: x86 ; RUN: %llc_dwarf -mtriple=x86_64-- < %s -o - | FileCheck %s -check-prefix=ASM ; RUN: %llc_dwarf -debugger-tune=lldb -mtriple=x86_64-- < %s -filetype=obj -o %t.o -; RUN: llvm-dwarfdump %t.o -o - | FileCheck %s -check-prefix=OBJ -implicit-check-not=DW_TAG_call_site +; RUN: llvm-dwarfdump %t.o -o - | FileCheck %s -check-prefix=OBJ -implicit-check-not=DW_TAG_call -implicit-check-not=DW_AT_call ; RUN: llvm-dwarfdump -verify %t.o 2>&1 | FileCheck %s -check-prefix=VERIFY ; RUN: llvm-dwarfdump -statistics %t.o | FileCheck %s -check-prefix=STATS ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis -o /dev/null @@ -76,6 +75,7 @@ entry: ; OBJ: DW_TAG_call_site ; OBJ: DW_AT_call_origin ([[bat_sp]]) ; OBJ: DW_AT_call_tail_call +; OBJ: DW_AT_call_pc define void @_Z3foov() !dbg !25 { entry: tail call void @__has_no_subprogram() diff --git a/llvm/test/DebugInfo/X86/generate-odr-hash.ll b/llvm/test/DebugInfo/X86/generate-odr-hash.ll index 68dcfda2254b0..9fa954cd24858 100644 --- a/llvm/test/DebugInfo/X86/generate-odr-hash.ll +++ b/llvm/test/DebugInfo/X86/generate-odr-hash.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc < %s -o %t -filetype=obj -O0 -generate-type-units -mtriple=x86_64-unknown-linux-gnu ; RUN: llvm-dwarfdump -v %t | FileCheck --check-prefix=CHECK --check-prefix=SINGLE %s diff --git a/llvm/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll b/llvm/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll index 8796851e593cf..f80b3ac162240 100644 --- a/llvm/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll +++ b/llvm/test/DebugInfo/X86/ghost-sdnode-dbgvalues.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-apple-macosx10.10.0 -o %t %s diff --git a/llvm/test/DebugInfo/X86/inline-member-function.ll b/llvm/test/DebugInfo/X86/inline-member-function.ll index 76f1d86777ac2..31cc5b0fa5cff 100644 --- a/llvm/test/DebugInfo/X86/inline-member-function.ll +++ b/llvm/test/DebugInfo/X86/inline-member-function.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj < %s | llvm-dwarfdump -v -debug-info - | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/lexical_block.ll b/llvm/test/DebugInfo/X86/lexical_block.ll index a08cb0346c124..1af231e8dfe0a 100644 --- a/llvm/test/DebugInfo/X86/lexical_block.ll +++ b/llvm/test/DebugInfo/X86/lexical_block.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj < %s \ ; RUN: | llvm-dwarfdump -v -debug-info - | FileCheck --check-prefix=CHECK --check-prefix=CHECK-V4 %s diff --git a/llvm/test/DebugInfo/X86/loclists-dwp.ll b/llvm/test/DebugInfo/X86/loclists-dwp.ll index 91f8388763386..a972c8094c5f1 100644 --- a/llvm/test/DebugInfo/X86/loclists-dwp.ll +++ b/llvm/test/DebugInfo/X86/loclists-dwp.ll @@ -19,10 +19,12 @@ ; void b(int i) { asm("" : : : "rdi"); } ; CHECK: DW_AT_location [DW_FORM_sec_offset] (0x00000000 -; CHECK-NEXT: DW_LLE_startx_length (0x0000000000000000, 0x0000000000000006): DW_OP_reg5 RDI) +; CHECK-NEXT: DW_LLE_startx_length (0x0000000000000000, 0x0000000000000006): DW_OP_reg5 RDI +; CHECK-NEXT: DW_LLE_startx_length (0x0000000000000001, 0x0000000000000002): DW_OP_GNU_entry_value(DW_OP_reg5 RDI), DW_OP_stack_value) ; CHECK: DW_AT_location [DW_FORM_sec_offset] (0x00000000 -; CHECK-NEXT: DW_LLE_startx_length (0x0000000000000000, 0x0000000000000000): DW_OP_reg5 RDI) +; CHECK-NEXT: DW_LLE_startx_length (0x0000000000000000, 0x0000000000000000): DW_OP_reg5 RDI +; CHECK-NEXT: DW_LLE_startx_length (0x0000000000000001, 0x0000000000000001): 
DW_OP_GNU_entry_value(DW_OP_reg5 RDI), DW_OP_stack_value) target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/DebugInfo/X86/missing-file-line.ll b/llvm/test/DebugInfo/X86/missing-file-line.ll index 24cc418c43976..08f6f1529040e 100644 --- a/llvm/test/DebugInfo/X86/missing-file-line.ll +++ b/llvm/test/DebugInfo/X86/missing-file-line.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-linux-gnu -filetype=obj %s -o - | llvm-dwarfdump -all - | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/no-entry-values-with-O0.ll b/llvm/test/DebugInfo/X86/no-entry-values-with-O0.ll new file mode 100644 index 0000000000000..8ba22b7b6e510 --- /dev/null +++ b/llvm/test/DebugInfo/X86/no-entry-values-with-O0.ll @@ -0,0 +1,88 @@ +; RUN: llc -O0 -dwarf-version=5 -debugger-tune=lldb -march=x86-64 -filetype=obj < %s \ +; RUN: | llvm-dwarfdump - | FileCheck --implicit-check-not=DW_OP_entry_value %s +; RUN: llc -O0 -dwarf-version=5 -debugger-tune=gdb -march=x86-64 -filetype=obj < %s \ +; RUN: | llvm-dwarfdump - | FileCheck --implicit-check-not=DW_OP_entry_value %s + +; The call-site-params are created iff corresponding DISubprogram contains +; the AllCallsDescribed DIFlag. +; CHECK-NOT: DW_TAG_call_site_param + +; Genarated with: +; clang -gdwarf-5 -O0 test.c -S -emit-llvm +; +; ModuleID = 'test.c' +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind optnone uwtable +define dso_local void @fn1(i32 %x, i32 %y) !dbg !7 { +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + %u = alloca i32, align 4 + %a = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + call void @llvm.dbg.declare(metadata i32* %x.addr, metadata !11, metadata !DIExpression()), !dbg !12 + store i32 %y, i32* %y.addr, align 4 + call void @llvm.dbg.declare(metadata i32* %y.addr, metadata !13, metadata !DIExpression()), !dbg !14 + call void @llvm.dbg.declare(metadata i32* %u, metadata !15, metadata !DIExpression()), !dbg !16 + %0 = load i32, i32* %x.addr, align 4, !dbg !16 + %1 = load i32, i32* %y.addr, align 4, !dbg !16 + %add = add nsw i32 %0, %1, !dbg !16 + store i32 %add, i32* %u, align 4, !dbg !16 + %2 = load i32, i32* %x.addr, align 4, !dbg !17 + %cmp = icmp sgt i32 %2, 1, !dbg !17 + br i1 %cmp, label %if.then, label %if.else, !dbg !16 + +if.then: ; preds = %entry + %3 = load i32, i32* %u, align 4, !dbg !17 + %add1 = add nsw i32 %3, 1, !dbg !17 + store i32 %add1, i32* %u, align 4, !dbg !17 + br label %if.end, !dbg !17 + +if.else: ; preds = %entry + %4 = load i32, i32* %u, align 4, !dbg !17 + %add2 = add nsw i32 %4, 2, !dbg !17 + store i32 %add2, i32* %u, align 4, !dbg !17 + br label %if.end + +if.end: ; preds = %if.else, %if.then + call void @llvm.dbg.declare(metadata i32* %a, metadata !19, metadata !DIExpression()), !dbg !16 + store i32 7, i32* %a, align 4, !dbg !16 + %5 = load i32, i32* %a, align 4, !dbg !16 + call void @fn2(i32 %5), !dbg !16 + %6 = load i32, i32* %u, align 4, !dbg !16 + %dec = add nsw i32 %6, -1, !dbg !16 + store i32 %dec, i32* %u, align 4, !dbg !16 + ret void, !dbg !16 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare void @llvm.dbg.declare(metadata, metadata, metadata) + +declare dso_local void @fn2(i32) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 11.0.0", isOptimized: false, runtimeVersion: 0, 
emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 5} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 11.0.0"} +!7 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 5, type: !8, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10, !10} +!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!11 = !DILocalVariable(name: "x", arg: 1, scope: !7, file: !1, line: 5, type: !10) +!12 = !DILocation(line: 5, column: 10, scope: !7) +!13 = !DILocalVariable(name: "y", arg: 2, scope: !7, file: !1, line: 5, type: !10) +!14 = !DILocation(line: 5, column: 17, scope: !7) +!15 = !DILocalVariable(name: "u", scope: !7, file: !1, line: 6, type: !10) +!16 = !DILocation(line: 6, column: 7, scope: !7) +!17 = !DILocation(line: 7, column: 7, scope: !18) +!18 = distinct !DILexicalBlock(scope: !7, file: !1, line: 7, column: 7) +!19 = !DILocalVariable(name: "a", scope: !7, file: !1, line: 11, type: !10) diff --git a/llvm/test/DebugInfo/X86/nodebug.ll b/llvm/test/DebugInfo/X86/nodebug.ll index 6062f114f80b6..b52254dea86bc 100644 --- a/llvm/test/DebugInfo/X86/nodebug.ll +++ b/llvm/test/DebugInfo/X86/nodebug.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc < %s -filetype=obj -mtriple=x86_64-apple-darwin | llvm-dwarfdump -v - | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/nodebug_with_debug_loc.ll b/llvm/test/DebugInfo/X86/nodebug_with_debug_loc.ll index 417922c0d9a9b..a01e6c06398aa 100644 --- a/llvm/test/DebugInfo/X86/nodebug_with_debug_loc.ll +++ b/llvm/test/DebugInfo/X86/nodebug_with_debug_loc.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=i386-linux-gnu -filetype=obj -relocation-model=pic %s -o /dev/null diff --git a/llvm/test/DebugInfo/X86/parameters.ll b/llvm/test/DebugInfo/X86/parameters.ll index f0a970471bb40..5f4edd5b963de 100644 --- a/llvm/test/DebugInfo/X86/parameters.ll +++ b/llvm/test/DebugInfo/X86/parameters.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -O0 -filetype=obj %s -o - | llvm-dwarfdump -v -debug-info - | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/pr45181.ll b/llvm/test/DebugInfo/X86/pr45181.ll new file mode 100644 index 0000000000000..9c168164d4f8c --- /dev/null +++ b/llvm/test/DebugInfo/X86/pr45181.ll @@ -0,0 +1,306 @@ +; RUN: llc -O1 -filetype=obj -emit-call-site-info -debug-entry-values -o - < %s | llvm-dwarfdump -verify - -o /dev/null + +; TODO: This test should be made more targeted by converting to MIR and reducing, +; however at the moment conversion to MIR fails with: +; Assertion failed: (!NameRef.empty() && "Normal symbols cannot be unnamed!") + +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.14.0" + +%struct.e = type opaque +%"class.aa::aq" = type { i8 } +%"class.aa::ah" = type { i8 } +%"class.aa::y" = type { i8 } +%"class.aa::y.0" = type { i8 } +%struct.j = type opaque +%struct.h = type opaque +%struct.r = type opaque + +@o = local_unnamed_addr global i32 0, align 4, !dbg !0 +@p = local_unnamed_addr global %struct.e* null, align 8, !dbg !42 + +; Function Attrs: optsize ssp uwtable +define void @_ZN2aa2aq2arEv(%"class.aa::aq"* %this) local_unnamed_addr #0 align 2 !dbg !50 { +entry: + call void @llvm.dbg.value(metadata %"class.aa::aq"* 
%this, metadata !71, metadata !DIExpression()), !dbg !75 + %0 = bitcast %"class.aa::aq"* %this to %"class.aa::ah"*, !dbg !76 + tail call void @_ZN2aa2ah2aiEiib(%"class.aa::ah"* %0, i32 undef, i32 undef, i1 zeroext true) #5, !dbg !76 + ret void, !dbg !77 +} + +; Function Attrs: nounwind readnone speculatable willreturn +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) #2 + +; Function Attrs: optsize ssp uwtable +define linkonce_odr void @_ZN2aa2ah2aiEiib(%"class.aa::ah"* %this, i32 %aj, i32 %0, i1 zeroext %1) local_unnamed_addr #0 align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !dbg !78 { +entry: + %ao = alloca %"class.aa::y", align 1 + %ap = alloca %"class.aa::y.0", align 1 + call void @llvm.dbg.value(metadata %"class.aa::ah"* %this, metadata !80, metadata !DIExpression()), !dbg !126 + call void @llvm.dbg.value(metadata i32 %aj, metadata !82, metadata !DIExpression()), !dbg !126 + call void @llvm.dbg.value(metadata i32 %0, metadata !83, metadata !DIExpression()), !dbg !126 + call void @llvm.dbg.value(metadata i1 %1, metadata !84, metadata !DIExpression()), !dbg !126 + call void @llvm.dbg.value(metadata i32 %aj, metadata !85, metadata !DIExpression()), !dbg !126 + %2 = getelementptr inbounds %"class.aa::y", %"class.aa::y"* %ao, i64 0, i32 0, !dbg !127 + call void @llvm.lifetime.start.p0i8(i64 1, i8* nonnull %2) #6, !dbg !127 + call void @llvm.dbg.declare(metadata %"class.aa::y"* %ao, metadata !91, metadata !DIExpression()), !dbg !128 + %call = tail call %struct.j* @_Z1mPvS_lPFvS_PKvlE(i8* undef, i8* undef, i64 0, void (i8*, i8*, i64)* nonnull @_ZN2aa12_GLOBAL__N_12agEPvPKvl) #5, !dbg !129 + call void @_ZN2aa1yIP1jNS_2ac1zI1eEEEC1ES2_(%"class.aa::y"* nonnull %ao, %struct.j* %call) #5, !dbg !128 + %3 = getelementptr inbounds %"class.aa::y.0", %"class.aa::y.0"* %ap, i64 0, i32 0, !dbg !130 + call void @llvm.lifetime.start.p0i8(i64 1, i8* nonnull %3) #6, !dbg !130 + call void @llvm.dbg.declare(metadata %"class.aa::y.0"* %ap, metadata !110, metadata !DIExpression()), !dbg !131 + %4 = load %struct.e*, %struct.e** @p, align 8, !dbg !132, !tbaa !133 + %call3 = invoke %struct.h* @_Z1qP1e(%struct.e* %4) #5 + to label %invoke.cont unwind label %lpad, !dbg !137 + +invoke.cont: ; preds = %entry + invoke void @_ZN2aa1yIP1hNS_2ac1zI1eEEEC1ES2_(%"class.aa::y.0"* nonnull %ap, %struct.h* %call3) #5 + to label %invoke.cont4 unwind label %lpad, !dbg !131 + +invoke.cont4: ; preds = %invoke.cont + %conv = sext i32 %aj to i64, !dbg !138 + %mul = shl nsw i32 %aj, 2, !dbg !139 + %conv6 = sext i32 %mul to i64, !dbg !140 + %call9 = invoke %struct.h* @_ZN2aa1yIP1hNS_2ac1zI1eEEE2abEv(%"class.aa::y.0"* nonnull %ap) #5 + to label %invoke.cont8 unwind label %lpad7, !dbg !141 + +invoke.cont8: ; preds = %invoke.cont4 + %call11 = invoke %struct.j* @_ZN2aa1yIP1jNS_2ac1zI1eEEE2abEv(%"class.aa::y"* nonnull %ao) #5 + to label %invoke.cont10 unwind label %lpad7, !dbg !142 + +invoke.cont10: ; preds = %invoke.cont8 + %5 = load i32, i32* @o, align 4, !dbg !143, !tbaa !144 + %call13 = invoke %struct.r* @_Z1vlllllP1hiP1jPdb1n(i64 %conv, i64 0, i64 8, i64 2, i64 %conv6, %struct.h* %call9, i32 0, %struct.j* %call11, double* null, i1 zeroext false, i32 %5) #5 + to label %invoke.cont12 unwind label %lpad7, !dbg !146 + +invoke.cont12: ; preds = %invoke.cont10 + unreachable, !dbg !146 + +lpad: ; preds = %invoke.cont, %entry + %6 = landingpad { i8*, i32 } + cleanup, !dbg !147 + %7 = 
extractvalue { i8*, i32 } %6, 0, !dbg !147 + %8 = extractvalue { i8*, i32 } %6, 1, !dbg !147 + br label %ehcleanup, !dbg !147 + +lpad7: ; preds = %invoke.cont10, %invoke.cont8, %invoke.cont4 + %9 = landingpad { i8*, i32 } + cleanup, !dbg !147 + %10 = extractvalue { i8*, i32 } %9, 0, !dbg !147 + %11 = extractvalue { i8*, i32 } %9, 1, !dbg !147 + call void @_ZN2aa1yIP1hNS_2ac1zI1eEEED1Ev(%"class.aa::y.0"* nonnull %ap) #7, !dbg !147 + br label %ehcleanup, !dbg !147 + +ehcleanup: ; preds = %lpad7, %lpad + %exn.slot.0 = phi i8* [ %10, %lpad7 ], [ %7, %lpad ], !dbg !147 + %ehselector.slot.0 = phi i32 [ %11, %lpad7 ], [ %8, %lpad ], !dbg !147 + call void @llvm.lifetime.end.p0i8(i64 1, i8* nonnull %3) #6, !dbg !147 + call void @_ZN2aa1yIP1jNS_2ac1zI1eEEED1Ev(%"class.aa::y"* nonnull %ao) #7, !dbg !147 + call void @llvm.lifetime.end.p0i8(i64 1, i8* nonnull %2) #6, !dbg !147 + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn.slot.0, 0, !dbg !147 + %lpad.val19 = insertvalue { i8*, i32 } %lpad.val, i32 %ehselector.slot.0, 1, !dbg !147 + resume { i8*, i32 } %lpad.val19, !dbg !147 +} + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) #2 + +; Function Attrs: optsize +declare !dbg !11 %struct.j* @_Z1mPvS_lPFvS_PKvlE(i8*, i8*, i64, void (i8*, i8*, i64)*) local_unnamed_addr #3 + +; Function Attrs: optsize +declare void @_ZN2aa12_GLOBAL__N_12agEPvPKvl(i8*, i8*, i64) #3 + +; Function Attrs: optsize +declare void @_ZN2aa1yIP1jNS_2ac1zI1eEEEC1ES2_(%"class.aa::y"*, %struct.j*) unnamed_addr #3 + +; Function Attrs: optsize +declare !dbg !24 %struct.h* @_Z1qP1e(%struct.e*) local_unnamed_addr #3 + +declare i32 @__gxx_personality_v0(...) + +; Function Attrs: optsize +declare void @_ZN2aa1yIP1hNS_2ac1zI1eEEEC1ES2_(%"class.aa::y.0"*, %struct.h*) unnamed_addr #3 + +; Function Attrs: optsize +declare !dbg !31 %struct.r* @_Z1vlllllP1hiP1jPdb1n(i64, i64, i64, i64, i64, %struct.h*, i32, %struct.j*, double*, i1 zeroext, i32) local_unnamed_addr #3 + +; Function Attrs: optsize +declare %struct.h* @_ZN2aa1yIP1hNS_2ac1zI1eEEE2abEv(%"class.aa::y.0"*) local_unnamed_addr #3 + +; Function Attrs: optsize +declare %struct.j* @_ZN2aa1yIP1jNS_2ac1zI1eEEE2abEv(%"class.aa::y"*) local_unnamed_addr #3 + +; Function Attrs: nounwind optsize +declare void @_ZN2aa1yIP1hNS_2ac1zI1eEEED1Ev(%"class.aa::y.0"*) unnamed_addr #4 + +; Function Attrs: nounwind optsize +declare void @_ZN2aa1yIP1jNS_2ac1zI1eEEED1Ev(%"class.aa::y"*) unnamed_addr #4 + +; Function Attrs: nounwind readnone speculatable willreturn +declare void @llvm.dbg.value(metadata, metadata, metadata) #1 + +attributes #0 = { optsize ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone speculatable willreturn } +attributes #2 = { argmemonly nounwind willreturn } +attributes #3 = { optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" 
"stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #4 = { nounwind optsize "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #5 = { optsize } +attributes #6 = { nounwind } +attributes #7 = { nounwind optsize } + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!45, !46, !47, !48} +!llvm.ident = !{!49} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "o", scope: !2, file: !6, line: 11, type: !40, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0 (git@github.com:llvm/llvm-project.git 0fecdcd1628999a1900d9cf84cd33dacf1319fa6)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !10, globals: !41, nameTableKind: None, sysroot: "/") +!3 = !DIFile(filename: "/Users/vsk/tmp/x.cc", directory: "/Users/vsk/src/llvm-backup-master") +!4 = !{!5} +!5 = !DICompositeType(tag: DW_TAG_enumeration_type, file: !6, line: 16, baseType: !7, size: 32, elements: !8) +!6 = !DIFile(filename: "tmp/x.cc", directory: "/Users/vsk") +!7 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) +!8 = !{!9} +!9 = !DIEnumerator(name: "u", value: 0, isUnsigned: true) +!10 = !{!11, !24, !31} +!11 = !DISubprogram(name: "m", linkageName: "_Z1mPvS_lPFvS_PKvlE", scope: !6, file: !6, line: 10, type: !12, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !23) +!12 = !DISubroutineType(types: !13) +!13 = !{!14, !16, !16, !17, !18} +!14 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !15, size: 64) +!15 = !DICompositeType(tag: DW_TAG_structure_type, name: "j", file: !6, line: 8, flags: DIFlagFwdDecl, identifier: "_ZTS1j") +!16 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!17 = !DIBasicType(name: "long int", size: 64, encoding: DW_ATE_signed) +!18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64) +!19 = !DISubroutineType(types: !20) +!20 = !{null, !16, !21, !17} +!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !22, size: 64) +!22 = !DIDerivedType(tag: DW_TAG_const_type, baseType: null) +!23 = !{} +!24 = !DISubprogram(name: "q", linkageName: "_Z1qP1e", scope: !6, file: !6, line: 13, type: !25, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !23) +!25 = !DISubroutineType(types: !26) +!26 = !{!27, !29} +!27 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !28, size: 64) +!28 = !DICompositeType(tag: DW_TAG_structure_type, name: "h", file: !6, line: 7, flags: DIFlagFwdDecl, identifier: "_ZTS1h") +!29 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !30, size: 64) +!30 = !DICompositeType(tag: DW_TAG_structure_type, name: "e", file: !6, line: 5, flags: DIFlagFwdDecl, identifier: "_ZTS1e") +!31 = !DISubprogram(name: "v", linkageName: "_Z1vlllllP1hiP1jPdb1n", scope: !6, file: !6, line: 17, type: !32, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !23) +!32 = !DISubroutineType(types: 
!33) +!33 = !{!34, !17, !17, !17, !17, !17, !27, !36, !14, !37, !39, !40} +!34 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !35, size: 64) +!35 = !DICompositeType(tag: DW_TAG_structure_type, name: "r", file: !6, line: 14, flags: DIFlagFwdDecl, identifier: "_ZTS1r") +!36 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!37 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !38, size: 64) +!38 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float) +!39 = !DIBasicType(name: "bool", size: 8, encoding: DW_ATE_boolean) +!40 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "n", file: !6, line: 11, size: 32, flags: DIFlagFwdDecl, identifier: "_ZTS1n") +!41 = !{!0, !42} +!42 = !DIGlobalVariableExpression(var: !43, expr: !DIExpression()) +!43 = distinct !DIGlobalVariable(name: "p", scope: !2, file: !6, line: 12, type: !44, isLocal: false, isDefinition: true) +!44 = !DIDerivedType(tag: DW_TAG_typedef, name: "f", file: !6, line: 5, baseType: !29) +!45 = !{i32 7, !"Dwarf Version", i32 4} +!46 = !{i32 2, !"Debug Info Version", i32 3} +!47 = !{i32 1, !"wchar_size", i32 4} +!48 = !{i32 7, !"PIC Level", i32 2} +!49 = !{!"clang version 11.0.0 (git@github.com:llvm/llvm-project.git 0fecdcd1628999a1900d9cf84cd33dacf1319fa6)"} +!50 = distinct !DISubprogram(name: "ar", linkageName: "_ZN2aa2aq2arEv", scope: !51, file: !6, line: 48, type: !67, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, declaration: !66, retainedNodes: !70) +!51 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "aq", scope: !52, file: !6, line: 45, size: 8, flags: DIFlagTypePassByValue, elements: !53, identifier: "_ZTSN2aa2aqE") +!52 = !DINamespace(name: "aa", scope: null) +!53 = !{!54, !66} +!54 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !51, baseType: !55, extraData: i32 0) +!55 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "ah", scope: !52, file: !6, line: 34, size: 8, flags: DIFlagTypePassByValue, elements: !56, identifier: "_ZTSN2aa2ahE") +!56 = !{!57} +!57 = !DISubprogram(name: "ai", linkageName: "_ZN2aa2ah2aiEiib", scope: !55, file: !6, line: 36, type: !58, scopeLine: 36, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagOptimized) +!58 = !DISubroutineType(types: !59) +!59 = !{!60, !64, !65, !65, !39} +!60 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "af", scope: !52, file: !6, line: 30, size: 8, flags: DIFlagTypePassByValue, elements: !23, templateParams: !61, identifier: "_ZTSN2aa2afI1wEE") +!61 = !{!62} +!62 = !DITemplateTypeParameter(type: !63) +!63 = !DICompositeType(tag: DW_TAG_class_type, name: "w", file: !6, line: 18, flags: DIFlagFwdDecl, identifier: "_ZTS1w") +!64 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !55, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) +!65 = !DIDerivedType(tag: DW_TAG_typedef, name: "b", file: !6, line: 2, baseType: !36) +!66 = !DISubprogram(name: "ar", linkageName: "_ZN2aa2aq2arEv", scope: !51, file: !6, line: 46, type: !67, scopeLine: 46, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized) +!67 = !DISubroutineType(types: !68) +!68 = !{null, !69} +!69 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !51, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) +!70 = !{!71, !73, !74} +!71 = !DILocalVariable(name: "this", arg: 1, scope: !50, type: !72, flags: DIFlagArtificial | DIFlagObjectPointer) +!72 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !51, size: 64) +!73 = !DILocalVariable(name: "aj", scope: 
!50, file: !6, line: 49, type: !65) +!74 = !DILocalVariable(name: "am", scope: !50, file: !6, line: 50, type: !65) +!75 = !DILocation(line: 0, scope: !50) +!76 = !DILocation(line: 51, column: 3, scope: !50) +!77 = !DILocation(line: 52, column: 1, scope: !50) +!78 = distinct !DISubprogram(name: "ai", linkageName: "_ZN2aa2ah2aiEiib", scope: !55, file: !6, line: 36, type: !58, scopeLine: 36, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, declaration: !57, retainedNodes: !79) +!79 = !{!80, !82, !83, !84, !85, !86, !87, !91, !110} +!80 = !DILocalVariable(name: "this", arg: 1, scope: !78, type: !81, flags: DIFlagArtificial | DIFlagObjectPointer) +!81 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !55, size: 64) +!82 = !DILocalVariable(name: "aj", arg: 2, scope: !78, file: !6, line: 36, type: !65) +!83 = !DILocalVariable(arg: 3, scope: !78, file: !6, line: 36, type: !65) +!84 = !DILocalVariable(arg: 4, scope: !78, file: !6, line: 36, type: !39) +!85 = !DILocalVariable(name: "ak", scope: !78, file: !6, line: 37, type: !65) +!86 = !DILocalVariable(name: "al", scope: !78, file: !6, line: 38, type: !65) +!87 = !DILocalVariable(name: "an", scope: !78, file: !6, line: 39, type: !88) +!88 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !89, size: 64) +!89 = !DIDerivedType(tag: DW_TAG_typedef, name: "c", file: !6, line: 3, baseType: !90) +!90 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) +!91 = !DILocalVariable(name: "ao", scope: !78, file: !6, line: 40, type: !92) +!92 = !DIDerivedType(tag: DW_TAG_typedef, name: "ae", scope: !52, file: !6, line: 29, baseType: !93) +!93 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "y >", scope: !52, file: !6, line: 20, size: 8, flags: DIFlagTypePassByReference | DIFlagNonTrivial, elements: !94, templateParams: !105, identifier: "_ZTSN2aa1yIP1jNS_2ac1zI1eEEEE") +!94 = !{!95, !99, !102} +!95 = !DISubprogram(name: "y", scope: !93, file: !6, line: 22, type: !96, scopeLine: 22, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagOptimized) +!96 = !DISubroutineType(types: !97) +!97 = !{null, !98, !14} +!98 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !93, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) +!99 = !DISubprogram(name: "~y", scope: !93, file: !6, line: 23, type: !100, scopeLine: 23, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagOptimized) +!100 = !DISubroutineType(types: !101) +!101 = !{null, !98} +!102 = !DISubprogram(name: "ab", linkageName: "_ZN2aa1yIP1jNS_2ac1zI1eEEE2abEv", scope: !93, file: !6, line: 24, type: !103, scopeLine: 24, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagOptimized) +!103 = !DISubroutineType(types: !104) +!104 = !{!14, !98} +!105 = !{!106, !107} +!106 = !DITemplateTypeParameter(name: "x", type: !14) +!107 = !DITemplateTypeParameter(type: !108) +!108 = !DICompositeType(tag: DW_TAG_structure_type, name: "z", scope: !109, file: !6, line: 27, flags: DIFlagFwdDecl, identifier: "_ZTSN2aa2ac1zI1eEE") +!109 = !DINamespace(name: "ac", scope: !52) +!110 = !DILocalVariable(name: "ap", scope: !78, file: !6, line: 41, type: !111) +!111 = !DIDerivedType(tag: DW_TAG_typedef, name: "ae", scope: !52, file: !6, line: 29, baseType: !112) +!112 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "y >", scope: !52, file: !6, line: 20, size: 8, flags: DIFlagTypePassByReference | DIFlagNonTrivial, elements: !113, templateParams: !124, identifier: "_ZTSN2aa1yIP1hNS_2ac1zI1eEEEE") +!113 = !{!114, !118, !121} 
+!114 = !DISubprogram(name: "y", scope: !112, file: !6, line: 22, type: !115, scopeLine: 22, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagOptimized) +!115 = !DISubroutineType(types: !116) +!116 = !{null, !117, !27} +!117 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !112, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer) +!118 = !DISubprogram(name: "~y", scope: !112, file: !6, line: 23, type: !119, scopeLine: 23, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagOptimized) +!119 = !DISubroutineType(types: !120) +!120 = !{null, !117} +!121 = !DISubprogram(name: "ab", linkageName: "_ZN2aa1yIP1hNS_2ac1zI1eEEE2abEv", scope: !112, file: !6, line: 24, type: !122, scopeLine: 24, flags: DIFlagPublic | DIFlagPrototyped, spFlags: DISPFlagOptimized) +!122 = !DISubroutineType(types: !123) +!123 = !{!27, !117} +!124 = !{!125, !107} +!125 = !DITemplateTypeParameter(name: "x", type: !27) +!126 = !DILocation(line: 0, scope: !78) +!127 = !DILocation(line: 40, column: 5, scope: !78) +!128 = !DILocation(line: 40, column: 11, scope: !78) +!129 = !DILocation(line: 40, column: 14, scope: !78) +!130 = !DILocation(line: 41, column: 5, scope: !78) +!131 = !DILocation(line: 41, column: 11, scope: !78) +!132 = !DILocation(line: 41, column: 16, scope: !78) +!133 = !{!134, !134, i64 0} +!134 = !{!"any pointer", !135, i64 0} +!135 = !{!"omnipotent char", !136, i64 0} +!136 = !{!"Simple C++ TBAA"} +!137 = !DILocation(line: 41, column: 14, scope: !78) +!138 = !DILocation(line: 42, column: 7, scope: !78) +!139 = !DILocation(line: 42, column: 23, scope: !78) +!140 = !DILocation(line: 42, column: 21, scope: !78) +!141 = !DILocation(line: 42, column: 32, scope: !78) +!142 = !DILocation(line: 42, column: 44, scope: !78) +!143 = !DILocation(line: 42, column: 70, scope: !78) +!144 = !{!145, !145, i64 0} +!145 = !{!"_ZTS1n", !135, i64 0} +!146 = !DILocation(line: 42, column: 5, scope: !78) +!147 = !DILocation(line: 43, column: 3, scope: !78) diff --git a/llvm/test/DebugInfo/X86/rematerialize.ll b/llvm/test/DebugInfo/X86/rematerialize.ll index 4b646be2481e4..f3e7e0a2086ac 100644 --- a/llvm/test/DebugInfo/X86/rematerialize.ll +++ b/llvm/test/DebugInfo/X86/rematerialize.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -O2 -filetype=obj -mtriple=x86_64-unknown-linux-gnu < %s \ ; RUN: | llvm-dwarfdump -debug-line - | FileCheck %s ; diff --git a/llvm/test/DebugInfo/X86/string-offsets-multiple-cus.ll b/llvm/test/DebugInfo/X86/string-offsets-multiple-cus.ll index e1042a95ddeee..4e8dfc2ada747 100644 --- a/llvm/test/DebugInfo/X86/string-offsets-multiple-cus.ll +++ b/llvm/test/DebugInfo/X86/string-offsets-multiple-cus.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -filetype=obj < %s | llvm-dwarfdump -v - | \ ; RUN: FileCheck --check-prefix=DEFAULT --check-prefix=BOTH %s ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -filetype=obj -generate-type-units < %s | \ diff --git a/llvm/test/DebugInfo/X86/string-offsets-table-order.ll b/llvm/test/DebugInfo/X86/string-offsets-table-order.ll index ffa8550be5409..ca159eea615f6 100644 --- a/llvm/test/DebugInfo/X86/string-offsets-table-order.ll +++ b/llvm/test/DebugInfo/X86/string-offsets-table-order.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -split-dwarf-file=foo.dwo -filetype=obj < %s \ ; RUN: | llvm-dwarfdump -v - | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/string-offsets-table.ll b/llvm/test/DebugInfo/X86/string-offsets-table.ll index 21016bd286b61..e1c914a1946b8 100644 --- 
a/llvm/test/DebugInfo/X86/string-offsets-table.ll +++ b/llvm/test/DebugInfo/X86/string-offsets-table.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -filetype=obj < %s | llvm-dwarfdump -v - \ ; RUN: | FileCheck --check-prefix=MONOLITHIC %s ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -split-dwarf-file=foo.dwo -filetype=obj < %s \ diff --git a/llvm/test/DebugInfo/X86/template.ll b/llvm/test/DebugInfo/X86/template.ll index 769e2541cd5ff..72566154d2258 100644 --- a/llvm/test/DebugInfo/X86/template.ll +++ b/llvm/test/DebugInfo/X86/template.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj < %s | llvm-dwarfdump -v -debug-info - | FileCheck %s ; RUN: llc -mtriple=x86_64-linux -O0 -filetype=obj < %s | not llvm-dwarfdump -verify - | FileCheck %s --check-prefix VERIFY diff --git a/llvm/test/DebugInfo/X86/tu-to-non-named-type.ll b/llvm/test/DebugInfo/X86/tu-to-non-named-type.ll index 19d9976449bc4..883bab142ec7e 100644 --- a/llvm/test/DebugInfo/X86/tu-to-non-named-type.ll +++ b/llvm/test/DebugInfo/X86/tu-to-non-named-type.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -filetype=obj -O0 -generate-type-units -mtriple=x86_64-unknown-linux-gnu < %s \ ; RUN: | llvm-dwarfdump -debug-info -debug-types - | FileCheck %s diff --git a/llvm/test/DebugInfo/X86/type_units_with_addresses.ll b/llvm/test/DebugInfo/X86/type_units_with_addresses.ll index de563ee2a395b..0f33ee2209f64 100644 --- a/llvm/test/DebugInfo/X86/type_units_with_addresses.ll +++ b/llvm/test/DebugInfo/X86/type_units_with_addresses.ll @@ -1,4 +1,3 @@ -; REQUIRES: x86 ; RUN: llc -split-dwarf-file=foo.dwo -filetype=obj -O0 -generate-type-units -mtriple=x86_64-unknown-linux-gnu < %s \ ; RUN: | llvm-dwarfdump -v - | FileCheck %s diff --git a/llvm/test/ExecutionEngine/OrcMCJIT/test-global-ctors.ll b/llvm/test/ExecutionEngine/OrcMCJIT/test-global-ctors.ll index d66efc27fb419..c42f193f4bad9 100644 --- a/llvm/test/ExecutionEngine/OrcMCJIT/test-global-ctors.ll +++ b/llvm/test/ExecutionEngine/OrcMCJIT/test-global-ctors.ll @@ -1,8 +1,8 @@ ; RUN: %lli -jit-kind=orc-mcjit %s > /dev/null ; XFAIL: darwin @var = global i32 1, align 4 -@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @ctor_func }] -@llvm.global_dtors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @dtor_func }] +@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @ctor_func, i8* null }] +@llvm.global_dtors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @dtor_func, i8* null }] define i32 @main() nounwind { entry: diff --git a/llvm/test/MC/ARM/arm-memory-instructions.s b/llvm/test/MC/ARM/arm-memory-instructions.s index 416069ac5d2c3..259afb2f5fd6c 100644 --- a/llvm/test/MC/ARM/arm-memory-instructions.s +++ b/llvm/test/MC/ARM/arm-memory-instructions.s @@ -194,11 +194,13 @@ Lbaz: .quad 0 @------------------------------------------------------------------------------ ldrht r9, [r7], #128 ldrht r4, [r3], #-75 + ldrht r4, [r3] ldrht r9, [r7], r2 ldrht r4, [r3], -r2 @ CHECK: ldrht r9, [r7], #128 @ encoding: [0xb0,0x98,0xf7,0xe0] @ CHECK: ldrht r4, [r3], #-75 @ encoding: [0xbb,0x44,0x73,0xe0] +@ CHECK: ldrht r4, [r3], #0 @ encoding: [0xb0,0x40,0xf3,0xe0] @ CHECK: ldrht r9, [r7], r2 @ encoding: [0xb2,0x90,0xb7,0xe0] @ CHECK: ldrht r4, [r3], -r2 @ encoding: [0xb2,0x40,0x33,0xe0] @@ -244,11 +246,13 @@ Lbaz: .quad 0 
@------------------------------------------------------------------------------ ldrsbt r5, [r6], #1 ldrsbt r3, [r8], #-12 + ldrsbt r5, [r6] ldrsbt r8, [r9], r5 ldrsbt r2, [r1], -r4 @ CHECK: ldrsbt r5, [r6], #1 @ encoding: [0xd1,0x50,0xf6,0xe0] @ CHECK: ldrsbt r3, [r8], #-12 @ encoding: [0xdc,0x30,0x78,0xe0] +@ CHECK: ldrsbt r5, [r6], #0 @ encoding: [0xd0,0x50,0xf6,0xe0] @ CHECK: ldrsbt r8, [r9], r5 @ encoding: [0xd5,0x80,0xb9,0xe0] @ CHECK: ldrsbt r2, [r1], -r4 @ encoding: [0xd4,0x20,0x31,0xe0] @@ -293,11 +297,13 @@ Lbaz: .quad 0 @------------------------------------------------------------------------------ ldrsht r5, [r6], #1 ldrsht r3, [r8], #-12 + ldrsht r5, [r6] ldrsht r8, [r9], r5 ldrsht r2, [r1], -r4 @ CHECK: ldrsht r5, [r6], #1 @ encoding: [0xf1,0x50,0xf6,0xe0] @ CHECK: ldrsht r3, [r8], #-12 @ encoding: [0xfc,0x30,0x78,0xe0] +@ CHECK: ldrsht r5, [r6], #0 @ encoding: [0xf0,0x50,0xf6,0xe0] @ CHECK: ldrsht r8, [r9], r5 @ encoding: [0xf5,0x80,0xb9,0xe0] @ CHECK: ldrsht r2, [r1], -r4 @ encoding: [0xf4,0x20,0x31,0xe0] diff --git a/llvm/test/MC/Mips/cpadd-bad.s b/llvm/test/MC/Mips/cpadd-bad.s new file mode 100644 index 0000000000000..ce0b480605637 --- /dev/null +++ b/llvm/test/MC/Mips/cpadd-bad.s @@ -0,0 +1,13 @@ +# RUN: not llvm-mc -triple=mips-unknown-linux-gnu %s 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple=mips64-unknown-linux-gnuabin32 %s 2>&1 | FileCheck %s +# RUN: not llvm-mc -triple=mips64-unknown-linux-gnu %s 2>&1 | FileCheck %s + + .text + .cpadd $32 +# CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: invalid register + .cpadd $foo +# CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected register + .cpadd bar +# CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: expected register + .cpadd $25 foobar +# CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: unexpected token, expected end of statement diff --git a/llvm/test/MC/Mips/cpadd.s b/llvm/test/MC/Mips/cpadd.s new file mode 100644 index 0000000000000..9b87897e91f2c --- /dev/null +++ b/llvm/test/MC/Mips/cpadd.s @@ -0,0 +1,29 @@ +# RUN: llvm-mc -triple=mips-unknown-linux-gnu -position-independent %s \ +# RUN: | FileCheck -check-prefix=ASM %s +# RUN: llvm-mc -triple=mips64-unknown-linux-gnu -position-independent %s \ +# RUN: | FileCheck -check-prefix=ASM %s +# RUN: llvm-mc -triple=mips-unknown-linux-gnu %s \ +# RUN: | FileCheck -check-prefix=ASM %s + +# RUN: llvm-mc -triple=mips-unknown-linux-gnu \ +# RUN: -position-independent -filetype=obj -o - %s \ +# RUN: | llvm-objdump -d -r - | FileCheck -check-prefix=OBJ32-PIC %s +# RUN: llvm-mc -triple=mips64-unknown-linux-gnu \ +# RUN: -position-independent -filetype=obj -o - %s \ +# RUN: | llvm-objdump -d -r - | FileCheck -check-prefix=OBJ64-PIC %s + +# RUN: llvm-mc -triple=mips-unknown-linux-gnu \ +# RUN: -filetype=obj -o - %s \ +# RUN: | llvm-objdump -d -r - | FileCheck -check-prefix=OBJ32-NPIC %s +# RUN: llvm-mc -triple=mips64-unknown-linux-gnu \ +# RUN: -filetype=obj -o - %s \ +# RUN: | llvm-objdump -d -r - | FileCheck -check-prefix=OBJ64-NPIC %s + +# ASM: .cpadd $4 +# OBJ32-PIC: addu $4, $4, $gp +# OBJ64-PIC: daddu $4, $4, $gp +# OBJ32-NPIC-NOT: addu +# OBJ64-NPIC-NOT: daddu + + .text + .cpadd $4 diff --git a/llvm/test/MC/Mips/macro-sle.s b/llvm/test/MC/Mips/macro-sle.s new file mode 100644 index 0000000000000..6d93ce5b820b2 --- /dev/null +++ b/llvm/test/MC/Mips/macro-sle.s @@ -0,0 +1,31 @@ +# RUN: llvm-mc -arch=mips -show-encoding -mcpu=mips1 < %s | FileCheck %s +# RUN: llvm-mc -arch=mips -show-encoding -mcpu=mips64 < %s | FileCheck %s + +sle $4, $5 +# CHECK: slt $4, $5, $4 # encoding: [0x00,0xa4,0x20,0x2a] +# CHECK: 
xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sle $4, $5, $6 +# CHECK: slt $4, $6, $5 # encoding: [0x00,0xc5,0x20,0x2a] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sle $4, $5, 16 +# CHECK: addiu $4, $zero, 16 # encoding: [0x24,0x04,0x00,0x10] +# CHECK: slt $4, $4, $5 # encoding: [0x00,0x85,0x20,0x2a] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sleu $4, $5 +# CHECK: sltu $4, $5, $4 # encoding: [0x00,0xa4,0x20,0x2b] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sleu $4, $5, $6 +# CHECK: sltu $4, $6, $5 # encoding: [0x00,0xc5,0x20,0x2b] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sleu $4, $5, 16 +# CHECK: addiu $4, $zero, 16 # encoding: [0x24,0x04,0x00,0x10] +# CHECK: sltu $4, $4, $5 # encoding: [0x00,0x85,0x20,0x2b] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sle $4, 16 +# CHECK: addiu $1, $zero, 16 # encoding: [0x24,0x01,0x00,0x10] +# CHECK: slt $4, $1, $4 # encoding: [0x00,0x24,0x20,0x2a] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sleu $4, 16 +# CHECK: addiu $1, $zero, 16 # encoding: [0x24,0x01,0x00,0x10] +# CHECK: sltu $4, $1, $4 # encoding: [0x00,0x24,0x20,0x2b] +# CHECK: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] diff --git a/llvm/test/MC/Mips/macro-sle64.s b/llvm/test/MC/Mips/macro-sle64.s new file mode 100644 index 0000000000000..62ad7c81f9dda --- /dev/null +++ b/llvm/test/MC/Mips/macro-sle64.s @@ -0,0 +1,29 @@ +# RUN: not llvm-mc -arch=mips -mcpu=mips1 < %s 2>&1 \ +# RUN: | FileCheck --check-prefix=MIPS32 %s +# RUN: llvm-mc -arch=mips -show-encoding -mcpu=mips64 < %s \ +# RUN: | FileCheck --check-prefix=MIPS64 %s + +sle $4, $5, 0x100000000 +# MIPS32: :[[@LINE-1]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled +# MIPS64: ori $4, $zero, 32768 # encoding: [0x34,0x04,0x80,0x00] +# MIPS64: dsll $4, $4, 17 # encoding: [0x00,0x04,0x24,0x78] +# MIPS64: slt $4, $4, $5 # encoding: [0x00,0x85,0x20,0x2a] +# MIPS64: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sleu $4, $5, 0x100000000 +# MIPS32: :[[@LINE-1]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled +# MIPS64: ori $4, $zero, 32768 # encoding: [0x34,0x04,0x80,0x00] +# MIPS64: dsll $4, $4, 17 # encoding: [0x00,0x04,0x24,0x78] +# MIPS64: sltu $4, $4, $5 # encoding: [0x00,0x85,0x20,0x2b] +# MIPS64: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sle $4, 0x100000000 +# MIPS32: :[[@LINE-1]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled +# MIPS64: ori $1, $zero, 32768 # encoding: [0x34,0x01,0x80,0x00] +# MIPS64: dsll $1, $1, 17 # encoding: [0x00,0x01,0x0c,0x78] +# MIPS64: slt $4, $1, $4 # encoding: [0x00,0x24,0x20,0x2a] +# MIPS64: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] +sleu $4, 0x100000000 +# MIPS32: :[[@LINE-1]]:{{[0-9]+}}: error: instruction requires a CPU feature not currently enabled +# MIPS64: ori $1, $zero, 32768 # encoding: [0x34,0x01,0x80,0x00] +# MIPS64: dsll $1, $1, 17 # encoding: [0x00,0x01,0x0c,0x78] +# MIPS64: sltu $4, $1, $4 # encoding: [0x00,0x24,0x20,0x2b] +# MIPS64: xori $4, $4, 1 # encoding: [0x38,0x84,0x00,0x01] diff --git a/llvm/test/MC/Mips/macro-sne.s b/llvm/test/MC/Mips/macro-sne.s new file mode 100644 index 0000000000000..497e1d604c41a --- /dev/null +++ b/llvm/test/MC/Mips/macro-sne.s @@ -0,0 +1,27 @@ +# RUN: llvm-mc -arch=mips -show-encoding -mcpu=mips1 < %s \ +# RUN: | FileCheck --check-prefixes=ALL,MIPS32 %s +# RUN: llvm-mc -arch=mips -show-encoding -mcpu=mips64 < %s \ +# RUN: | FileCheck 
--check-prefixes=ALL,MIPS64 %s + +sne $4, $5, $6 +# ALL: xor $4, $5, $6 # encoding: [0x00,0xa6,0x20,0x26] +# ALL: sltu $4, $zero, $4 # encoding: [0x00,0x04,0x20,0x2b] +sne $4, $zero, $6 +# ALL: sltu $4, $zero, $6 # encoding: [0x00,0x06,0x20,0x2b] +sne $4, $5, $zero +# ALL: sltu $4, $zero, $5 # encoding: [0x00,0x05,0x20,0x2b] +sne $4, $5, 0 +# ALL: sltu $4, $zero, $5 # encoding: [0x00,0x05,0x20,0x2b] +sne $4, $zero, 1 +# ALL: addiu $4, $zero, 1 # encoding: [0x24,0x04,0x00,0x01] +sne $4, $5, -1 +# MIPS32: addiu $4, $5, 1 # encoding: [0x24,0xa4,0x00,0x01] +# MIPS64: daddiu $4, $5, 1 # encoding: [0x64,0xa4,0x00,0x01] +# ALL: sltu $4, $zero, $4 # encoding: [0x00,0x04,0x20,0x2b] +sne $4, $5, 1 +# ALL: xori $4, $5, 1 # encoding: [0x38,0xa4,0x00,0x01] +# ALL: sltu $4, $zero, $4 # encoding: [0x00,0x04,0x20,0x2b] +sne $4, $5, 0x10000 +# ALL: lui $1, 1 # encoding: [0x3c,0x01,0x00,0x01] +# ALL: xor $4, $5, $1 # encoding: [0x00,0xa1,0x20,0x26] +# ALL: sltu $4, $zero, $4 # encoding: [0x00,0x04,0x20,0x2b] diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s index e40de53354704..023660ea4dca1 100644 --- a/llvm/test/MC/WebAssembly/simd-encodings.s +++ b/llvm/test/MC/WebAssembly/simd-encodings.s @@ -580,4 +580,22 @@ main: # CHECK: i32x4.dot_i16x8_s # encoding: [0xfd,0xdb,0x01] i32x4.dot_i16x8_s + # CHECK: i8x16.abs # encoding: [0xfd,0xe1,0x01] + i8x16.abs + + # CHECK: i16x8.abs # encoding: [0xfd,0xe2,0x01] + i16x8.abs + + # CHECK: i32x4.abs # encoding: [0xfd,0xe3,0x01] + i32x4.abs + + # CHECK: i8x16.bitmask # encoding: [0xfd,0xe4,0x01] + i8x16.bitmask + + # CHECK: i16x8.bitmask # encoding: [0xfd,0xe5,0x01] + i16x8.bitmask + + # CHECK: i32x4.bitmask # encoding: [0xfd,0xe6,0x01] + i32x4.bitmask + end_function diff --git a/llvm/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml b/llvm/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml index 40eec94971306..17b638da0eb8c 100644 --- a/llvm/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml +++ b/llvm/test/Object/AArch64/yaml2obj-elf-aarch64-rel.yaml @@ -5,7 +5,6 @@ # CHECK-NEXT: Type: SHT_RELA # CHECK-NEXT: Link: .symtab # CHECK-NEXT: AddressAlign: 0x0000000000000008 -# CHECK-NEXT: EntSize: 0x0000000000000018 # CHECK-NEXT: Info: .text # CHECK-NEXT: Relocations: # CHECK-NEXT: - Symbol: main diff --git a/llvm/test/Object/obj2yaml.test b/llvm/test/Object/obj2yaml.test index 748e713d1a95a..a5f008ffd238a 100644 --- a/llvm/test/Object/obj2yaml.test +++ b/llvm/test/Object/obj2yaml.test @@ -362,7 +362,6 @@ # ELF-MIPSEL-NEXT: Type: SHT_REL # ELF-MIPSEL-NEXT: Link: .symtab # ELF-MIPSEL-NEXT: AddressAlign: 0x0000000000000004 -# ELF-MIPSEL-NEXT: EntSize: 0x0000000000000008 # ELF-MIPSEL-NEXT: Info: .text # ELF-MIPSEL-NEXT: Relocations: # ELF-MIPSEL-NEXT: - Symbol: _gp_disp @@ -483,7 +482,6 @@ # ELF-MIPS64EL-NEXT: Type: SHT_RELA # ELF-MIPS64EL-NEXT: Link: .symtab # ELF-MIPS64EL-NEXT: AddressAlign: 0x0000000000000008 -# ELF-MIPS64EL-NEXT: EntSize: 0x0000000000000018 # ELF-MIPS64EL-NEXT: Info: .data # ELF-MIPS64EL-NEXT: Relocations: # ELF-MIPS64EL-NEXT: - Symbol: zed @@ -552,7 +550,6 @@ # ELF-X86-64-NEXT: Address: 0x0000000000000038 # ELF-X86-64-NEXT: Link: .symtab # ELF-X86-64-NEXT: AddressAlign: 0x0000000000000008 -# ELF-X86-64-NEXT: EntSize: 0x0000000000000018 # ELF-X86-64-NEXT: Info: .text # ELF-X86-64-NEXT: Relocations: # ELF-X86-64-NEXT: - Offset: 0x000000000000000D diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/musttail-call.ll b/llvm/test/Transforms/Attributor/IPConstantProp/musttail-call.ll index 
d5755da165557..d14c06760cb70 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/musttail-call.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/musttail-call.ll @@ -17,8 +17,7 @@ define i8* @start(i8 %v) { ; CHECK-NEXT: [[C2:%.*]] = icmp eq i8 [[V]], 1 ; CHECK-NEXT: br i1 [[C2]], label [[C2_TRUE:%.*]], label [[C2_FALSE:%.*]] ; CHECK: c2_true: -; CHECK-NEXT: [[CA1:%.*]] = musttail call i8* @no_side_effects(i8 undef) -; CHECK-NEXT: ret i8* [[CA1]] +; CHECK-NEXT: ret i8* null ; CHECK: c2_false: ; CHECK-NEXT: [[CA2:%.*]] = musttail call i8* @dont_zap_me(i8 undef) ; CHECK-NEXT: ret i8* [[CA2]] @@ -61,10 +60,6 @@ define internal i8* @side_effects(i8 %v) { } define internal i8* @no_side_effects(i8 %v) readonly nounwind { -; CHECK-LABEL: define {{[^@]+}}@no_side_effects -; CHECK-SAME: (i8 [[V:%.*]]) -; CHECK-NEXT: ret i8* undef -; ret i8* null } diff --git a/llvm/test/Transforms/Attributor/dereferenceable-1.ll b/llvm/test/Transforms/Attributor/dereferenceable-1.ll index 87fb7867eabd8..459e57e5c2ef9 100644 --- a/llvm/test/Transforms/Attributor/dereferenceable-1.ll +++ b/llvm/test/Transforms/Attributor/dereferenceable-1.ll @@ -455,5 +455,67 @@ if.end8: ; preds = %if.then5, %if.else6 ret void } +declare void @unknown() +define void @nonnull_assume_pos(i8* %arg1, i8* %arg2, i8* %arg3, i8* %arg4) { +; ATTRIBUTOR-LABEL: define {{[^@]+}}@nonnull_assume_pos +; ATTRIBUTOR-SAME: (i8* nocapture nofree nonnull readnone dereferenceable(101) [[ARG1:%.*]], i8* nocapture nofree readnone dereferenceable_or_null(31) [[ARG2:%.*]], i8* nocapture nofree nonnull readnone [[ARG3:%.*]], i8* nocapture nofree readnone dereferenceable_or_null(42) [[ARG4:%.*]]) +; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 true) #6 [ "nonnull"(i8* undef), "dereferenceable"(i8* undef, i64 1), "dereferenceable"(i8* undef, i64 2), "dereferenceable"(i8* undef, i64 101), "dereferenceable_or_null"(i8* undef, i64 31), "dereferenceable_or_null"(i8* undef, i64 42) ] +; ATTRIBUTOR-NEXT: call void @unknown() +; ATTRIBUTOR-NEXT: ret void +; + call void @llvm.assume(i1 true) [ "nonnull"(i8* %arg3), "dereferenceable"(i8* %arg1, i64 1), "dereferenceable"(i8* %arg1, i64 2), "dereferenceable"(i8* %arg1, i64 101), "dereferenceable_or_null"(i8* %arg2, i64 31), "dereferenceable_or_null"(i8* %arg4, i64 42)] + call void @unknown() + ret void +} +define void @nonnull_assume_neg(i8* %arg1, i8* %arg2, i8* %arg3) { +; ATTRIBUTOR-LABEL: define {{[^@]+}}@nonnull_assume_neg +; ATTRIBUTOR-SAME: (i8* nocapture nofree readnone [[ARG1:%.*]], i8* nocapture nofree readnone [[ARG2:%.*]], i8* nocapture nofree readnone [[ARG3:%.*]]) +; ATTRIBUTOR-NEXT: call void @unknown() +; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(i8* undef, i64 101), "dereferenceable"(i8* undef, i64 -2), "dereferenceable_or_null"(i8* undef, i64 31) ] +; ATTRIBUTOR-NEXT: ret void +; + call void @unknown() + call void @llvm.assume(i1 true) ["dereferenceable"(i8* %arg1, i64 101), "dereferenceable"(i8* %arg2, i64 -2), "dereferenceable_or_null"(i8* %arg3, i64 31)] + ret void +} +define void @nonnull_assume_call(i8* %arg1, i8* %arg2, i8* %arg3, i8* %arg4) { +; ATTRIBUTOR-LABEL: define {{[^@]+}}@nonnull_assume_call +; ATTRIBUTOR-SAME: (i8* [[ARG1:%.*]], i8* [[ARG2:%.*]], i8* [[ARG3:%.*]], i8* [[ARG4:%.*]]) +; ATTRIBUTOR-NEXT: call void @unknown() +; ATTRIBUTOR-NEXT: [[P:%.*]] = call nonnull dereferenceable(101) i32* @unkown_ptr() +; ATTRIBUTOR-NEXT: call void @unknown_use32(i32* nonnull dereferenceable(101) [[P]]) +; ATTRIBUTOR-NEXT: call void @unknown_use8(i8* nonnull 
dereferenceable(42) [[ARG4]]) +; ATTRIBUTOR-NEXT: call void @unknown_use8(i8* nonnull [[ARG3]]) +; ATTRIBUTOR-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(31) [[ARG2]]) +; ATTRIBUTOR-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(2) [[ARG1]]) +; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(i8* [[ARG3]]), "dereferenceable"(i8* [[ARG1]], i64 1), "dereferenceable"(i8* [[ARG1]], i64 2), "dereferenceable"(i32* [[P]], i64 101), "dereferenceable_or_null"(i8* [[ARG2]], i64 31), "dereferenceable_or_null"(i8* [[ARG4]], i64 42) ] +; ATTRIBUTOR-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(2) [[ARG1]]) +; ATTRIBUTOR-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(31) [[ARG2]]) +; ATTRIBUTOR-NEXT: call void @unknown_use8(i8* nonnull [[ARG3]]) +; ATTRIBUTOR-NEXT: call void @unknown_use8(i8* nonnull dereferenceable(42) [[ARG4]]) +; ATTRIBUTOR-NEXT: call void @unknown_use32(i32* nonnull dereferenceable(101) [[P]]) +; ATTRIBUTOR-NEXT: call void @unknown() +; ATTRIBUTOR-NEXT: ret void +; + call void @unknown() + %p = call i32* @unkown_ptr() + call void @unknown_use32(i32* %p) + call void @unknown_use8(i8* %arg4) + call void @unknown_use8(i8* %arg3) + call void @unknown_use8(i8* %arg2) + call void @unknown_use8(i8* %arg1) + call void @llvm.assume(i1 true) [ "nonnull"(i8* %arg3), "dereferenceable"(i8* %arg1, i64 1), "dereferenceable"(i8* %arg1, i64 2), "dereferenceable"(i32* %p, i64 101), "dereferenceable_or_null"(i8* %arg2, i64 31), "dereferenceable_or_null"(i8* %arg4, i64 42)] + call void @unknown_use8(i8* %arg1) + call void @unknown_use8(i8* %arg2) + call void @unknown_use8(i8* %arg3) + call void @unknown_use8(i8* %arg4) + call void @unknown_use32(i32* %p) + call void @unknown() + ret void +} +declare void @unknown_use8(i8*) willreturn nounwind +declare void @unknown_use32(i32*) willreturn nounwind +declare void @llvm.assume(i1) + !0 = !{i64 10, i64 100} diff --git a/llvm/test/Transforms/Attributor/nofree.ll b/llvm/test/Transforms/Attributor/nofree.ll index 07f0a4eb0ef27..a5af24350164b 100644 --- a/llvm/test/Transforms/Attributor/nofree.ll +++ b/llvm/test/Transforms/Attributor/nofree.ll @@ -247,6 +247,64 @@ define void @test14(i8* nocapture %0, i8* nocapture %1) { ret void } +; UTC_ARGS: --enable + +define void @nonnull_assume_pos(i8* %arg1, i8* %arg2, i8* %arg3, i8* %arg4) { +; ATTRIBUTOR-LABEL: define {{[^@]+}}@nonnull_assume_pos +; ATTRIBUTOR-SAME: (i8* nofree [[ARG1:%.*]], i8* [[ARG2:%.*]], i8* nofree [[ARG3:%.*]], i8* [[ARG4:%.*]]) +; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 true) #11 [ "nofree"(i8* [[ARG1]]), "nofree"(i8* [[ARG3]]) ] +; ATTRIBUTOR-NEXT: call void @unknown(i8* nofree [[ARG1]], i8* [[ARG2]], i8* nofree [[ARG3]], i8* [[ARG4]]) +; ATTRIBUTOR-NEXT: ret void +; + call void @llvm.assume(i1 true) ["nofree"(i8* %arg1), "nofree"(i8* %arg3)] + call void @unknown(i8* %arg1, i8* %arg2, i8* %arg3, i8* %arg4) + ret void +} +define void @nonnull_assume_neg(i8* %arg1, i8* %arg2, i8* %arg3, i8* %arg4) { +; ATTRIBUTOR-LABEL: define {{[^@]+}}@nonnull_assume_neg +; ATTRIBUTOR-SAME: (i8* [[ARG1:%.*]], i8* [[ARG2:%.*]], i8* [[ARG3:%.*]], i8* [[ARG4:%.*]]) +; ATTRIBUTOR-NEXT: call void @unknown(i8* [[ARG1]], i8* [[ARG2]], i8* [[ARG3]], i8* [[ARG4]]) +; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 true) [ "nofree"(i8* [[ARG1]]), "nofree"(i8* [[ARG3]]) ] +; ATTRIBUTOR-NEXT: ret void +; + call void @unknown(i8* %arg1, i8* %arg2, i8* %arg3, i8* %arg4) + call void @llvm.assume(i1 true) ["nofree"(i8* %arg1), "nofree"(i8* %arg3)] + ret void +} +define 
void @nonnull_assume_call(i8* %arg1, i8* %arg2, i8* %arg3, i8* %arg4) { +; ATTRIBUTOR-LABEL: define {{[^@]+}}@nonnull_assume_call +; ATTRIBUTOR-SAME: (i8* [[ARG1:%.*]], i8* [[ARG2:%.*]], i8* [[ARG3:%.*]], i8* [[ARG4:%.*]]) +; ATTRIBUTOR-NEXT: call void @unknown(i8* [[ARG1]], i8* [[ARG2]], i8* [[ARG3]], i8* [[ARG4]]) +; ATTRIBUTOR-NEXT: call void @use_i8_ptr(i8* noalias readnone [[ARG1]]) +; ATTRIBUTOR-NEXT: call void @use_i8_ptr(i8* noalias readnone [[ARG2]]) +; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 true) [ "nofree"(i8* [[ARG1]]), "nofree"(i8* [[ARG3]]) ] +; ATTRIBUTOR-NEXT: call void @use_i8_ptr(i8* noalias nofree readnone [[ARG3]]) +; ATTRIBUTOR-NEXT: call void @use_i8_ptr(i8* noalias readnone [[ARG4]]) +; ATTRIBUTOR-NEXT: call void @use_i8_ptr_ret(i8* noalias nofree readnone [[ARG1]]) +; ATTRIBUTOR-NEXT: call void @use_i8_ptr_ret(i8* noalias readnone [[ARG2]]) +; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 true) [ "nofree"(i8* [[ARG1]]), "nofree"(i8* [[ARG4]]) ] +; ATTRIBUTOR-NEXT: call void @use_i8_ptr_ret(i8* noalias nofree readnone [[ARG3]]) +; ATTRIBUTOR-NEXT: call void @use_i8_ptr_ret(i8* noalias nofree readnone [[ARG4]]) +; ATTRIBUTOR-NEXT: ret void +; + call void @unknown(i8* %arg1, i8* %arg2, i8* %arg3, i8* %arg4) + call void @use_i8_ptr(i8* %arg1) + call void @use_i8_ptr(i8* %arg2) + call void @llvm.assume(i1 true) ["nofree"(i8* %arg1), "nofree"(i8* %arg3)] + call void @use_i8_ptr(i8* %arg3) + call void @use_i8_ptr(i8* %arg4) + call void @use_i8_ptr_ret(i8* %arg1) + call void @use_i8_ptr_ret(i8* %arg2) + call void @llvm.assume(i1 true) ["nofree"(i8* %arg1), "nofree"(i8* %arg4)] + call void @use_i8_ptr_ret(i8* %arg3) + call void @use_i8_ptr_ret(i8* %arg4) + ret void +} +declare void @llvm.assume(i1) +declare void @unknown(i8*, i8*, i8*, i8*) +declare void @use_i8_ptr(i8* nocapture readnone) nounwind +declare void @use_i8_ptr_ret(i8* nocapture readnone) nounwind willreturn + declare noalias i8* @malloc(i64) attributes #0 = { nounwind uwtable noinline } diff --git a/llvm/test/Transforms/Attributor/nonnull.ll b/llvm/test/Transforms/Attributor/nonnull.ll index 2aae7aa277063..06ec6c2c585f0 100644 --- a/llvm/test/Transforms/Attributor/nonnull.ll +++ b/llvm/test/Transforms/Attributor/nonnull.ll @@ -167,7 +167,6 @@ define void @test13_helper() { tail call void @test13(i8* %nonnullptr, i8* %maybenullptr, i8* %nonnullptr) ret void } -declare void @use_i8_ptr(i8* nofree nocapture readnone) nounwind define internal void @test13(i8* %a, i8* %b, i8* %c) { ; ATTRIBUTOR: define internal void @test13(i8* noalias nocapture nofree nonnull readnone %a, i8* noalias nocapture nofree readnone %b, i8* noalias nocapture nofree readnone %c) call void @use_i8_ptr(i8* %a) @@ -839,5 +838,44 @@ define i8* @mybasename(i8* nofree readonly %str) { ret i8* %cond } +define void @nonnull_assume_pos(i8* %arg) { +; ATTRIBUTOR-LABEL: define {{[^@]+}}@nonnull_assume_pos +; ATTRIBUTOR-SAME: (i8* nocapture nofree nonnull readnone [[ARG:%.*]]) +; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 true) #11 [ "nonnull"(i8* [[ARG]]) ] +; ATTRIBUTOR-NEXT: call void @use_i8_ptr(i8* noalias nocapture nofree nonnull readnone [[ARG]]) +; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = call i8* @unknown() +; ATTRIBUTOR-NEXT: ret void +; + call void @llvm.assume(i1 true) ["nonnull"(i8* %arg)] + call void @use_i8_ptr(i8* %arg) + call i8* @unknown() + ret void +} +define void @nonnull_assume_neg(i8* %arg) { +; ATTRIBUTOR-LABEL: define {{[^@]+}}@nonnull_assume_neg +; ATTRIBUTOR-SAME: (i8* nocapture nofree readnone [[ARG:%.*]]) +; ATTRIBUTOR-NEXT: 
[[TMP1:%.*]] = call i8* @unknown() +; ATTRIBUTOR-NEXT: call void @use_i8_ptr(i8* noalias nocapture nofree readnone [[ARG]]) +; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(i8* [[ARG]]) ] +; ATTRIBUTOR-NEXT: call void @use_i8_ptr(i8* noalias nocapture nofree nonnull readnone [[ARG]]) +; ATTRIBUTOR-NEXT: [[TMP2:%.*]] = call i8* @unknown() +; ATTRIBUTOR-NEXT: call void @use_i8_ptr_ret(i8* noalias nocapture nofree nonnull readnone [[ARG]]) +; ATTRIBUTOR-NEXT: call void @llvm.assume(i1 true) [ "nonnull"(i8* [[ARG]]) ] +; ATTRIBUTOR-NEXT: call void @use_i8_ptr_ret(i8* noalias nocapture nofree nonnull readnone [[ARG]]) +; ATTRIBUTOR-NEXT: ret void +; + call i8* @unknown() + call void @use_i8_ptr(i8* %arg) + call void @llvm.assume(i1 true) ["nonnull"(i8* %arg)] + call void @use_i8_ptr(i8* %arg) + call i8* @unknown() + call void @use_i8_ptr_ret(i8* %arg) + call void @llvm.assume(i1 true) ["nonnull"(i8* %arg)] + call void @use_i8_ptr_ret(i8* %arg) + ret void +} +declare void @use_i8_ptr(i8* nofree nocapture readnone) nounwind +declare void @use_i8_ptr_ret(i8* nofree nocapture readnone) nounwind willreturn + attributes #0 = { "null-pointer-is-valid"="true" } attributes #1 = { nounwind willreturn} diff --git a/llvm/test/Transforms/Attributor/range.ll b/llvm/test/Transforms/Attributor/range.ll index 25cf27e7886a8..1e93e97634b24 100644 --- a/llvm/test/Transforms/Attributor/range.ll +++ b/llvm/test/Transforms/Attributor/range.ll @@ -1264,6 +1264,108 @@ define i8 @undef_collapse_caller() { ret i8 %a } +define i32 @ret1or2(i1 %c) { +; CHECK-LABEL: define {{[^@]+}}@ret1or2 +; CHECK-SAME: (i1 [[C:%.*]]) +; CHECK-NEXT: [[S:%.*]] = select i1 [[C]], i32 1, i32 2 +; CHECK-NEXT: ret i32 [[S]] +; + %s = select i1 %c, i32 1, i32 2 + ret i32 %s +} +define i1 @callee_range_1(i1 %c1, i1 %c2, i1 %c3) { +; OLD_PM-LABEL: define {{[^@]+}}@callee_range_1 +; OLD_PM-SAME: (i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]]) +; OLD_PM-NEXT: [[F:%.*]] = and i1 true, true +; OLD_PM-NEXT: ret i1 [[F]] +; +; NEW_PM-LABEL: define {{[^@]+}}@callee_range_1 +; NEW_PM-SAME: (i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]]) +; NEW_PM-NEXT: [[F:%.*]] = and i1 true, true +; NEW_PM-NEXT: ret i1 [[F]] +; +; CGSCC_OLD_PM-LABEL: define {{[^@]+}}@callee_range_1 +; CGSCC_OLD_PM-SAME: (i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]]) +; CGSCC_OLD_PM-NEXT: [[R1:%.*]] = call i32 @ret1or2(i1 [[C1]]) +; CGSCC_OLD_PM-NEXT: [[R2:%.*]] = call i32 @ret1or2(i1 [[C2]]) +; CGSCC_OLD_PM-NEXT: [[INDIRECTION:%.*]] = select i1 [[C3]], i32 [[R1]], i32 [[R2]] +; CGSCC_OLD_PM-NEXT: [[A:%.*]] = add i32 [[R1]], [[INDIRECTION]] +; CGSCC_OLD_PM-NEXT: [[I1:%.*]] = icmp sle i32 [[A]], 4 +; CGSCC_OLD_PM-NEXT: [[I2:%.*]] = icmp sge i32 [[A]], 2 +; CGSCC_OLD_PM-NEXT: [[F:%.*]] = and i1 [[I1]], [[I2]] +; CGSCC_OLD_PM-NEXT: ret i1 [[F]] +; +; CGSCC_NEW_PM-LABEL: define {{[^@]+}}@callee_range_1 +; CGSCC_NEW_PM-SAME: (i1 [[C1:%.*]], i1 [[C2:%.*]], i1 [[C3:%.*]]) +; CGSCC_NEW_PM-NEXT: [[R1:%.*]] = call i32 @ret1or2(i1 [[C1]]) +; CGSCC_NEW_PM-NEXT: [[R2:%.*]] = call i32 @ret1or2(i1 [[C2]]) +; CGSCC_NEW_PM-NEXT: [[INDIRECTION:%.*]] = select i1 [[C3]], i32 [[R1]], i32 [[R2]] +; CGSCC_NEW_PM-NEXT: [[A:%.*]] = add i32 [[R1]], [[INDIRECTION]] +; CGSCC_NEW_PM-NEXT: [[I1:%.*]] = icmp sle i32 [[A]], 4 +; CGSCC_NEW_PM-NEXT: [[I2:%.*]] = icmp sge i32 [[A]], 2 +; CGSCC_NEW_PM-NEXT: [[F:%.*]] = and i1 [[I1]], [[I2]] +; CGSCC_NEW_PM-NEXT: ret i1 [[F]] +; + %r1 = call i32 @ret1or2(i1 %c1) + %r2 = call i32 @ret1or2(i1 %c2) + %indirection = select i1 %c3, i32 %r1, i32 %r2 + %a = add i32 %r1, 
%indirection + %i1 = icmp sle i32 %a, 4 + %i2 = icmp sge i32 %a, 2 + %f = and i1 %i1, %i2 + ret i1 %f +} + +define i1 @callee_range_2(i1 %c1, i1 %c2) { +; OLD_PM-LABEL: define {{[^@]+}}@callee_range_2 +; OLD_PM-SAME: (i1 [[C1:%.*]], i1 [[C2:%.*]]) +; OLD_PM-NEXT: [[R1:%.*]] = call i32 @ret1or2(i1 [[C1]]) #2, !range !4 +; OLD_PM-NEXT: [[R2:%.*]] = call i32 @ret1or2(i1 [[C2]]) #3, !range !4 +; OLD_PM-NEXT: [[A:%.*]] = add i32 [[R1]], [[R2]] +; OLD_PM-NEXT: [[I1:%.*]] = icmp sle i32 [[A]], 3 +; OLD_PM-NEXT: [[I2:%.*]] = icmp sge i32 [[A]], 2 +; OLD_PM-NEXT: [[F:%.*]] = and i1 [[I1]], [[I2]] +; OLD_PM-NEXT: ret i1 [[F]] +; +; NEW_PM-LABEL: define {{[^@]+}}@callee_range_2 +; NEW_PM-SAME: (i1 [[C1:%.*]], i1 [[C2:%.*]]) +; NEW_PM-NEXT: [[R1:%.*]] = call i32 @ret1or2(i1 [[C1]]) #2, !range !5 +; NEW_PM-NEXT: [[R2:%.*]] = call i32 @ret1or2(i1 [[C2]]) #3, !range !5 +; NEW_PM-NEXT: [[A:%.*]] = add i32 [[R1]], [[R2]] +; NEW_PM-NEXT: [[I1:%.*]] = icmp sle i32 [[A]], 3 +; NEW_PM-NEXT: [[I2:%.*]] = icmp sge i32 [[A]], 2 +; NEW_PM-NEXT: [[F:%.*]] = and i1 [[I1]], [[I2]] +; NEW_PM-NEXT: ret i1 [[F]] +; +; CGSCC_OLD_PM-LABEL: define {{[^@]+}}@callee_range_2 +; CGSCC_OLD_PM-SAME: (i1 [[C1:%.*]], i1 [[C2:%.*]]) +; CGSCC_OLD_PM-NEXT: [[R1:%.*]] = call i32 @ret1or2(i1 [[C1]]) +; CGSCC_OLD_PM-NEXT: [[R2:%.*]] = call i32 @ret1or2(i1 [[C2]]) +; CGSCC_OLD_PM-NEXT: [[A:%.*]] = add i32 [[R1]], [[R2]] +; CGSCC_OLD_PM-NEXT: [[I1:%.*]] = icmp sle i32 [[A]], 3 +; CGSCC_OLD_PM-NEXT: [[I2:%.*]] = icmp sge i32 [[A]], 2 +; CGSCC_OLD_PM-NEXT: [[F:%.*]] = and i1 [[I1]], [[I2]] +; CGSCC_OLD_PM-NEXT: ret i1 [[F]] +; +; CGSCC_NEW_PM-LABEL: define {{[^@]+}}@callee_range_2 +; CGSCC_NEW_PM-SAME: (i1 [[C1:%.*]], i1 [[C2:%.*]]) +; CGSCC_NEW_PM-NEXT: [[R1:%.*]] = call i32 @ret1or2(i1 [[C1]]) +; CGSCC_NEW_PM-NEXT: [[R2:%.*]] = call i32 @ret1or2(i1 [[C2]]) +; CGSCC_NEW_PM-NEXT: [[A:%.*]] = add i32 [[R1]], [[R2]] +; CGSCC_NEW_PM-NEXT: [[I1:%.*]] = icmp sle i32 [[A]], 3 +; CGSCC_NEW_PM-NEXT: [[I2:%.*]] = icmp sge i32 [[A]], 2 +; CGSCC_NEW_PM-NEXT: [[F:%.*]] = and i1 [[I1]], [[I2]] +; CGSCC_NEW_PM-NEXT: ret i1 [[F]] +; + %r1 = call i32 @ret1or2(i1 %c1) + %r2 = call i32 @ret1or2(i1 %c2) + %a = add i32 %r1, %r2 + %i1 = icmp sle i32 %a, 3 + %i2 = icmp sge i32 %a, 2 + %f = and i1 %i1, %i2 + ret i1 %f +} + !0 = !{i32 0, i32 10} !1 = !{i32 10, i32 100} diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/freeze-brcond.ll b/llvm/test/Transforms/CodeGenPrepare/X86/freeze-brcond.ll new file mode 100644 index 0000000000000..865eb896f64a8 --- /dev/null +++ b/llvm/test/Transforms/CodeGenPrepare/X86/freeze-brcond.ll @@ -0,0 +1,323 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -codegenprepare < %s | FileCheck %s + +target triple = "x86_64-unknown-linux-gnu" + +define void @f1(i32 %a) { +; CHECK-LABEL: @f1( +; CHECK-NEXT: [[FR:%.*]] = freeze i32 [[A:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[FR]], 0 +; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: call void @g1() +; CHECK-NEXT: ret void +; CHECK: B: +; CHECK-NEXT: call void @g2() +; CHECK-NEXT: ret void +; + %c = icmp eq i32 %a, 0 + %fr = freeze i1 %c + br i1 %fr, label %A, label %B +A: + call void @g1() + ret void +B: + call void @g2() + ret void +} + +define void @f2(i32 %a) { +; CHECK-LABEL: @f2( +; CHECK-NEXT: [[FR:%.*]] = freeze i32 [[A:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 0, [[FR]] +; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: call void @g1() +; CHECK-NEXT: ret 
void +; CHECK: B: +; CHECK-NEXT: call void @g2() +; CHECK-NEXT: ret void +; + %c = icmp eq i32 0, %a + %fr = freeze i1 %c + br i1 %fr, label %A, label %B +A: + call void @g1() + ret void +B: + call void @g2() + ret void +} + +define void @f3(i32 %a) { +; CHECK-LABEL: @f3( +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 0, 1 +; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: call void @g1() +; CHECK-NEXT: ret void +; CHECK: B: +; CHECK-NEXT: call void @g2() +; CHECK-NEXT: ret void +; + %c = icmp eq i32 0, 1 + %fr = freeze i1 %c + br i1 %fr, label %A, label %B +A: + call void @g1() + ret void +B: + call void @g2() + ret void +} + +define i1 @ptrcmp(i8* %p) { +; CHECK-LABEL: @ptrcmp( +; CHECK-NEXT: [[FR:%.*]] = freeze i8* [[P:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i8* [[FR]], null +; CHECK-NEXT: ret i1 [[C]] +; + %c = icmp eq i8* %p, null + %fr = freeze i1 %c + ret i1 %fr +} + + +define i1 @fcmp(float %a) { +; CHECK-LABEL: @fcmp( +; CHECK-NEXT: [[FR:%.*]] = freeze float [[A:%.*]] +; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[FR]], 0.000000e+00 +; CHECK-NEXT: ret i1 [[C]] +; + %c = fcmp oeq float %a, 0.0 + %fr = freeze i1 %c + ret i1 %fr +} + +define i1 @fcmp_nan(float %a) { +; CHECK-LABEL: @fcmp_nan( +; CHECK-NEXT: [[C:%.*]] = fcmp nnan oeq float [[A:%.*]], 0.000000e+00 +; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[C]] +; CHECK-NEXT: ret i1 [[FR]] +; + %c = fcmp nnan oeq float %a, 0.0 + %fr = freeze i1 %c + ret i1 %fr +} + +define void @and_bitmask(i32 %flag) { +; CHECK-LABEL: @and_bitmask( +; CHECK-NEXT: [[V:%.*]] = and i32 [[FLAG:%.*]], 1 +; CHECK-NEXT: [[FR:%.*]] = freeze i32 [[V]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[FR]], 0 +; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: call void @g1() +; CHECK-NEXT: ret void +; CHECK: B: +; CHECK-NEXT: call void @g2() +; CHECK-NEXT: ret void +; + %v = and i32 %flag, 1 + %c = icmp eq i32 %v, 0 + %fr = freeze i1 %c + br i1 %fr, label %A, label %B +A: + call void @g1() + ret void +B: + call void @g2() + ret void +} + +define void @and_bitmask_r(i32 %flag) { +; CHECK-LABEL: @and_bitmask_r( +; CHECK-NEXT: [[V:%.*]] = and i32 1, [[FLAG:%.*]] +; CHECK-NEXT: [[FR:%.*]] = freeze i32 [[V]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 0, [[FR]] +; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: call void @g1() +; CHECK-NEXT: ret void +; CHECK: B: +; CHECK-NEXT: call void @g2() +; CHECK-NEXT: ret void +; + %v = and i32 1, %flag + %c = icmp eq i32 0, %v + %fr = freeze i1 %c + br i1 %fr, label %A, label %B +A: + call void @g1() + ret void +B: + call void @g2() + ret void +} + +define void @and_bitmask2(i32 %flag, i32 %flag2) { +; CHECK-LABEL: @and_bitmask2( +; CHECK-NEXT: [[V:%.*]] = and i32 [[FLAG:%.*]], 1 +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[V]], 0 +; CHECK-NEXT: [[V2:%.*]] = and i32 [[FLAG2:%.*]], 2 +; CHECK-NEXT: [[C2:%.*]] = icmp eq i32 [[V2]], 0 +; CHECK-NEXT: [[COND:%.*]] = or i1 [[C]], [[C2]] +; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[COND]] +; CHECK-NEXT: br i1 [[FR]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: call void @g1() +; CHECK-NEXT: ret void +; CHECK: B: +; CHECK-NEXT: call void @g2() +; CHECK-NEXT: ret void +; + %v = and i32 %flag, 1 + %c = icmp eq i32 %v, 0 + %v2 = and i32 %flag2, 2 + %c2 = icmp eq i32 %v2, 0 + %cond = or i1 %c, %c2 + %fr = freeze i1 %cond + br i1 %fr, label %A, label %B +A: + call void @g1() + ret void +B: + call void @g2() + ret void +} + +define void @and(i1 %a, i1 %b, i1 %c) { +; CHECK-LABEL: @and( +; 
CHECK-NEXT: [[COND:%.*]] = and i1 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[COND2:%.*]] = and i1 [[C:%.*]], [[COND]] +; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[COND2]] +; CHECK-NEXT: br i1 [[FR]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: call void @g1() +; CHECK-NEXT: ret void +; CHECK: B: +; CHECK-NEXT: call void @g2() +; CHECK-NEXT: ret void +; + %cond = and i1 %a, %b + %cond2 = and i1 %c, %cond + %fr = freeze i1 %cond2 + br i1 %fr, label %A, label %B +A: + call void @g1() + ret void +B: + call void @g2() + ret void +} + +define void @and_long(i1 %a, i1 %b, i1 %c, i1 %d, i1 %e, i1 %f, i1 %g) { +; CHECK-LABEL: @and_long( +; CHECK-NEXT: [[COND:%.*]] = and i1 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[COND2:%.*]] = and i1 [[C:%.*]], [[COND]] +; CHECK-NEXT: [[COND3:%.*]] = and i1 [[D:%.*]], [[COND2]] +; CHECK-NEXT: [[COND4:%.*]] = and i1 [[E:%.*]], [[COND3]] +; CHECK-NEXT: [[COND5:%.*]] = and i1 [[F:%.*]], [[COND4]] +; CHECK-NEXT: [[COND6:%.*]] = and i1 [[G:%.*]], [[COND5]] +; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[COND6]] +; CHECK-NEXT: br i1 [[FR]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: call void @g1() +; CHECK-NEXT: ret void +; CHECK: B: +; CHECK-NEXT: call void @g2() +; CHECK-NEXT: ret void +; + %cond = and i1 %a, %b + %cond2 = and i1 %c, %cond + %cond3 = and i1 %d, %cond2 + %cond4 = and i1 %e, %cond3 + %cond5 = and i1 %f, %cond4 + %cond6 = and i1 %g, %cond5 + %fr = freeze i1 %cond6 + br i1 %fr, label %A, label %B +A: + call void @g1() + ret void +B: + call void @g2() + ret void +} + +define void @and_cmp(i32 %v, float %w, i32 %v2) { +; CHECK-LABEL: @and_cmp( +; CHECK-NEXT: [[C1:%.*]] = icmp eq i32 [[V:%.*]], 0 +; CHECK-NEXT: [[C2:%.*]] = fcmp oeq float [[W:%.*]], 0.000000e+00 +; CHECK-NEXT: [[COND:%.*]] = and i1 [[C1]], [[C2]] +; CHECK-NEXT: [[C3:%.*]] = icmp eq i32 [[V2:%.*]], 1 +; CHECK-NEXT: [[COND2:%.*]] = and i1 [[COND]], [[C3]] +; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[COND2]] +; CHECK-NEXT: br i1 [[FR]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: call void @g1() +; CHECK-NEXT: ret void +; CHECK: B: +; CHECK-NEXT: call void @g2() +; CHECK-NEXT: ret void +; + %c1 = icmp eq i32 %v, 0 + %c2 = fcmp oeq float %w, 0.0 + %cond = and i1 %c1, %c2 + %c3 = icmp eq i32 %v2, 1 + %cond2 = and i1 %cond, %c3 + %fr = freeze i1 %cond2 + br i1 %fr, label %A, label %B +A: + call void @g1() + ret void +B: + call void @g2() + ret void +} + +define void @or(i1 %a, i1 %b, i1 %c) { +; CHECK-LABEL: @or( +; CHECK-NEXT: [[COND:%.*]] = or i1 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[COND2:%.*]] = or i1 [[C:%.*]], [[COND]] +; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[COND2]] +; CHECK-NEXT: br i1 [[FR]], label [[A:%.*]], label [[B:%.*]] +; CHECK: A: +; CHECK-NEXT: call void @g1() +; CHECK-NEXT: ret void +; CHECK: B: +; CHECK-NEXT: call void @g2() +; CHECK-NEXT: ret void +; + %cond = or i1 %a, %b + %cond2 = or i1 %c, %cond + %fr = freeze i1 %cond2 + br i1 %fr, label %A, label %B +A: + call void @g1() + ret void +B: + call void @g2() + ret void +} + +define void @and_loop(i1 %a, i1 %b) { +; CHECK-LABEL: @and_loop( +; CHECK-NEXT: ret void +; CHECK: UNREACHABLE: +; CHECK-NEXT: [[C:%.*]] = and i1 [[A:%.*]], [[C]] +; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[C]] +; CHECK-NEXT: br i1 [[FR]], label [[UNREACHABLE:%.*]], label [[EXIT:%.*]] +; CHECK: EXIT: +; CHECK-NEXT: ret void +; + ret void +UNREACHABLE: + %c = and i1 %a, %c + %fr = freeze i1 %c + br i1 %fr, label %UNREACHABLE, label %EXIT +EXIT: + ret void +} + +declare void @g1() +declare void @g2() diff --git 
a/llvm/test/Transforms/CodeGenPrepare/X86/freeze-cmp.ll b/llvm/test/Transforms/CodeGenPrepare/X86/freeze-cmp.ll deleted file mode 100644 index d0d40bc91a933..0000000000000 --- a/llvm/test/Transforms/CodeGenPrepare/X86/freeze-cmp.ll +++ /dev/null @@ -1,109 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -codegenprepare < %s | FileCheck %s - -target triple = "x86_64-unknown-linux-gnu" - -define void @f1(i32 %a) { -; CHECK-LABEL: @f1( -; CHECK-NEXT: [[FR:%.*]] = freeze i32 [[A:%.*]] -; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[FR]], 0 -; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] -; CHECK: A: -; CHECK-NEXT: call void @g1() -; CHECK-NEXT: ret void -; CHECK: B: -; CHECK-NEXT: call void @g2() -; CHECK-NEXT: ret void -; - %c = icmp eq i32 %a, 0 - %fr = freeze i1 %c - br i1 %fr, label %A, label %B -A: - call void @g1() - ret void -B: - call void @g2() - ret void -} - -define void @f2(i32 %a) { -; CHECK-LABEL: @f2( -; CHECK-NEXT: [[FR:%.*]] = freeze i32 [[A:%.*]] -; CHECK-NEXT: [[C:%.*]] = icmp eq i32 0, [[FR]] -; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] -; CHECK: A: -; CHECK-NEXT: call void @g1() -; CHECK-NEXT: ret void -; CHECK: B: -; CHECK-NEXT: call void @g2() -; CHECK-NEXT: ret void -; - %c = icmp eq i32 0, %a - %fr = freeze i1 %c - br i1 %fr, label %A, label %B -A: - call void @g1() - ret void -B: - call void @g2() - ret void -} - -define void @f3(i32 %a) { -; CHECK-LABEL: @f3( -; CHECK-NEXT: [[C:%.*]] = icmp eq i32 0, 1 -; CHECK-NEXT: br i1 [[C]], label [[A:%.*]], label [[B:%.*]] -; CHECK: A: -; CHECK-NEXT: call void @g1() -; CHECK-NEXT: ret void -; CHECK: B: -; CHECK-NEXT: call void @g2() -; CHECK-NEXT: ret void -; - %c = icmp eq i32 0, 1 - %fr = freeze i1 %c - br i1 %fr, label %A, label %B -A: - call void @g1() - ret void -B: - call void @g2() - ret void -} - -define i1 @ptrcmp(i8* %p) { -; CHECK-LABEL: @ptrcmp( -; CHECK-NEXT: [[FR:%.*]] = freeze i8* [[P:%.*]] -; CHECK-NEXT: [[C:%.*]] = icmp eq i8* [[FR]], null -; CHECK-NEXT: ret i1 [[C]] -; - %c = icmp eq i8* %p, null - %fr = freeze i1 %c - ret i1 %fr -} - - -define i1 @fcmp(float %a) { -; CHECK-LABEL: @fcmp( -; CHECK-NEXT: [[FR:%.*]] = freeze float [[A:%.*]] -; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[FR]], 0.000000e+00 -; CHECK-NEXT: ret i1 [[C]] -; - %c = fcmp oeq float %a, 0.0 - %fr = freeze i1 %c - ret i1 %fr -} - -define i1 @fcmp_nan(float %a) { -; CHECK-LABEL: @fcmp_nan( -; CHECK-NEXT: [[C:%.*]] = fcmp nnan oeq float [[A:%.*]], 0.000000e+00 -; CHECK-NEXT: [[FR:%.*]] = freeze i1 [[C]] -; CHECK-NEXT: ret i1 [[FR]] -; - %c = fcmp nnan oeq float %a, 0.0 - %fr = freeze i1 %c - ret i1 %fr -} - -declare void @g1() -declare void @g2() diff --git a/llvm/test/Transforms/Coroutines/coro-split-02.ll b/llvm/test/Transforms/Coroutines/coro-split-02.ll index 0309c0db20100..993374291f415 100644 --- a/llvm/test/Transforms/Coroutines/coro-split-02.ll +++ b/llvm/test/Transforms/Coroutines/coro-split-02.ll @@ -14,6 +14,7 @@ declare void @print(i32) define void @a() "coroutine.presplit"="1" { entry: %ref.tmp7 = alloca %"struct.lean_future::Awaiter", align 8 + %testval = alloca i32 %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) %alloc = call i8* @malloc(i64 16) #3 %vFrame = call noalias nonnull i8* @llvm.coro.begin(token %id, i8* %alloc) @@ -28,6 +29,9 @@ entry: await.ready: %StrayCoroSave = call token @llvm.coro.save(i8* null) %val = load i32, i32* %Result.i19 + %cast = bitcast i32* %testval to i8* + call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast) + 
call void @llvm.lifetime.end.p0i8(i64 4, i8* %cast) call void @print(i32 %val) br label %exit exit: @@ -36,10 +40,14 @@ exit: } ; CHECK-LABEL: @a.resume( +; CHECK: %testval = alloca i32 ; CHECK: getelementptr inbounds %a.Frame ; CHECK-NEXT: getelementptr inbounds %"struct.lean_future::Awaiter" ; CHECK-NOT: call token @llvm.coro.save(i8* null) ; CHECK-NEXT: %val = load i32, i32* %Result +; CHECK-NEXT: %cast = bitcast i32* %testval to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 4, i8* %cast) +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 4, i8* %cast) ; CHECK-NEXT: call void @print(i32 %val) ; CHECK-NEXT: ret void @@ -55,4 +63,6 @@ declare i8 @llvm.coro.suspend(token, i1) #3 declare void @"\01??3@YAXPEAX@Z"(i8*) local_unnamed_addr #10 declare i8* @llvm.coro.free(token, i8* nocapture readonly) #2 declare i1 @llvm.coro.end(i8*, i1) #3 +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #4 +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #4 diff --git a/llvm/test/Transforms/Coroutines/no-suspend.ll b/llvm/test/Transforms/Coroutines/no-suspend.ll index be48c2ab09fe8..211e16c6ccdbc 100644 --- a/llvm/test/Transforms/Coroutines/no-suspend.ll +++ b/llvm/test/Transforms/Coroutines/no-suspend.ll @@ -362,6 +362,58 @@ suspend: ret void } +; SimplifySuspendPoint should not simplify final suspend point +; +; CHECK-LABEL: define void @cannot_simplify_final_suspend( +; CHECK-NEXT: entry: +; CHECK-NEXT: llvm.coro.id +; +define void @cannot_simplify_final_suspend() "coroutine.presplit"="1" personality i32 0 { +entry: + %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null) + %need.dyn.alloc = call i1 @llvm.coro.alloc(token %id) + br i1 %need.dyn.alloc, label %dyn.alloc, label %coro.begin +dyn.alloc: + %size = call i32 @llvm.coro.size.i32() + %alloc = call i8* @malloc(i32 %size) + br label %coro.begin +coro.begin: + %phi = phi i8* [ null, %entry ], [ %alloc, %dyn.alloc ] + %hdl = call noalias i8* @llvm.coro.begin(token %id, i8* %phi) + br label %body +body: + %save = call token @llvm.coro.save(i8* %hdl) + %subfn = call i8* @llvm.coro.subfn.addr(i8* %hdl, i8 1) + %bcast = bitcast i8* %subfn to void (i8*)* + invoke fastcc void %bcast(i8* %hdl) to label %real_susp unwind label %lpad + +real_susp: + %0 = call i8 @llvm.coro.suspend(token %save, i1 1) + switch i8 %0, label %suspend [i8 0, label %resume + i8 1, label %pre.cleanup] +resume: + call void @print(i32 0) + br label %cleanup + +pre.cleanup: + call void @print(i32 1) + br label %cleanup + +cleanup: + %mem = call i8* @llvm.coro.free(token %id, i8* %hdl) + call void @free(i8* %mem) + br label %suspend +suspend: + call i1 @llvm.coro.end(i8* %hdl, i1 false) + ret void +lpad: + %lpval = landingpad { i8*, i32 } + cleanup + + call void @print(i32 2) + resume { i8*, i32 } %lpval +} + declare i8* @malloc(i32) declare void @free(i8*) declare void @print(i32) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-unknown-sizes.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-unknown-sizes.ll new file mode 100644 index 0000000000000..741508b9cec7c --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memset-unknown-sizes.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -dse -enable-dse-memoryssa -S %s | FileCheck %s + +declare i8* @_Znwm() local_unnamed_addr #0 + +; Function Attrs: argmemonly nounwind willreturn writeonly +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1 + +define void @test1(i1 
%c) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[COND_TRUE_I_I_I:%.*]], label [[COND_END_I_I_I:%.*]] +; CHECK: cond.true.i.i.i: +; CHECK-NEXT: ret void +; CHECK: cond.end.i.i.i: +; CHECK-NEXT: [[CALL_I_I_I_I_I:%.*]] = tail call noalias nonnull i8* @_Znam() #2 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[CALL_I_I_I_I_I]] to i64* +; CHECK-NEXT: tail call void @llvm.memset.p0i8.i64(i8* nonnull align 8 [[CALL_I_I_I_I_I]], i8 0, i64 undef, i1 false) +; CHECK-NEXT: store i64 0, i64* [[TMP0]], align 8 +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %cond.true.i.i.i, label %cond.end.i.i.i + +cond.true.i.i.i: ; preds = %entry + ret void + +cond.end.i.i.i: ; preds = %entry + %call.i.i.i.i.i = tail call noalias nonnull i8* @_Znam() #2 + %0 = bitcast i8* %call.i.i.i.i.i to i64* + tail call void @llvm.memset.p0i8.i64(i8* nonnull align 8 %call.i.i.i.i.i, i8 0, i64 undef, i1 false) #3 + store i64 0, i64* %0, align 8 + ret void +} + +declare i8* @_Znam() local_unnamed_addr #0 + + +define void @test2(i1 %c) { +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[CLEANUP_CONT104:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[MUL_I_I_I_I:%.*]] = shl nuw nsw i64 undef, 3 +; CHECK-NEXT: [[CALL_I_I_I_I_I_I131:%.*]] = call noalias nonnull i8* @_Znwm() #2 +; CHECK-NEXT: [[DOTCAST_I_I:%.*]] = bitcast i8* [[CALL_I_I_I_I_I_I131]] to i64* +; CHECK-NEXT: store i64 0, i64* [[DOTCAST_I_I]], align 8 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 [[CALL_I_I_I_I_I_I131]], i8 0, i64 [[MUL_I_I_I_I]], i1 false) +; CHECK-NEXT: ret void +; CHECK: cleanup.cont104: +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %cleanup.cont104, label %if.then + +if.then: ; preds = %entry + %mul.i.i.i.i = shl nuw nsw i64 undef, 3 + %call.i.i.i.i.i.i131 = call noalias nonnull i8* @_Znwm() #2 + %.cast.i.i = bitcast i8* %call.i.i.i.i.i.i131 to i64* + store i64 0, i64* %.cast.i.i, align 8 + call void @llvm.memset.p0i8.i64(i8* nonnull align 8 %call.i.i.i.i.i.i131, i8 0, i64 %mul.i.i.i.i, i1 false) #3 + ret void + +cleanup.cont104: ; preds = %entry + ret void +} + +attributes #0 = { "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind willreturn writeonly } +attributes #2 = { builtin nounwind } +attributes #3 = { nounwind } diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll index a719e273e2272..942a46bad6c62 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-exceptions.ll @@ -18,7 +18,6 @@ define void @test12(i32* %p) personality i32 (...)* @__CxxFrameHandler3 { ; CHECK-NEXT: invoke void @f() ; CHECK-NEXT: to label [[BLOCK3:%.*]] unwind label [[CATCH_DISPATCH:%.*]] ; CHECK: block3: -; CHECK-NEXT: store i32 30, i32* [[SV]] ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: catch.dispatch: ; CHECK-NEXT: [[CS1:%.*]] = catchswitch within none [label %catch] unwind label [[CLEANUP:%.*]] diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll index 984f15e3b0f7c..008344f7cd442 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-loops.ll @@ -27,10 +27,9 @@ end: define void @test14(i32* noalias %P) { ; CHECK-LABEL: @test14( ; CHECK-NEXT: entry: -; 
CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: br i1 false, label [[FOR]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret void @@ -77,7 +76,8 @@ define void @test21(i32* noalias %P) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 ; CHECK-NEXT: [[P3:%.*]] = bitcast i32* [[ARRAYIDX0]] to i8* -; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[P3]], i8 0, i64 28, i1 false) +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[P3]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 4 [[TMP0]], i8 0, i64 24, i1 false) ; CHECK-NEXT: br label [[FOR:%.*]] ; CHECK: for: ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 @@ -281,3 +281,36 @@ end: ret void } +%struct.hoge = type { i32, i32 } + +@global = external local_unnamed_addr global %struct.hoge*, align 8 + +define void @widget(i8* %tmp) { +; CHECK-LABEL: @widget( +; CHECK-NEXT: bb: +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[TMP:%.*]], i8* nonnull align 16 undef, i64 64, i1 false) +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[TMP2:%.*]] = load %struct.hoge*, %struct.hoge** @global, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HOGE:%.*]], %struct.hoge* [[TMP2]], i64 undef, i32 1 +; CHECK-NEXT: store i32 0, i32* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load %struct.hoge*, %struct.hoge** @global, align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HOGE]], %struct.hoge* [[TMP4]], i64 undef, i32 1 +; CHECK-NEXT: store i32 10, i32* [[TMP5]], align 4 +; CHECK-NEXT: br label [[BB1]] +; +bb: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %tmp, i8* nonnull align 16 undef, i64 64, i1 false) + br label %bb1 + +bb1: ; preds = %bb1, %bb + %tmp2 = load %struct.hoge*, %struct.hoge** @global, align 8 + %tmp3 = getelementptr inbounds %struct.hoge, %struct.hoge* %tmp2, i64 undef, i32 1 + store i32 0, i32* %tmp3, align 4 + %tmp4 = load %struct.hoge*, %struct.hoge** @global, align 8 + %tmp5 = getelementptr inbounds %struct.hoge, %struct.hoge* %tmp4, i64 undef, i32 1 + store i32 10, i32* %tmp5, align 4 + br label %bb1 +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll index ce34986ff4e3b..9cd3a7cec013f 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-memoryphis.ll @@ -33,13 +33,11 @@ define void @test5(i32* noalias %P) { ; CHECK-LABEL: @test5( ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: store i32 1, i32* [[P]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: ret void ; br i1 true, label %bb1, label %bb2 @@ -58,13 +56,12 @@ define void @test8(i32* %P, i32* %Q) { ; CHECK-LABEL: @test8( ; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: store i32 1, i32* [[P:%.*]] ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; 
CHECK-NEXT: store i32 1, i32* [[Q:%.*]] ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] ; CHECK-NEXT: ret void ; br i1 true, label %bb1, label %bb2 @@ -115,7 +112,6 @@ define void @widget(i32* %Ptr, i1 %c1, i1 %c2, i32 %v1, i32 %v2, i32 %v3) { ; CHECK: bb1: ; CHECK-NEXT: br i1 [[C2:%.*]], label [[BB2:%.*]], label [[BB3]] ; CHECK: bb2: -; CHECK-NEXT: store i32 -1, i32* [[PTR:%.*]], align 4 ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: br label [[BB4:%.*]] @@ -126,7 +122,7 @@ define void @widget(i32* %Ptr, i1 %c1, i1 %c2, i32 %v1, i32 %v2, i32 %v3) { ; CHECK-NEXT: i32 2, label [[BB7:%.*]] ; CHECK-NEXT: ] ; CHECK: bb5: -; CHECK-NEXT: store i32 0, i32* [[PTR]], align 4 +; CHECK-NEXT: store i32 0, i32* [[PTR:%.*]], align 4 ; CHECK-NEXT: br label [[BB8]] ; CHECK: bb6: ; CHECK-NEXT: store i32 1, i32* [[PTR]], align 4 @@ -173,3 +169,34 @@ bb7: ; preds = %bb4 bb8: ; preds = %bb7, %bb6, %bb5, %bb4 br label %bb4 } + + +declare void @fn1_test11() +declare void @fn2_test11() + +define void @test11(i1 %c, i8** %ptr.1) { +; CHECK-LABEL: @test11( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[IF_THEN:%.*]], label [[EXIT:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @fn2_test11() #0 +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: store i8* null, i8** [[PTR_1:%.*]], align 8 +; CHECK-NEXT: tail call void @fn2_test11() #0 +; CHECK-NEXT: ret void +; +entry: + br i1 %c, label %if.then, label %exit + +if.then: ; preds = %entry + tail call void @fn2_test11() #1 + br label %exit + +exit: + store i8* null, i8** %ptr.1, align 8 + tail call void @fn2_test11() #1 + ret void +} + +attributes #1 = { nounwind } diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-multipath.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-multipath.ll new file mode 100644 index 0000000000000..a24ecd293773c --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-multipath.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -basicaa -dse -enable-dse-memoryssa -S | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + +declare void @use(i32 *) + +define void @test4(i32* noalias %P, i1 %c1) { +; CHECK-LABEL: @test4( +; CHECK-NEXT: store i32 1, i32* [[P:%.*]] +; CHECK-NEXT: br i1 [[C1:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br label [[BB5:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 3, i32* [[P]] +; CHECK-NEXT: br label [[BB5]] +; CHECK: bb5: +; CHECK-NEXT: call void @use(i32* [[P]]) +; CHECK-NEXT: ret void +; + store i32 1, i32* %P + br i1 %c1, label %bb1, label %bb2 + +bb1: + store i32 0, i32* %P + br label %bb5 +bb2: + store i32 3, i32* %P + br label %bb5 + +bb5: + call void @use(i32* %P) + ret void +} + +define void @test5(i32* noalias %P) { +; CHECK-LABEL: @test5( +; CHECK-NEXT: store i32 1, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br label [[BB5:%.*]] +; CHECK: bb2: +; CHECK-NEXT: br i1 undef, label [[BB3:%.*]], label [[BB4:%.*]] +; CHECK: bb3: +; CHECK-NEXT: store i32 3, i32* [[P]] +; CHECK-NEXT: br label [[BB5]] +; CHECK: bb4: +; CHECK-NEXT: store i32 5, i32* [[P]] +; CHECK-NEXT: br label [[BB5]] +; CHECK: bb5: +; CHECK-NEXT: call void @use(i32* [[P]]) +; CHECK-NEXT: ret void +; + 
store i32 1, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 0, i32* %P + br label %bb5 + +bb2: + br i1 undef, label %bb3, label %bb4 + +bb3: + store i32 3, i32* %P + br label %bb5 + +bb4: + store i32 5, i32* %P + br label %bb5 + +bb5: + call void @use(i32* %P) + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll new file mode 100644 index 0000000000000..5c26566e25a5c --- /dev/null +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -dse -enable-dse-memoryssa %s -S | FileCheck %s + + +%struct.ham = type { [3 x double], [3 x double]} + +declare void @may_throw() +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) + +define void @overlap1(%struct.ham* %arg, i1 %cond) { +; CHECK-LABEL: @overlap1( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0 +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]] +; CHECK: bb7: +; CHECK-NEXT: br label [[BB9:%.*]] +; CHECK: bb8: +; CHECK-NEXT: br label [[BB9]] +; CHECK: bb9: +; CHECK-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8 +; CHECK-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8 +; CHECK-NEXT: store double 3.000000e+00, double* [[TMP]], align 8 +; CHECK-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8 +; CHECK-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8 +; CHECK-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8 +; CHECK-NEXT: ret void +; +bb: + %tmp = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 2 + %tmp1 = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 1 + %tmp2 = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 0 + %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0,i32 1, i64 2 + %tmp4 = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 1, i64 1 + %tmp5 = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 1, i32 0 + %tmp6 = bitcast double* %tmp2 to i8* + call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) %tmp6, i8 0, i64 48, i1 false) + br i1 %cond, label %bb7, label %bb8 + +bb7: ; preds = %bb + br label %bb9 + +bb8: ; preds = %bb + br label %bb9 + +bb9: ; preds = %bb8, %bb7 + store double 1.0, double* %tmp2, align 8 + store double 2.0, double* %tmp1, align 8 + store double 3.0, double* %tmp, align 8 + store double 4.0, double* %tmp5, align 8 + store double 5.0, double* %tmp4, align 8 + store double 6.0, double* %tmp3, align 8 + ret void +} + +define void @overlap2(%struct.ham* %arg, i1 %cond) { +; CHECK-LABEL: @overlap2( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], 
%struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP2]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) [[TMP6]], i8 0, i64 48, i1 false) +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]] +; CHECK: bb7: +; CHECK-NEXT: call void @may_throw() +; CHECK-NEXT: br label [[BB9:%.*]] +; CHECK: bb8: +; CHECK-NEXT: br label [[BB9]] +; CHECK: bb9: +; CHECK-NEXT: store double 1.000000e+00, double* [[TMP2]], align 8 +; CHECK-NEXT: store double 2.000000e+00, double* [[TMP1]], align 8 +; CHECK-NEXT: store double 3.000000e+00, double* [[TMP]], align 8 +; CHECK-NEXT: store double 4.000000e+00, double* [[TMP5]], align 8 +; CHECK-NEXT: store double 5.000000e+00, double* [[TMP4]], align 8 +; CHECK-NEXT: store double 6.000000e+00, double* [[TMP3]], align 8 +; CHECK-NEXT: ret void +; +bb: + %tmp = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 2 + %tmp1 = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 1 + %tmp2 = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 0 + %tmp3 = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0,i32 1, i64 2 + %tmp4 = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 1, i64 1 + %tmp5 = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 1, i32 0 + %tmp6 = bitcast double* %tmp2 to i8* + call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) %tmp6, i8 0, i64 48, i1 false) + br i1 %cond, label %bb7, label %bb8 + +bb7: ; preds = %bb + call void @may_throw() + br label %bb9 + +bb8: ; preds = %bb + br label %bb9 + +bb9: ; preds = %bb8, %bb7 + store double 1.0, double* %tmp2, align 8 + store double 2.0, double* %tmp1, align 8 + store double 3.0, double* %tmp, align 8 + store double 4.0, double* %tmp5, align 8 + store double 5.0, double* %tmp4, align 8 + store double 6.0, double* %tmp3, align 8 + ret void +} diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll index 520a6eaccf09f..060cb8c17bdf4 100644 --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-simple.ll @@ -127,7 +127,7 @@ define void @overlapping_read(i32* %P) { ; CHECK: bb1: ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: ret void +; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: ; CHECK-NEXT: store i32 2, i32* [[P]] ; CHECK-NEXT: ret void @@ -142,8 +142,109 @@ define void @overlapping_read(i32* %P) { bb1: br label %bb3 bb2: - ret void + br label %bb3 bb3: store i32 2, i32* %P ret void } + +define void @test10(i32* %P) { +; CHECK-LABEL: @test10( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: 
+; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 0, i32* %P + br label %bb3 +bb2: + ret void +bb3: + ret void +} + + +define void @test11() { +; CHECK-LABEL: @test11( +; CHECK-NEXT: [[P:%.*]] = alloca i32 +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 0, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + %P = alloca i32 + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 0, i32* %P + br label %bb3 +bb2: + ret void +bb3: + ret void +} + + +define void @test12(i32* %P) { +; CHECK-LABEL: @test12( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: ret void +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 1, i32* %P + br label %bb3 +bb2: + store i32 1, i32* %P + ret void +bb3: + ret void +} + + +define void @test13(i32* %P) { +; CHECK-LABEL: @test13( +; CHECK-NEXT: store i32 0, i32* [[P:%.*]] +; CHECK-NEXT: br i1 true, label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: bb1: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3:%.*]] +; CHECK: bb2: +; CHECK-NEXT: store i32 1, i32* [[P]] +; CHECK-NEXT: br label [[BB3]] +; CHECK: bb3: +; CHECK-NEXT: ret void +; + store i32 0, i32* %P + br i1 true, label %bb1, label %bb2 +bb1: + store i32 1, i32* %P + br label %bb3 +bb2: + store i32 1, i32* %P + br label %bb3 +bb3: + ret void +} diff --git a/llvm/test/Transforms/DivRemPairs/PowerPC/div-expanded-rem-pair.ll b/llvm/test/Transforms/DivRemPairs/PowerPC/div-expanded-rem-pair.ll index 04a8a7721e91e..4b640b98e302d 100644 --- a/llvm/test/Transforms/DivRemPairs/PowerPC/div-expanded-rem-pair.ll +++ b/llvm/test/Transforms/DivRemPairs/PowerPC/div-expanded-rem-pair.ll @@ -100,14 +100,18 @@ end: define i32 @srem_of_srem_unexpanded(i32 %X, i32 %Y, i32 %Z) { ; CHECK-LABEL: @srem_of_srem_unexpanded( ; CHECK-NEXT: [[T0:%.*]] = mul nsw i32 [[Z:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[T1:%.*]] = sdiv i32 [[X:%.*]], [[T0]] +; CHECK-NEXT: [[X_FROZEN:%.*]] = freeze i32 [[X:%.*]] +; CHECK-NEXT: [[T0_FROZEN:%.*]] = freeze i32 [[T0]] +; CHECK-NEXT: [[T1:%.*]] = sdiv i32 [[X_FROZEN]], [[T0_FROZEN]] ; CHECK-NEXT: [[T2:%.*]] = mul nsw i32 [[T0]], [[T1]] -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[T1]], [[T0]] -; CHECK-NEXT: [[T3_DECOMPOSED:%.*]] = sub i32 [[X]], [[TMP1]] -; CHECK-NEXT: [[T4:%.*]] = sdiv i32 [[T3_DECOMPOSED]], [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[T1]], [[T0_FROZEN]] +; CHECK-NEXT: [[T3_DECOMPOSED:%.*]] = sub i32 [[X_FROZEN]], [[TMP1]] +; CHECK-NEXT: [[T3_DECOMPOSED_FROZEN:%.*]] = freeze i32 [[T3_DECOMPOSED]] +; CHECK-NEXT: [[Y_FROZEN:%.*]] = freeze i32 [[Y]] +; CHECK-NEXT: [[T4:%.*]] = sdiv i32 [[T3_DECOMPOSED_FROZEN]], [[Y_FROZEN]] ; CHECK-NEXT: [[T5:%.*]] = mul nsw i32 [[T4]], [[Y]] -; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[T4]], [[Y]] -; CHECK-NEXT: [[T6_DECOMPOSED:%.*]] = sub i32 [[T3_DECOMPOSED]], [[TMP2]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[T4]], [[Y_FROZEN]] +; CHECK-NEXT: [[T6_DECOMPOSED:%.*]] = sub i32 [[T3_DECOMPOSED_FROZEN]], [[TMP2]] ; CHECK-NEXT: ret i32 [[T6_DECOMPOSED]] ; %t0 = mul nsw i32 %Z, %Y diff --git 
a/llvm/test/Transforms/DivRemPairs/PowerPC/div-rem-pairs.ll b/llvm/test/Transforms/DivRemPairs/PowerPC/div-rem-pairs.ll index 4e95e0e399ef3..3692d7d224f10 100644 --- a/llvm/test/Transforms/DivRemPairs/PowerPC/div-rem-pairs.ll +++ b/llvm/test/Transforms/DivRemPairs/PowerPC/div-rem-pairs.ll @@ -5,9 +5,11 @@ declare void @foo(i32, i32) define void @decompose_illegal_srem_same_block(i32 %a, i32 %b) { ; CHECK-LABEL: @decompose_illegal_srem_same_block( -; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[DIV]], [[B]] -; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i32 [[A]], [[TMP1]] +; CHECK-NEXT: [[A_FROZEN:%.*]] = freeze i32 [[A:%.*]] +; CHECK-NEXT: [[B_FROZEN:%.*]] = freeze i32 [[B:%.*]] +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[A_FROZEN]], [[B_FROZEN]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[DIV]], [[B_FROZEN]] +; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i32 [[A_FROZEN]], [[TMP1]] ; CHECK-NEXT: call void @foo(i32 [[REM_DECOMPOSED]], i32 [[DIV]]) ; CHECK-NEXT: ret void ; @@ -19,9 +21,11 @@ define void @decompose_illegal_srem_same_block(i32 %a, i32 %b) { define void @decompose_illegal_urem_same_block(i32 %a, i32 %b) { ; CHECK-LABEL: @decompose_illegal_urem_same_block( -; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[DIV]], [[B]] -; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i32 [[A]], [[TMP1]] +; CHECK-NEXT: [[A_FROZEN:%.*]] = freeze i32 [[A:%.*]] +; CHECK-NEXT: [[B_FROZEN:%.*]] = freeze i32 [[B:%.*]] +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[A_FROZEN]], [[B_FROZEN]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[DIV]], [[B_FROZEN]] +; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i32 [[A_FROZEN]], [[TMP1]] ; CHECK-NEXT: call void @foo(i32 [[REM_DECOMPOSED]], i32 [[DIV]]) ; CHECK-NEXT: ret void ; @@ -37,9 +41,11 @@ define void @decompose_illegal_urem_same_block(i32 %a, i32 %b) { define i32 @hoist_sdiv(i32 %a, i32 %b) { ; CHECK-LABEL: @hoist_sdiv( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[DIV]], [[B]] -; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i32 [[A]], [[TMP0]] +; CHECK-NEXT: [[A_FROZEN:%.*]] = freeze i32 [[A:%.*]] +; CHECK-NEXT: [[B_FROZEN:%.*]] = freeze i32 [[B:%.*]] +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[A_FROZEN]], [[B_FROZEN]] +; CHECK-NEXT: [[TMP0:%.*]] = mul i32 [[DIV]], [[B_FROZEN]] +; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i32 [[A_FROZEN]], [[TMP0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[REM_DECOMPOSED]], 42 ; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[END:%.*]] ; CHECK: if: @@ -67,9 +73,11 @@ end: define i64 @hoist_udiv(i64 %a, i64 %b) { ; CHECK-LABEL: @hoist_udiv( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIV:%.*]] = udiv i64 [[A:%.*]], [[B:%.*]] -; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[DIV]], [[B]] -; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i64 [[A]], [[TMP0]] +; CHECK-NEXT: [[A_FROZEN:%.*]] = freeze i64 [[A:%.*]] +; CHECK-NEXT: [[B_FROZEN:%.*]] = freeze i64 [[B:%.*]] +; CHECK-NEXT: [[DIV:%.*]] = udiv i64 [[A_FROZEN]], [[B_FROZEN]] +; CHECK-NEXT: [[TMP0:%.*]] = mul i64 [[DIV]], [[B_FROZEN]] +; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i64 [[A_FROZEN]], [[TMP0]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[REM_DECOMPOSED]], 42 ; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[END:%.*]] ; CHECK: if: @@ -97,12 +105,14 @@ end: define i16 @hoist_srem(i16 %a, i16 %b) { ; CHECK-LABEL: @hoist_srem( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIV:%.*]] = sdiv i16 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[A_FROZEN:%.*]] = freeze i16 
[[A:%.*]] +; CHECK-NEXT: [[B_FROZEN:%.*]] = freeze i16 [[B:%.*]] +; CHECK-NEXT: [[DIV:%.*]] = sdiv i16 [[A_FROZEN]], [[B_FROZEN]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[DIV]], 42 ; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[END:%.*]] ; CHECK: if: -; CHECK-NEXT: [[TMP0:%.*]] = mul i16 [[DIV]], [[B]] -; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i16 [[A]], [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = mul i16 [[DIV]], [[B_FROZEN]] +; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i16 [[A_FROZEN]], [[TMP0]] ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: [[RET:%.*]] = phi i16 [ [[REM_DECOMPOSED]], [[IF]] ], [ 3, [[ENTRY:%.*]] ] @@ -127,12 +137,14 @@ end: define i8 @hoist_urem(i8 %a, i8 %b) { ; CHECK-LABEL: @hoist_urem( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIV:%.*]] = udiv i8 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[A_FROZEN:%.*]] = freeze i8 [[A:%.*]] +; CHECK-NEXT: [[B_FROZEN:%.*]] = freeze i8 [[B:%.*]] +; CHECK-NEXT: [[DIV:%.*]] = udiv i8 [[A_FROZEN]], [[B_FROZEN]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[DIV]], 42 ; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[END:%.*]] ; CHECK: if: -; CHECK-NEXT: [[TMP0:%.*]] = mul i8 [[DIV]], [[B]] -; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i8 [[A]], [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = mul i8 [[DIV]], [[B_FROZEN]] +; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i8 [[A_FROZEN]], [[TMP0]] ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: [[RET:%.*]] = phi i8 [ [[REM_DECOMPOSED]], [[IF]] ], [ 3, [[ENTRY:%.*]] ] @@ -157,14 +169,18 @@ end: define i32 @srem_of_srem_unexpanded(i32 %X, i32 %Y, i32 %Z) { ; CHECK-LABEL: @srem_of_srem_unexpanded( ; CHECK-NEXT: [[T0:%.*]] = mul nsw i32 [[Z:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[T1:%.*]] = sdiv i32 [[X:%.*]], [[T0]] +; CHECK-NEXT: [[X_FROZEN:%.*]] = freeze i32 [[X:%.*]] +; CHECK-NEXT: [[T0_FROZEN:%.*]] = freeze i32 [[T0]] +; CHECK-NEXT: [[T1:%.*]] = sdiv i32 [[X_FROZEN]], [[T0_FROZEN]] ; CHECK-NEXT: [[T2:%.*]] = mul nsw i32 [[T0]], [[T1]] -; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[T1]], [[T0]] -; CHECK-NEXT: [[T3_DECOMPOSED:%.*]] = sub i32 [[X]], [[TMP1]] -; CHECK-NEXT: [[T4:%.*]] = sdiv i32 [[T3_DECOMPOSED]], [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = mul i32 [[T1]], [[T0_FROZEN]] +; CHECK-NEXT: [[T3_DECOMPOSED:%.*]] = sub i32 [[X_FROZEN]], [[TMP1]] +; CHECK-NEXT: [[T3_DECOMPOSED_FROZEN:%.*]] = freeze i32 [[T3_DECOMPOSED]] +; CHECK-NEXT: [[Y_FROZEN:%.*]] = freeze i32 [[Y]] +; CHECK-NEXT: [[T4:%.*]] = sdiv i32 [[T3_DECOMPOSED_FROZEN]], [[Y_FROZEN]] ; CHECK-NEXT: [[T5:%.*]] = mul nsw i32 [[T4]], [[Y]] -; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[T4]], [[Y]] -; CHECK-NEXT: [[T6_DECOMPOSED:%.*]] = sub i32 [[T3_DECOMPOSED]], [[TMP2]] +; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[T4]], [[Y_FROZEN]] +; CHECK-NEXT: [[T6_DECOMPOSED:%.*]] = sub i32 [[T3_DECOMPOSED_FROZEN]], [[TMP2]] ; CHECK-NEXT: ret i32 [[T6_DECOMPOSED]] ; %t0 = mul nsw i32 %Z, %Y @@ -289,12 +305,14 @@ end: define i128 @dont_hoist_urem(i128 %a, i128 %b) { ; CHECK-LABEL: @dont_hoist_urem( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIV:%.*]] = udiv i128 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[A_FROZEN:%.*]] = freeze i128 [[A:%.*]] +; CHECK-NEXT: [[B_FROZEN:%.*]] = freeze i128 [[B:%.*]] +; CHECK-NEXT: [[DIV:%.*]] = udiv i128 [[A_FROZEN]], [[B_FROZEN]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i128 [[DIV]], 42 ; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[END:%.*]] ; CHECK: if: -; CHECK-NEXT: [[TMP0:%.*]] = mul i128 [[DIV]], [[B]] -; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i128 [[A]], [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = mul i128 [[DIV]], [[B_FROZEN]] +; 
CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i128 [[A_FROZEN]], [[TMP0]] ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: [[RET:%.*]] = phi i128 [ [[REM_DECOMPOSED]], [[IF]] ], [ 3, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/DivRemPairs/X86/div-rem-pairs.ll b/llvm/test/Transforms/DivRemPairs/X86/div-rem-pairs.ll index 5a5deb5e41b4a..e054dac780cca 100644 --- a/llvm/test/Transforms/DivRemPairs/X86/div-rem-pairs.ll +++ b/llvm/test/Transforms/DivRemPairs/X86/div-rem-pairs.ll @@ -281,12 +281,14 @@ end: define i128 @dont_hoist_urem(i128 %a, i128 %b) { ; CHECK-LABEL: @dont_hoist_urem( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[DIV:%.*]] = udiv i128 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[A_FROZEN:%.*]] = freeze i128 [[A:%.*]] +; CHECK-NEXT: [[B_FROZEN:%.*]] = freeze i128 [[B:%.*]] +; CHECK-NEXT: [[DIV:%.*]] = udiv i128 [[A_FROZEN]], [[B_FROZEN]] ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i128 [[DIV]], 42 ; CHECK-NEXT: br i1 [[CMP]], label [[IF:%.*]], label [[END:%.*]] ; CHECK: if: -; CHECK-NEXT: [[TMP0:%.*]] = mul i128 [[DIV]], [[B]] -; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i128 [[A]], [[TMP0]] +; CHECK-NEXT: [[TMP0:%.*]] = mul i128 [[DIV]], [[B_FROZEN]] +; CHECK-NEXT: [[REM_DECOMPOSED:%.*]] = sub i128 [[A_FROZEN]], [[TMP0]] ; CHECK-NEXT: br label [[END]] ; CHECK: end: ; CHECK-NEXT: [[RET:%.*]] = phi i128 [ [[REM_DECOMPOSED]], [[IF]] ], [ 3, [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/GVN/PRE/volatile.ll b/llvm/test/Transforms/GVN/PRE/volatile.ll index ccc5bbfa48e48..552f8dce78330 100644 --- a/llvm/test/Transforms/GVN/PRE/volatile.ll +++ b/llvm/test/Transforms/GVN/PRE/volatile.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; Tests that check our handling of volatile instructions encountered ; when scanning for dependencies ; RUN: opt -basicaa -gvn -S < %s | FileCheck %s @@ -5,9 +6,11 @@ ; Check that we can bypass a volatile load when searching ; for dependencies of a non-volatile load define i32 @test1(i32* nocapture %p, i32* nocapture %q) { -; CHECK-LABEL: test1 -; CHECK: %0 = load volatile i32, i32* %q -; CHECK-NEXT: ret i32 0 +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, i32* [[Q:%.*]] +; CHECK-NEXT: ret i32 0 +; entry: %x = load i32, i32* %p load volatile i32, i32* %q @@ -16,13 +19,16 @@ entry: ret i32 %add } -; We can not value forward if the query instruction is +; We can not value forward if the query instruction is ; volatile, this would be (in effect) removing the volatile load define i32 @test2(i32* nocapture %p, i32* nocapture %q) { -; CHECK-LABEL: test2 -; CHECK: %x = load i32, i32* %p -; CHECK-NEXT: %y = load volatile i32, i32* %p -; CHECK-NEXT: %add = sub i32 %y, %x +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P:%.*]] +; CHECK-NEXT: [[Y:%.*]] = load volatile i32, i32* [[P]] +; CHECK-NEXT: [[ADD:%.*]] = sub i32 [[Y]], [[X]] +; CHECK-NEXT: ret i32 [[ADD]] +; entry: %x = load i32, i32* %p %y = load volatile i32, i32* %p @@ -33,10 +39,14 @@ entry: ; If the query instruction is itself volatile, we *cannot* ; reorder it even if p and q are noalias define i32 @test3(i32* noalias nocapture %p, i32* noalias nocapture %q) { -; CHECK-LABEL: test3 -; CHECK: %x = load i32, i32* %p -; CHECK-NEXT: %0 = load volatile i32, i32* %q -; CHECK-NEXT: %y = load volatile i32, i32* %p +; CHECK-LABEL: @test3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = load volatile i32, i32* [[Q:%.*]] +; CHECK-NEXT: 
[[Y:%.*]] = load volatile i32, i32* [[P]] +; CHECK-NEXT: [[ADD:%.*]] = sub i32 [[Y]], [[X]] +; CHECK-NEXT: ret i32 [[ADD]] +; entry: %x = load i32, i32* %p load volatile i32, i32* %q @@ -45,14 +55,18 @@ entry: ret i32 %add } -; If an encountered instruction is both volatile and ordered, -; we need to use the strictest ordering of either. In this +; If an encountered instruction is both volatile and ordered, +; we need to use the strictest ordering of either. In this ; case, the ordering prevents forwarding. define i32 @test4(i32* noalias nocapture %p, i32* noalias nocapture %q) { -; CHECK-LABEL: test4 -; CHECK: %x = load i32, i32* %p -; CHECK-NEXT: %0 = load atomic volatile i32, i32* %q seq_cst -; CHECK-NEXT: %y = load atomic i32, i32* %p seq_cst +; CHECK-LABEL: @test4( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load i32, i32* [[P:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = load atomic volatile i32, i32* [[Q:%.*]] seq_cst, align 4 +; CHECK-NEXT: [[Y:%.*]] = load atomic i32, i32* [[P]] seq_cst, align 4 +; CHECK-NEXT: [[ADD:%.*]] = sub i32 [[Y]], [[X]] +; CHECK-NEXT: ret i32 [[ADD]] +; entry: %x = load i32, i32* %p load atomic volatile i32, i32* %q seq_cst, align 4 @@ -63,9 +77,11 @@ entry: ; Value forwarding from a volatile load is perfectly legal define i32 @test5(i32* nocapture %p, i32* nocapture %q) { -; CHECK-LABEL: test5 -; CHECK: %x = load volatile i32, i32* %p -; CHECK-NEXT: ret i32 0 +; CHECK-LABEL: @test5( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[X:%.*]] = load volatile i32, i32* [[P:%.*]] +; CHECK-NEXT: ret i32 0 +; entry: %x = load volatile i32, i32* %p %y = load i32, i32* %p @@ -75,11 +91,19 @@ entry: ; Does cross block redundancy elimination work with volatiles? define i32 @test6(i32* noalias nocapture %p, i32* noalias nocapture %q) { -; CHECK-LABEL: test6 -; CHECK: %y1 = load i32, i32* %p -; CHECK-LABEL: header -; CHECK: %x = load volatile i32, i32* %q -; CHECK-NEXT: %add = sub i32 %y1, %x +; CHECK-LABEL: @test6( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[Y1:%.*]] = load i32, i32* [[P:%.*]] +; CHECK-NEXT: call void @use(i32 [[Y1]]) +; CHECK-NEXT: br label [[HEADER:%.*]] +; CHECK: header: +; CHECK-NEXT: [[X:%.*]] = load volatile i32, i32* [[Q:%.*]] +; CHECK-NEXT: [[ADD:%.*]] = sub i32 [[Y1]], [[X]] +; CHECK-NEXT: [[CND:%.*]] = icmp eq i32 [[ADD]], 0 +; CHECK-NEXT: br i1 [[CND]], label [[EXIT:%.*]], label [[HEADER]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; entry: %y1 = load i32, i32* %p call void @use(i32 %y1) @@ -96,15 +120,25 @@ exit: ; Does cross block PRE work with volatiles? 
define i32 @test7(i1 %c, i32* noalias nocapture %p, i32* noalias nocapture %q) { -; CHECK-LABEL: test7 -; CHECK-LABEL: entry.header_crit_edge: -; CHECK: %y.pre = load i32, i32* %p -; CHECK-LABEL: skip: -; CHECK: %y1 = load i32, i32* %p -; CHECK-LABEL: header: -; CHECK: %y = phi i32 -; CHECK-NEXT: %x = load volatile i32, i32* %q -; CHECK-NEXT: %add = sub i32 %y, %x +; CHECK-LABEL: @test7( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C:%.*]], label [[ENTRY_HEADER_CRIT_EDGE:%.*]], label [[SKIP:%.*]] +; CHECK: entry.header_crit_edge: +; CHECK-NEXT: [[Y_PRE:%.*]] = load i32, i32* [[P:%.*]] +; CHECK-NEXT: br label [[HEADER:%.*]] +; CHECK: skip: +; CHECK-NEXT: [[Y1:%.*]] = load i32, i32* [[P]] +; CHECK-NEXT: call void @use(i32 [[Y1]]) +; CHECK-NEXT: br label [[HEADER]] +; CHECK: header: +; CHECK-NEXT: [[Y:%.*]] = phi i32 [ [[Y_PRE]], [[ENTRY_HEADER_CRIT_EDGE]] ], [ [[Y]], [[HEADER]] ], [ [[Y1]], [[SKIP]] ] +; CHECK-NEXT: [[X:%.*]] = load volatile i32, i32* [[Q:%.*]] +; CHECK-NEXT: [[ADD:%.*]] = sub i32 [[Y]], [[X]] +; CHECK-NEXT: [[CND:%.*]] = icmp eq i32 [[ADD]], 0 +; CHECK-NEXT: br i1 [[CND]], label [[EXIT:%.*]], label [[HEADER]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; entry: br i1 %c, label %header, label %skip skip: @@ -124,15 +158,26 @@ exit: ; Another volatile PRE case - two paths through a loop ; load in preheader, one path read only, one not define i32 @test8(i1 %b, i1 %c, i32* noalias %p, i32* noalias %q) { -; CHECK-LABEL: test8 -; CHECK-LABEL: entry -; CHECK: %y1 = load i32, i32* %p -; CHECK-LABEL: header: -; CHECK: %y = phi i32 -; CHECK-NEXT: %x = load volatile i32, i32* %q -; CHECK-NOT: load -; CHECK-LABEL: skip.header_crit_edge: -; CHECK: %y.pre = load i32, i32* %p +; CHECK-LABEL: @test8( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[Y1:%.*]] = load i32, i32* [[P:%.*]] +; CHECK-NEXT: call void @use(i32 [[Y1]]) +; CHECK-NEXT: br label [[HEADER:%.*]] +; CHECK: header: +; CHECK-NEXT: [[Y:%.*]] = phi i32 [ [[Y_PRE:%.*]], [[SKIP_HEADER_CRIT_EDGE:%.*]] ], [ [[Y]], [[HEADER]] ], [ [[Y1]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[X:%.*]] = load volatile i32, i32* [[Q:%.*]] +; CHECK-NEXT: call void @use(i32 [[Y]]) +; CHECK-NEXT: br i1 [[B:%.*]], label [[SKIP:%.*]], label [[HEADER]] +; CHECK: skip: +; CHECK-NEXT: call void @clobber(i32* [[P]], i32* [[Q]]) +; CHECK-NEXT: br i1 [[C:%.*]], label [[SKIP_HEADER_CRIT_EDGE]], label [[EXIT:%.*]] +; CHECK: skip.header_crit_edge: +; CHECK-NEXT: [[Y_PRE]] = load i32, i32* [[P]] +; CHECK-NEXT: br label [[HEADER]] +; CHECK: exit: +; CHECK-NEXT: [[ADD:%.*]] = sub i32 [[Y]], [[X]] +; CHECK-NEXT: ret i32 [[ADD]] +; entry: %y1 = load i32, i32* %p call void @use(i32 %y1) @@ -143,7 +188,7 @@ header: call void @use(i32 %y) br i1 %b, label %skip, label %header skip: - ; escaping the arguments is explicitly required since we marked + ; escaping the arguments is explicitly required since we marked ; them noalias call void @clobber(i32* %p, i32* %q) br i1 %c, label %header, label %exit @@ -153,13 +198,15 @@ exit: } define i32 @test9(i32* %V) { +; CHECK-LABEL: @test9( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[LOAD:%.*]] = load volatile i32, i32* [[V:%.*]], !range !0 +; CHECK-NEXT: ret i32 0 +; entry: %load = load volatile i32, i32* %V, !range !0 ret i32 %load } -; CHECK-LABEL: test9 -; CHECK: load volatile -; CHECK: ret i32 0 declare void @use(i32) readonly declare void @clobber(i32* %p, i32* %q) diff --git a/llvm/test/Transforms/GVNSink/indirect-call.ll b/llvm/test/Transforms/GVNSink/indirect-call.ll index da98ed0819a65..57b7297c84bd6 100644 --- 
a/llvm/test/Transforms/GVNSink/indirect-call.ll +++ b/llvm/test/Transforms/GVNSink/indirect-call.ll @@ -68,3 +68,27 @@ if.end: %tobool4 = icmp ne i8 %obeys.0, 0 ret i1 %tobool4 } + +; Make sure no indirect call is introduced from direct calls +declare i8 @ext2(i1) +define zeroext i1 @test4(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) { +entry: + %cmp = icmp uge i32 %blksA, %nblks + br i1 %flag, label %if.then, label %if.else + +; CHECK-LABEL: test4 +; CHECK: call i8 @ext( +; CHECK: call i8 @ext2( +if.then: + %frombool1 = call i8 @ext(i1 %cmp) + br label %if.end + +if.else: + %frombool3 = call i8 @ext2(i1 %cmp) + br label %if.end + +if.end: + %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ] + %tobool4 = icmp ne i8 %obeys.0, 0 + ret i1 %tobool4 +} diff --git a/llvm/test/Transforms/GVNSink/sink-common-code.ll b/llvm/test/Transforms/GVNSink/sink-common-code.ll index 02b1eb7fe2595..293e0daff5fb1 100644 --- a/llvm/test/Transforms/GVNSink/sink-common-code.ll +++ b/llvm/test/Transforms/GVNSink/sink-common-code.ll @@ -692,6 +692,73 @@ if.end: ; CHECK-NOT: exact ; CHECK: } + +; CHECK-LABEL: @common_bitcast( +; CHECK: %. = select i1 %flag, float 2.000000e+00, float 1.000000e+00 +; CHECK: %[[a1:.*]] = bitcast i32* %x to float* +; CHECK: store float %., float* %[[a1]] +define i32 @common_bitcast(i1 zeroext %flag, i32* %x) { +entry: + br i1 %flag, label %if.then, label %if.else + +if.then: + %a = bitcast i32* %x to float* + store float 2.0, float* %a + br label %if.end + +if.else: + %b = bitcast i32* %x to float* + store float 1.0, float* %b + br label %if.end + +if.end: + ret i32 1 +} + +; CHECK-LABEL: @common_addrspacecast( +; CHECK: %. = select i1 %flag, i32 9, i32 10 +; CHECK: %[[a2:.*]] = addrspacecast i32* %x to i32 addrspace(1)* +; CHECK: store i32 %., i32 addrspace(1)* %[[a2]] +define i32 @common_addrspacecast(i1 zeroext %flag, i32* %x) { +entry: + br i1 %flag, label %if.then, label %if.else + +if.then: + %a = addrspacecast i32* %x to i32 addrspace(1)* + store i32 9, i32 addrspace(1)* %a + br label %if.end + +if.else: + %b = addrspacecast i32* %x to i32 addrspace(1)* + store i32 10, i32 addrspace(1)* %b + br label %if.end + +if.end: + ret i32 1 +} + +; Don't merge different address spaces +; CHECK-LABEL: @no_common_addrspacecast( +; CHECK: addrspacecast i32* %x to i32 addrspace(1)* +; CHECK: addrspacecast i32* %x to i32 addrspace(3)* +define i32 @no_common_addrspacecast(i1 zeroext %flag, i32* %x) { +entry: + br i1 %flag, label %if.then, label %if.else + +if.then: + %a = addrspacecast i32* %x to i32 addrspace(1)* + store i32 9, i32 addrspace(1)* %a + br label %if.end + +if.else: + %b = addrspacecast i32* %x to i32 addrspace(3)* + store i32 10, i32 addrspace(3)* %b + br label %if.end + +if.end: + ret i32 1 +} + ; CHECK: !0 = !{!1, !1, i64 0} ; CHECK: !1 = !{!"float", !2} ; CHECK: !2 = !{!"an example type tree"} diff --git a/llvm/test/Transforms/GlobalOpt/null-check-is-use-pr35760.ll b/llvm/test/Transforms/GlobalOpt/null-check-is-use-pr35760.ll new file mode 100644 index 0000000000000..32516b6db7ca2 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/null-check-is-use-pr35760.ll @@ -0,0 +1,41 @@ +; RUN: opt -S -globalopt -o - < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@_ZL3g_i = internal global i32* null, align 8 +@.str = private unnamed_addr constant [2 x i8] c"0\00", align 1 +@.str.1 = private unnamed_addr constant [2 x i8] c"1\00", align 1 + +define 
dso_local i32 @main() { + store i32* null, i32** @_ZL3g_i, align 8 + call void @_ZL13PutsSomethingv() + ret i32 0 +} + +; CHECK-LABEL: define {{.*}} @_ZL13PutsSomethingv() +; CHECK: [[gvLoad:%.*]] = load i32*, i32** @_ZL3g_i +; CHECK-NEXT: icmp eq i32* [[gvLoad]], null +define internal void @_ZL13PutsSomethingv() { + %1 = load i32*, i32** @_ZL3g_i, align 8 + %2 = icmp eq i32* %1, null + br i1 %2, label %3, label %7 + +3: ; preds = %0 + %4 = call noalias i8* @malloc(i64 4) #3 + %5 = bitcast i8* %4 to i32* + store i32* %5, i32** @_ZL3g_i, align 8 + %6 = call i32 @puts(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str, i64 0, i64 0)) + br label %9 + +7: ; preds = %0 + %8 = call i32 @puts(i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.1, i64 0, i64 0)) + br label %9 + +9: ; preds = %7, %3 + ret void +} + +declare dso_local noalias i8* @malloc(i64) + +declare dso_local i32 @puts(i8* nocapture readonly) diff --git a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll index 39a8c3c923276..8d75cc34be38b 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-vector-shifts.ll @@ -2678,6 +2678,237 @@ define <32 x i16> @avx512_psllv_w_512_undef(<32 x i16> %v) { ; Vector Masked Shift Amounts ; +define <8 x i16> @sse2_psra_w_128_masked(<8 x i16> %v, <8 x i16> %a) { +; CHECK-LABEL: @sse2_psra_w_128_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <8 x i16> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = ashr <8 x i16> [[V:%.*]], [[TMP2]] +; CHECK-NEXT: ret <8 x i16> [[TMP3]] +; + %1 = and <8 x i16> %a, + %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %v, <8 x i16> %1) + ret <8 x i16> %2 +} + +define <8 x i32> @avx2_psra_d_256_masked(<8 x i32> %v, <4 x i32> %a) { +; CHECK-LABEL: @avx2_psra_d_256_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = ashr <8 x i32> [[V:%.*]], [[TMP2]] +; CHECK-NEXT: ret <8 x i32> [[TMP3]] +; + %1 = and <4 x i32> %a, + %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %v, <4 x i32> %1) + ret <8 x i32> %2 +} + +define <8 x i64> @avx512_psra_q_512_masked(<8 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx512_psra_q_512_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = ashr <8 x i64> [[V:%.*]], [[TMP2]] +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %1 = and <2 x i64> %a, + %2 = tail call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %v, <2 x i64> %1) + ret <8 x i64> %2 +} + +define <4 x i32> @sse2_psrl_d_128_masked(<4 x i32> %v, <4 x i32> %a) { +; CHECK-LABEL: @sse2_psrl_d_128_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i32> [[V:%.*]], [[TMP2]] +; CHECK-NEXT: ret <4 x i32> [[TMP3]] +; + %1 = and <4 x i32> %a, + %2 = tail call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %v, <4 x i32> %1) + ret <4 x i32> %2 +} + +define <4 x i64> @avx2_psrl_q_256_masked(<4 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @avx2_psrl_q_256_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], +; 
CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[V:%.*]], [[TMP2]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; + %1 = and <2 x i64> %a, + %2 = tail call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %v, <2 x i64> %1) + ret <4 x i64> %2 +} + +define <32 x i16> @avx512_psrl_w_512_masked(<32 x i16> %v, <8 x i16> %a) { +; CHECK-LABEL: @avx512_psrl_w_512_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <8 x i16> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = lshr <32 x i16> [[V:%.*]], [[TMP2]] +; CHECK-NEXT: ret <32 x i16> [[TMP3]] +; + %1 = and <8 x i16> %a, + %2 = tail call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %v, <8 x i16> %1) + ret <32 x i16> %2 +} + +define <2 x i64> @sse2_psll_q_128_masked(<2 x i64> %v, <2 x i64> %a) { +; CHECK-LABEL: @sse2_psll_q_128_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <2 x i64> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shl <2 x i64> [[V:%.*]], [[TMP2]] +; CHECK-NEXT: ret <2 x i64> [[TMP3]] +; + %1 = and <2 x i64> %a, + %2 = tail call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %v, <2 x i64> %1) + ret <2 x i64> %2 +} + +define <16 x i16> @avx2_psll_w_256_masked(<16 x i16> %v, <8 x i16> %a) { +; CHECK-LABEL: @avx2_psll_w_256_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <8 x i16> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i16> [[V:%.*]], [[TMP2]] +; CHECK-NEXT: ret <16 x i16> [[TMP3]] +; + %1 = and <8 x i16> %a, + %2 = tail call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %v, <8 x i16> %1) + ret <16 x i16> %2 +} + +define <16 x i32> @avx512_psll_d_512_masked(<16 x i32> %v, <4 x i32> %a) { +; CHECK-LABEL: @avx512_psll_d_512_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[A:%.*]], +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i32> [[V:%.*]], [[TMP2]] +; CHECK-NEXT: ret <16 x i32> [[TMP3]] +; + %1 = and <4 x i32> %a, + %2 = tail call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %v, <4 x i32> %1) + ret <16 x i32> %2 +} + +define <8 x i16> @sse2_psrai_w_128_masked(<8 x i16> %v, i32 %a) { +; CHECK-LABEL: @sse2_psrai_w_128_masked( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[A:%.*]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i16> undef, i16 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i16> [[DOTSPLATINSERT]], <8 x i16> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = ashr <8 x i16> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <8 x i16> [[TMP3]] +; + %1 = and i32 %a, 15 + %2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %v, i32 %1) + ret <8 x i16> %2 +} + +define <8 x i32> @avx2_psrai_d_256_masked(<8 x i32> %v, i32 %a) { +; CHECK-LABEL: @avx2_psrai_d_256_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 31 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = ashr <8 x i32> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = 
and i32 %a, 31 + %2 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %v, i32 %1) + ret <8 x i32> %2 +} + +define <8 x i64> @avx512_psrai_q_512_masked(<8 x i64> %v, i32 %a) { +; CHECK-LABEL: @avx512_psrai_q_512_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 63 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = ashr <8 x i64> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <8 x i64> [[TMP3]] +; + %1 = and i32 %a, 63 + %2 = tail call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %v, i32 %1) + ret <8 x i64> %2 +} + +define <4 x i32> @sse2_psrli_d_128_masked(<4 x i32> %v, i32 %a) { +; CHECK-LABEL: @sse2_psrli_d_128_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 31 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i32> [[DOTSPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = lshr <4 x i32> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; + %1 = and i32 %a, 31 + %2 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %v, i32 %1) + ret <4 x i32> %2 +} + +define <4 x i64> @avx2_psrli_q_256_masked(<4 x i64> %v, i32 %a) { +; CHECK-LABEL: @avx2_psrli_q_256_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 63 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i64> undef, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i64> [[DOTSPLATINSERT]], <4 x i64> undef, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <4 x i64> [[TMP3]] +; + %1 = and i32 %a, 63 + %2 = tail call <4 x i64> @llvm.x86.avx2.psrli.q(<4 x i64> %v, i32 %1) + ret <4 x i64> %2 +} + +define <32 x i16> @avx512_psrli_w_512_masked(<32 x i16> %v, i32 %a) { +; CHECK-LABEL: @avx512_psrli_w_512_masked( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[A:%.*]] to i16 +; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 15 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <32 x i16> undef, i16 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <32 x i16> [[DOTSPLATINSERT]], <32 x i16> undef, <32 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = lshr <32 x i16> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <32 x i16> [[TMP3]] +; + %1 = and i32 %a, 15 + %2 = tail call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %v, i32 %1) + ret <32 x i16> %2 +} + +define <2 x i64> @sse2_pslli_q_128_masked(<2 x i64> %v, i32 %a) { +; CHECK-LABEL: @sse2_pslli_q_128_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 63 +; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> undef, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shl <2 x i64> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <2 x i64> [[TMP3]] +; + %1 = and i32 %a, 63 + %2 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %v, i32 %1) + ret <2 x i64> %2 +} + +define <16 x i16> @avx2_pslli_w_256_masked(<16 x i16> %v, i32 %a) { +; CHECK-LABEL: @avx2_pslli_w_256_masked( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[A:%.*]] to i16 +; CHECK-NEXT: 
[[TMP2:%.*]] = and i16 [[TMP1]], 15 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i16> undef, i16 [[TMP2]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i16> [[DOTSPLATINSERT]], <16 x i16> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = shl <16 x i16> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <16 x i16> [[TMP3]] +; + %1 = and i32 %a, 15 + %2 = tail call <16 x i16> @llvm.x86.avx2.pslli.w(<16 x i16> %v, i32 %1) + ret <16 x i16> %2 +} + +define <16 x i32> @avx512_pslli_d_512_masked(<16 x i32> %v, i32 %a) { +; CHECK-LABEL: @avx512_pslli_d_512_masked( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A:%.*]], 31 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shl <16 x i32> [[V:%.*]], [[DOTSPLAT]] +; CHECK-NEXT: ret <16 x i32> [[TMP2]] +; + %1 = and i32 %a, 31 + %2 = tail call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %v, i32 %1) + ret <16 x i32> %2 +} + define <4 x i32> @avx2_psrav_d_128_masked(<4 x i32> %v, <4 x i32> %a) { ; CHECK-LABEL: @avx2_psrav_d_128_masked( ; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[A:%.*]], diff --git a/llvm/test/Transforms/InstCombine/align-attr.ll b/llvm/test/Transforms/InstCombine/align-attr.ll index 16782dba2effc..2b004311cc8ea 100644 --- a/llvm/test/Transforms/InstCombine/align-attr.ll +++ b/llvm/test/Transforms/InstCombine/align-attr.ll @@ -20,7 +20,7 @@ define i32 @foo2(i32* align 32 %a) #0 { ; CHECK-LABEL: @foo2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[V:%.*]] = call i32* @func1(i32* [[A:%.*]]) -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[V]], align 32 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A]], align 32 ; CHECK-NEXT: ret i32 [[TMP0]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/all-bits-shift.ll b/llvm/test/Transforms/InstCombine/all-bits-shift.ll index ba1a281a4bb19..ac3b0a5517bb8 100644 --- a/llvm/test/Transforms/InstCombine/all-bits-shift.ll +++ b/llvm/test/Transforms/InstCombine/all-bits-shift.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -instcombine -expensive-combines=0 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-OFF -; RUN: opt -S -instcombine -expensive-combines=1 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-ON +; RUN: opt -S -instcombine < %s | FileCheck %s target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index ff3bf5665ed9f..8abb379e537e1 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ b/llvm/test/Transforms/InstCombine/assume.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -instcombine -S | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-ON -; RUN: opt < %s -instcombine -expensive-combines=0 -S | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-OFF +; RUN: opt < %s -instcombine -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -353,16 +352,12 @@ define i1 @nonnull5(i32** %a) { ; PR35846 - https://bugs.llvm.org/show_bug.cgi?id=35846 define i32 @assumption_conflicts_with_known_bits(i32 %a, i32 %b) { -; EXPENSIVE-ON-LABEL: @assumption_conflicts_with_known_bits( -; EXPENSIVE-ON-NEXT: tail call void @llvm.assume(i1 false) -; 
EXPENSIVE-ON-NEXT: ret i32 0 -; -; EXPENSIVE-OFF-LABEL: @assumption_conflicts_with_known_bits( -; EXPENSIVE-OFF-NEXT: [[AND1:%.*]] = and i32 [[B:%.*]], 3 -; EXPENSIVE-OFF-NEXT: tail call void @llvm.assume(i1 false) -; EXPENSIVE-OFF-NEXT: [[CMP2:%.*]] = icmp eq i32 [[AND1]], 0 -; EXPENSIVE-OFF-NEXT: tail call void @llvm.assume(i1 [[CMP2]]) -; EXPENSIVE-OFF-NEXT: ret i32 0 +; CHECK-LABEL: @assumption_conflicts_with_known_bits( +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[B:%.*]], 3 +; CHECK-NEXT: tail call void @llvm.assume(i1 false) +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[AND1]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]]) +; CHECK-NEXT: ret i32 0 ; %and1 = and i32 %b, 3 %B1 = lshr i32 %and1, %and1 diff --git a/llvm/test/Transforms/InstCombine/call-returned.ll b/llvm/test/Transforms/InstCombine/call-returned.ll new file mode 100644 index 0000000000000..24d95a316c420 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/call-returned.ll @@ -0,0 +1,50 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -instcombine < %s | FileCheck %s + +declare i32 @passthru_i32(i32 returned) +declare i8* @passthru_p8(i8* returned) + +define i32 @returned_const_int_arg() { +; CHECK-LABEL: @returned_const_int_arg( +; CHECK-NEXT: [[X:%.*]] = call i32 @passthru_i32(i32 42) +; CHECK-NEXT: ret i32 42 +; + %x = call i32 @passthru_i32(i32 42) + ret i32 %x +} + +define i8* @returned_const_ptr_arg() { +; CHECK-LABEL: @returned_const_ptr_arg( +; CHECK-NEXT: [[X:%.*]] = call i8* @passthru_p8(i8* null) +; CHECK-NEXT: ret i8* null +; + %x = call i8* @passthru_p8(i8* null) + ret i8* %x +} + +define i32 @returned_var_arg(i32 %arg) { +; CHECK-LABEL: @returned_var_arg( +; CHECK-NEXT: [[X:%.*]] = call i32 @passthru_i32(i32 [[ARG:%.*]]) +; CHECK-NEXT: ret i32 [[ARG]] +; + %x = call i32 @passthru_i32(i32 %arg) + ret i32 %x +} + +define i32 @returned_const_int_arg_musttail(i32 %arg) { +; CHECK-LABEL: @returned_const_int_arg_musttail( +; CHECK-NEXT: [[X:%.*]] = musttail call i32 @passthru_i32(i32 42) +; CHECK-NEXT: ret i32 [[X]] +; + %x = musttail call i32 @passthru_i32(i32 42) + ret i32 %x +} + +define i32 @returned_var_arg_musttail(i32 %arg) { +; CHECK-LABEL: @returned_var_arg_musttail( +; CHECK-NEXT: [[X:%.*]] = musttail call i32 @passthru_i32(i32 [[ARG:%.*]]) +; CHECK-NEXT: ret i32 [[X]] +; + %x = musttail call i32 @passthru_i32(i32 %arg) + ret i32 %x +} diff --git a/llvm/test/Transforms/InstCombine/ctpop-cttz.ll b/llvm/test/Transforms/InstCombine/ctpop-cttz.ll new file mode 100644 index 0000000000000..6ac3cfe36bbc0 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/ctpop-cttz.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -instcombine | FileCheck %s + +declare i32 @llvm.ctpop.i32(i32) +declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>) + +; PR43513 +; __builtin_popcount(i | -i) -> 32 - __builtin_cttz(i, false) +define i32 @ctpop1(i32 %0) { +; CHECK-LABEL: @ctpop1( +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP0:%.*]], i1 false), !range !0 +; CHECK-NEXT: ret i32 [[TMP2]] +; + %2 = sub i32 0, %0 + %3 = or i32 %0, %2 + %4 = tail call i32 @llvm.ctpop.i32(i32 %3) + %5 = sub i32 32, %4 + ret i32 %5 +} + +define <2 x i32> @ctpop1v(<2 x i32> %0) { +; CHECK-LABEL: @ctpop1v( +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[TMP0:%.*]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> , [[TMP2]] +; CHECK-NEXT: ret <2 x i32> [[TMP3]] +; + %2 = sub <2 x i32> 
zeroinitializer, %0 + %3 = or <2 x i32> %2, %0 + %4 = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %3) + ret <2 x i32> %4 +} + +define i32 @ctpop1_multiuse(i32 %0) { +; CHECK-LABEL: @ctpop1_multiuse( +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 0, [[TMP0:%.*]] +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.ctpop.i32(i32 [[TMP3]]), !range !0 +; CHECK-NEXT: [[TMP5:%.*]] = sub nuw nsw i32 32, [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP3]] +; CHECK-NEXT: ret i32 [[TMP6]] +; + %2 = sub i32 0, %0 + %3 = or i32 %0, %2 + %4 = tail call i32 @llvm.ctpop.i32(i32 %3) + %5 = sub i32 32, %4 + %6 = add i32 %5, %3 + ret i32 %6 +} + +; PR43513 +; __builtin_popcount(~i & (i-1)) -> __builtin_cttz(i, false) +define i32 @ctpop2(i32 %0) { +; CHECK-LABEL: @ctpop2( +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP0:%.*]], i1 false), !range !0 +; CHECK-NEXT: ret i32 [[TMP2]] +; + %2 = xor i32 %0, -1 + %3 = sub i32 %0, 1 + %4 = and i32 %3, %2 + %5 = tail call i32 @llvm.ctpop.i32(i32 %4) + ret i32 %5 +} + +define <2 x i32> @ctpop2v(<2 x i32> %0) { +; CHECK-LABEL: @ctpop2v( +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[TMP0:%.*]], i1 false) +; CHECK-NEXT: ret <2 x i32> [[TMP2]] +; + %2 = xor <2 x i32> %0, + %3 = add <2 x i32> %0, + %4 = and <2 x i32> %2, %3 + %5 = tail call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %4) + ret <2 x i32> %5 +} + +define i32 @ctpop2_multiuse(i32 %0) { +; CHECK-LABEL: @ctpop2_multiuse( +; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP0:%.*]], -1 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP0]], -1 +; CHECK-NEXT: [[TMP4:%.*]] = and i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.cttz.i32(i32 [[TMP0]], i1 false), !range !0 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP4]] +; CHECK-NEXT: ret i32 [[TMP6]] +; + %2 = xor i32 %0, -1 + %3 = sub i32 %0, 1 + %4 = and i32 %3, %2 + %5 = tail call i32 @llvm.ctpop.i32(i32 %4) + %6 = add i32 %5, %4 + ret i32 %6 +} diff --git a/llvm/test/Transforms/InstCombine/expensive-combines.ll b/llvm/test/Transforms/InstCombine/expensive-combines.ll deleted file mode 100644 index 28acb773bfd50..0000000000000 --- a/llvm/test/Transforms/InstCombine/expensive-combines.ll +++ /dev/null @@ -1,28 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -instcombine < %s | FileCheck %s --check-prefix=DEFAULT -; RUN: opt -S -instcombine -expensive-combines=1 < %s | FileCheck %s --check-prefix=EXPENSIVE-ON -; RUN: opt -S -instcombine -expensive-combines=0 < %s | FileCheck %s --check-prefix=EXPENSIVE-OFF - -define void @test() { -; DEFAULT-LABEL: @test( -; DEFAULT-NEXT: [[CALL:%.*]] = call i32 @passthru(i32 0) -; DEFAULT-NEXT: call void @sink(i32 0) -; DEFAULT-NEXT: ret void -; -; EXPENSIVE-ON-LABEL: @test( -; EXPENSIVE-ON-NEXT: [[CALL:%.*]] = call i32 @passthru(i32 0) -; EXPENSIVE-ON-NEXT: call void @sink(i32 0) -; EXPENSIVE-ON-NEXT: ret void -; -; EXPENSIVE-OFF-LABEL: @test( -; EXPENSIVE-OFF-NEXT: [[CALL:%.*]] = call i32 @passthru(i32 0) -; EXPENSIVE-OFF-NEXT: call void @sink(i32 [[CALL]]) -; EXPENSIVE-OFF-NEXT: ret void -; - %call = call i32 @passthru(i32 0) - call void @sink(i32 %call) - ret void -} - -declare i32 @passthru(i32 returned) -declare void @sink(i32) diff --git a/llvm/test/Transforms/InstCombine/fortify-folding.ll b/llvm/test/Transforms/InstCombine/fortify-folding.ll index ee81557615a54..b2171a44f57ef 100644 --- a/llvm/test/Transforms/InstCombine/fortify-folding.ll +++ 
b/llvm/test/Transforms/InstCombine/fortify-folding.ll @@ -82,7 +82,7 @@ define i32 @test_not_sprintf() { define i8* @test_strcat() { ; CHECK-LABEL: @test_strcat( ; CHECK-NEXT: [[STRCAT:%.*]] = call i8* @strcat(i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @a, i64 0, i64 0), i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @b, i64 0, i64 0)) -; CHECK-NEXT: ret i8* [[STRCAT]] +; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i64 0, i64 0) ; %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 %src = getelementptr inbounds [60 x i8], [60 x i8]* @b, i32 0, i32 0 @@ -126,7 +126,7 @@ define i64 @test_not_strlcat() { define i8* @test_strncat() { ; CHECK-LABEL: @test_strncat( ; CHECK-NEXT: [[STRNCAT:%.*]] = call i8* @strncat(i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @a, i64 0, i64 0), i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @b, i64 0, i64 0), i64 22) -; CHECK-NEXT: ret i8* [[STRNCAT]] +; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i64 0, i64 0) ; %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 %src = getelementptr inbounds [60 x i8], [60 x i8]* @b, i32 0, i32 0 diff --git a/llvm/test/Transforms/InstCombine/known-bits.ll b/llvm/test/Transforms/InstCombine/known-bits.ll index d7283dfc810f9..e4015d832bef6 100644 --- a/llvm/test/Transforms/InstCombine/known-bits.ll +++ b/llvm/test/Transforms/InstCombine/known-bits.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -instcombine -expensive-combines=0 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-OFF -; RUN: opt -S -instcombine -expensive-combines=1 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-ON +; RUN: opt -S -instcombine < %s | FileCheck %s define void @test_shl(i1 %x) { ; CHECK-LABEL: @test_shl( diff --git a/llvm/test/Analysis/ValueTracking/known-signbit-shift.ll b/llvm/test/Transforms/InstCombine/known-signbit-shift.ll similarity index 95% rename from llvm/test/Analysis/ValueTracking/known-signbit-shift.ll rename to llvm/test/Transforms/InstCombine/known-signbit-shift.ll index 7e9f1c2e70cd7..b00a4f83702fc 100644 --- a/llvm/test/Analysis/ValueTracking/known-signbit-shift.ll +++ b/llvm/test/Transforms/InstCombine/known-signbit-shift.ll @@ -30,7 +30,7 @@ define i1 @test_shift_negative(i32 %a, i32 %b) { ; This test should not crash opt. The shift produces poison. define i32 @test_no_sign_bit_conflict1(i1 %b) { ; CHECK-LABEL: @test_no_sign_bit_conflict1( -; CHECK-NEXT: ret i32 0 +; CHECK-NEXT: ret i32 undef ; %sel = select i1 %b, i32 8193, i32 8192 %mul = shl nsw i32 %sel, 18 @@ -41,7 +41,7 @@ define i32 @test_no_sign_bit_conflict1(i1 %b) { ; This test should not crash opt. The shift produces poison. 
define i32 @test_no_sign_bit_conflict2(i1 %b) { ; CHECK-LABEL: @test_no_sign_bit_conflict2( -; CHECK-NEXT: ret i32 0 +; CHECK-NEXT: ret i32 undef ; %sel = select i1 %b, i32 -8193, i32 -8194 %mul = shl nsw i32 %sel, 18 diff --git a/llvm/test/Transforms/InstCombine/out-of-bounds-indexes.ll b/llvm/test/Transforms/InstCombine/out-of-bounds-indexes.ll index 5c1867f810065..9559c61dda00e 100644 --- a/llvm/test/Transforms/InstCombine/out-of-bounds-indexes.ll +++ b/llvm/test/Transforms/InstCombine/out-of-bounds-indexes.ll @@ -1,20 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -instcombine -expensive-combines=0 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-OFF -; RUN: opt -S -instcombine -expensive-combines=1 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-ON +; RUN: opt -S -instcombine < %s | FileCheck %s ; Check that we don't crash on unreasonable constant indexes define i32 @test_out_of_bounds(i32 %a, i1 %x, i1 %y) { -; EXPENSIVE-OFF-LABEL: @test_out_of_bounds( -; EXPENSIVE-OFF-NEXT: entry: -; EXPENSIVE-OFF-NEXT: [[AND1:%.*]] = and i32 [[A:%.*]], 3 -; EXPENSIVE-OFF-NEXT: tail call void @llvm.assume(i1 undef) -; EXPENSIVE-OFF-NEXT: ret i32 [[AND1]] -; -; EXPENSIVE-ON-LABEL: @test_out_of_bounds( -; EXPENSIVE-ON-NEXT: entry: -; EXPENSIVE-ON-NEXT: [[AND1:%.*]] = and i32 [[A:%.*]], 3 -; EXPENSIVE-ON-NEXT: tail call void @llvm.assume(i1 false) -; EXPENSIVE-ON-NEXT: ret i32 [[AND1]] +; CHECK-LABEL: @test_out_of_bounds( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[A:%.*]], 3 +; CHECK-NEXT: tail call void @llvm.assume(i1 undef) +; CHECK-NEXT: ret i32 [[AND1]] ; entry: %and1 = and i32 %a, 3 @@ -25,15 +18,10 @@ entry: } define i128 @test_non64bit(i128 %a) { -; EXPENSIVE-OFF-LABEL: @test_non64bit( -; EXPENSIVE-OFF-NEXT: [[AND1:%.*]] = and i128 [[A:%.*]], 3 -; EXPENSIVE-OFF-NEXT: tail call void @llvm.assume(i1 undef) -; EXPENSIVE-OFF-NEXT: ret i128 [[AND1]] -; -; EXPENSIVE-ON-LABEL: @test_non64bit( -; EXPENSIVE-ON-NEXT: [[AND1:%.*]] = and i128 [[A:%.*]], 3 -; EXPENSIVE-ON-NEXT: tail call void @llvm.assume(i1 false) -; EXPENSIVE-ON-NEXT: ret i128 [[AND1]] +; CHECK-LABEL: @test_non64bit( +; CHECK-NEXT: [[AND1:%.*]] = and i128 [[A:%.*]], 3 +; CHECK-NEXT: tail call void @llvm.assume(i1 undef) +; CHECK-NEXT: ret i128 [[AND1]] ; %and1 = and i128 %a, 3 %B = lshr i128 %and1, -1 diff --git a/llvm/test/Transforms/InstCombine/phi-shifts.ll b/llvm/test/Transforms/InstCombine/phi-shifts.ll index af94d8b4001bc..732d4aee350b8 100644 --- a/llvm/test/Transforms/InstCombine/phi-shifts.ll +++ b/llvm/test/Transforms/InstCombine/phi-shifts.ll @@ -1,24 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -instcombine -expensive-combines=0 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-OFF -; RUN: opt -S -instcombine -expensive-combines=1 < %s | FileCheck %s --check-prefixes=CHECK,EXPENSIVE-ON +; RUN: opt -S -instcombine < %s | FileCheck %s ; OSS Fuzz: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=15217 define i64 @fuzz15217(i1 %cond, i8* %Ptr, i64 %Val) { -; EXPENSIVE-OFF-LABEL: @fuzz15217( -; EXPENSIVE-OFF-NEXT: entry: -; EXPENSIVE-OFF-NEXT: br i1 [[COND:%.*]], label [[END:%.*]], label [[TWO:%.*]] -; EXPENSIVE-OFF: two: -; EXPENSIVE-OFF-NEXT: br label [[END]] -; EXPENSIVE-OFF: end: -; EXPENSIVE-OFF-NEXT: ret i64 undef -; -; EXPENSIVE-ON-LABEL: @fuzz15217( -; EXPENSIVE-ON-NEXT: entry: -; EXPENSIVE-ON-NEXT: br i1 [[COND:%.*]], label [[END:%.*]], label [[TWO:%.*]] -; EXPENSIVE-ON: two: -; 
EXPENSIVE-ON-NEXT: br label [[END]] -; EXPENSIVE-ON: end: -; EXPENSIVE-ON-NEXT: ret i64 0 +; CHECK-LABEL: @fuzz15217( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[END:%.*]], label [[TWO:%.*]] +; CHECK: two: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: ret i64 undef ; entry: br i1 %cond, label %end, label %two diff --git a/llvm/test/Transforms/InstCombine/pr44541.ll b/llvm/test/Transforms/InstCombine/pr44541.ll index 3d4082a770911..a009c62394ad3 100644 --- a/llvm/test/Transforms/InstCombine/pr44541.ll +++ b/llvm/test/Transforms/InstCombine/pr44541.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -instcombine -expensive-combines=0 -instcombine-infinite-loop-threshold=2 < %s | FileCheck %s +; RUN: opt -S -instcombine -instcombine-infinite-loop-threshold=2 < %s | FileCheck %s ; This test used to cause an infinite combine loop. diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index 0f26a93a7bae3..20b6dd1a990ca 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -1348,6 +1348,29 @@ define i32 @PR27137(i32 %a) { ret i32 %s1 } +; ub-safe negation pattern +define i32 @PR27817(i32 %x) { +; CHECK-LABEL: @PR27817( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 0, [[X:%.*]] +; CHECK-NEXT: ret i32 [[SUB]] +; + %cmp = icmp eq i32 %x, -2147483648 + %sub = sub i32 0, %x + %sel = select i1 %cmp, i32 -2147483648, i32 %sub + ret i32 %sel +} + +define i32 @PR27817_nsw(i32 %x) { +; CHECK-LABEL: @PR27817_nsw( +; CHECK-NEXT: [[SUB:%.*]] = sub i32 0, [[X:%.*]] +; CHECK-NEXT: ret i32 [[SUB]] +; + %cmp = icmp eq i32 %x, -2147483648 + %sub = sub nsw i32 0, %x + %sel = select i1 %cmp, i32 -2147483648, i32 %sub + ret i32 %sel +} + define i32 @select_icmp_slt0_xor(i32 %x) { ; CHECK-LABEL: @select_icmp_slt0_xor( ; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[X:%.*]], -2147483648 diff --git a/llvm/test/Transforms/InstCombine/strcpy_chk-1.ll b/llvm/test/Transforms/InstCombine/strcpy_chk-1.ll index 02a4b5cbdeac2..67e393d1525be 100644 --- a/llvm/test/Transforms/InstCombine/strcpy_chk-1.ll +++ b/llvm/test/Transforms/InstCombine/strcpy_chk-1.ll @@ -53,7 +53,7 @@ define i8* @test_simplify3() { define i8* @test_simplify4() { ; CHECK-LABEL: @test_simplify4( ; CHECK-NEXT: [[STRCPY:%.*]] = call i8* @strcpy(i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @b, i32 0, i32 0)) -; CHECK-NEXT: ret i8* [[STRCPY]] +; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0) ; %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 %src = getelementptr inbounds [60 x i8], [60 x i8]* @b, i32 0, i32 0 diff --git a/llvm/test/Transforms/InstCombine/strncpy_chk-1.ll b/llvm/test/Transforms/InstCombine/strncpy_chk-1.ll index ed90303b28080..7601b16693599 100644 --- a/llvm/test/Transforms/InstCombine/strncpy_chk-1.ll +++ b/llvm/test/Transforms/InstCombine/strncpy_chk-1.ll @@ -39,7 +39,7 @@ define i8* @test_simplify2() { define i8* @test_simplify3() { ; CHECK-LABEL: @test_simplify3( ; CHECK-NEXT: [[STRNCPY:%.*]] = call i8* @strncpy(i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i8* nonnull dereferenceable(1) getelementptr inbounds ([60 x i8], [60 x i8]* @b, i32 0, i32 0), i32 12) -; CHECK-NEXT: ret i8* [[STRNCPY]] +; CHECK-NEXT: ret i8* getelementptr inbounds 
([60 x i8], [60 x i8]* @a, i32 0, i32 0) ; %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0 %src = getelementptr inbounds [60 x i8], [60 x i8]* @b, i32 0, i32 0 diff --git a/llvm/test/Transforms/InstCombine/unused-nonnull.ll b/llvm/test/Transforms/InstCombine/unused-nonnull.ll index 0a1520ea73c20..382d2634b86c5 100644 --- a/llvm/test/Transforms/InstCombine/unused-nonnull.ll +++ b/llvm/test/Transforms/InstCombine/unused-nonnull.ll @@ -12,13 +12,8 @@ define i32 @main(i32 %argc, i8** %argv) #0 { ; CHECK-SAME: (i32 [[ARGC:%.*]], i8** nocapture readnone [[ARGV:%.*]]) local_unnamed_addr #0 ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = icmp slt i32 [[ARGC]], 2 -; CHECK-NEXT: br i1 [[TMP0]], label [[DONE:%.*]], label [[DO_WORK:%.*]] -; CHECK: do_work: -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @compute(i8* undef, i32 [[ARGC]]) -; CHECK-NEXT: br label [[DONE]] -; CHECK: done: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP1]], [[DO_WORK]] ] -; CHECK-NEXT: ret i32 [[RETVAL]] +; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[TMP0]], i32 0, i32 [[ARGC]] +; CHECK-NEXT: ret i32 [[SPEC_SELECT]] ; entry: %0 = getelementptr inbounds i8*, i8** %argv, i32 0 diff --git a/llvm/test/Transforms/InstCombine/vscale_alloca.ll b/llvm/test/Transforms/InstCombine/vscale_alloca.ll new file mode 100644 index 0000000000000..8cfc7b74a77fe --- /dev/null +++ b/llvm/test/Transforms/InstCombine/vscale_alloca.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -instcombine -verify < %s | FileCheck %s + +define @alloca( %z) { +; CHECK-LABEL: @alloca( +; CHECK-NEXT: ret [[Z:%.*]] +; + %a = alloca + store %z, * %a + %load = load , * %a + ret %load +} + +define void @alloca_dead_store( %z) { +; CHECK-LABEL: @alloca_dead_store( +; CHECK-NEXT: ret void +; + %a = alloca + store %z, * %a + ret void +} + +declare void @use(...) +define void @alloca_zero_byte_move_first_inst() { +; CHECK-LABEL: @alloca_zero_byte_move_first_inst( +; CHECK-NEXT: [[B:%.*]] = alloca {}, align 8 +; CHECK-NEXT: [[A:%.*]] = alloca , align 16 +; CHECK-NEXT: call void (...) @use(* nonnull [[A]]) +; CHECK-NEXT: call void (...) @use({}* nonnull [[B]]) +; CHECK-NEXT: ret void +; + %a = alloca + call void (...) @use( * %a ) + %b = alloca { } + call void (...) 
@use( { }* %b ) + ret void +} diff --git a/llvm/test/Transforms/InstSimplify/add-mask.ll b/llvm/test/Transforms/InstSimplify/add-mask.ll index e30a35f53127d..cd0c871981205 100644 --- a/llvm/test/Transforms/InstSimplify/add-mask.ll +++ b/llvm/test/Transforms/InstSimplify/add-mask.ll @@ -1,9 +1,9 @@ -; NOTE: Assertions have been autogenerated by update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -instsimplify < %s | FileCheck %s -define i1 @test(i32 %a) { -; CHECK-LABEL: @test( -; CHECK: ret i1 false +define i1 @test1(i32 %a) { +; CHECK-LABEL: @test1( +; CHECK-NEXT: ret i1 false ; %rhs = add i32 %a, -1 %and = and i32 %a, %rhs @@ -11,9 +11,20 @@ define i1 @test(i32 %a) { ret i1 %res } +define i1 @test1v(<2 x i32> %a) { +; CHECK-LABEL: @test1v( +; CHECK-NEXT: ret i1 false +; + %rhs = add <2 x i32> %a, + %and = and <2 x i32> %a, %rhs + %ext = extractelement <2 x i32> %and, i32 0 + %res = icmp eq i32 %ext, 1 + ret i1 %res +} + define i1 @test2(i32 %a) { ; CHECK-LABEL: @test2( -; CHECK: ret i1 false +; CHECK-NEXT: ret i1 false ; %rhs = add i32 %a, 1 %and = and i32 %a, %rhs @@ -21,9 +32,20 @@ define i1 @test2(i32 %a) { ret i1 %res } +define i1 @test2v(<2 x i32> %a) { +; CHECK-LABEL: @test2v( +; CHECK-NEXT: ret i1 false +; + %rhs = add <2 x i32> %a, + %and = and <2 x i32> %a, %rhs + %ext = extractelement <2 x i32> %and, i32 1 + %res = icmp eq i32 %ext, 1 + ret i1 %res +} + define i1 @test3(i32 %a) { ; CHECK-LABEL: @test3( -; CHECK: ret i1 false +; CHECK-NEXT: ret i1 false ; %rhs = add i32 %a, 7 %and = and i32 %a, %rhs @@ -31,13 +53,24 @@ define i1 @test3(i32 %a) { ret i1 %res } +define i1 @test3v(<2 x i32> %a) { +; CHECK-LABEL: @test3v( +; CHECK-NEXT: ret i1 false +; + %rhs = add <2 x i32> %a, + %and = and <2 x i32> %a, %rhs + %ext = extractelement <2 x i32> %and, i32 0 + %res = icmp eq i32 %ext, 1 + ret i1 %res +} + @B = external global i32 declare void @llvm.assume(i1) ; Known bits without a constant define i1 @test4(i32 %a) { ; CHECK-LABEL: @test4( -; CHECK: [[B:%.*]] = load i32, i32* @B +; CHECK-NEXT: [[B:%.*]] = load i32, i32* @B ; CHECK-NEXT: [[B_AND:%.*]] = and i32 [[B]], 1 ; CHECK-NEXT: [[B_CND:%.*]] = icmp eq i32 [[B_AND]], 1 ; CHECK-NEXT: call void @llvm.assume(i1 [[B_CND]]) @@ -57,8 +90,8 @@ define i1 @test4(i32 %a) { ; Negative test - even number define i1 @test5(i32 %a) { ; CHECK-LABEL: @test5( -; CHECK: [[RHS:%.*]] = add i32 %a, 2 -; CHECK-NEXT: [[AND:%.*]] = and i32 %a, [[RHS]] +; CHECK-NEXT: [[RHS:%.*]] = add i32 [[A:%.*]], 2 +; CHECK-NEXT: [[AND:%.*]] = and i32 [[A]], [[RHS]] ; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[AND]], 1 ; CHECK-NEXT: ret i1 [[RES]] ; @@ -68,12 +101,38 @@ define i1 @test5(i32 %a) { ret i1 %res } +define i1 @test5v(<2 x i32> %a) { +; CHECK-LABEL: @test5v( +; CHECK-NEXT: [[RHS:%.*]] = add <2 x i32> [[A:%.*]], +; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[A]], [[RHS]] +; CHECK-NEXT: [[EXT:%.*]] = extractelement <2 x i32> [[AND]], i32 1 +; CHECK-NEXT: [[RES:%.*]] = icmp eq i32 [[EXT]], 1 +; CHECK-NEXT: ret i1 [[RES]] +; + %rhs = add <2 x i32> %a, + %and = and <2 x i32> %a, %rhs + %ext = extractelement <2 x i32> %and, i32 1 + %res = icmp eq i32 %ext, 1 + ret i1 %res +} + define i1 @test6(i32 %a) { ; CHECK-LABEL: @test6( -; CHECK: ret i1 false +; CHECK-NEXT: ret i1 false ; %lhs = add i32 %a, -1 %and = and i32 %lhs, %a %res = icmp eq i32 %and, 1 ret i1 %res } + +define i1 @test6v(<2 x i32> %a) { +; CHECK-LABEL: @test6v( +; CHECK-NEXT: ret i1 false +; + %lhs = add <2 x i32> %a, + %and = and <2 x i32> %lhs, %a 
+ %ext = extractelement <2 x i32> %and, i32 1 + %res = icmp eq i32 %ext, 1 + ret i1 %res +} diff --git a/llvm/test/Transforms/InstSimplify/assume.ll b/llvm/test/Transforms/InstSimplify/assume.ll index 157d8de8fcf8c..a43f90adee37c 100644 --- a/llvm/test/Transforms/InstSimplify/assume.ll +++ b/llvm/test/Transforms/InstSimplify/assume.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -instsimplify -S < %s 2>&1 -pass-remarks-analysis=.* | FileCheck %s ; Verify that warnings are emitted for the 2nd and 3rd tests. @@ -9,7 +9,7 @@ define void @test1() { ; CHECK-LABEL: @test1( -; CHECK: ret void +; CHECK-NEXT: ret void ; call void @llvm.assume(i1 1) ret void @@ -17,7 +17,7 @@ define void @test1() { } ; The alloca guarantees that the low bits of %a are zero because of alignment. -; The assume says the opposite. The assume is processed last, so that's the +; The assume says the opposite. The assume is processed last, so that's the ; return value. There's no way to win (we can't undo transforms that happened ; based on half-truths), so just don't crash. diff --git a/llvm/test/Transforms/InstSimplify/call.ll b/llvm/test/Transforms/InstSimplify/call.ll index 108de9082a700..07fdcdbbd5e69 100644 --- a/llvm/test/Transforms/InstSimplify/call.ll +++ b/llvm/test/Transforms/InstSimplify/call.ll @@ -978,6 +978,10 @@ define <2 x double> @negated_mag_arg_vec(<2 x double> %x) { ret <2 x double> %r } +; We handle the "returned" attribute only in InstCombine, because the fact +; that this simplification may replace one call with another may cause issues +; for call graph passes. + declare i32 @passthru_i32(i32 returned) declare i8* @passthru_p8(i8* returned) diff --git a/llvm/test/Transforms/InstSimplify/compare.ll b/llvm/test/Transforms/InstSimplify/compare.ll index 3dfdaf2ecf2f1..86e1fef22f602 100644 --- a/llvm/test/Transforms/InstSimplify/compare.ll +++ b/llvm/test/Transforms/InstSimplify/compare.ll @@ -377,6 +377,19 @@ define i1 @add(i32 %x, i32 %y) { ret i1 %c } +define i1 @addv(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @addv( +; CHECK-NEXT: ret i1 false +; + %l = lshr <2 x i32> %x, + %q = lshr <2 x i32> %y, + %r = or <2 x i32> %q, + %s = add <2 x i32> %l, %r + %e = extractelement <2 x i32> %s, i32 0 + %c = icmp eq i32 %e, 0 + ret i1 %c +} + define i1 @add2(i8 %x, i8 %y) { ; CHECK-LABEL: @add2( ; CHECK-NEXT: ret i1 false @@ -388,6 +401,18 @@ define i1 @add2(i8 %x, i8 %y) { ret i1 %c } +define i1 @add2v(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: @add2v( +; CHECK-NEXT: ret i1 false +; + %l = or <2 x i8> %x, + %r = or <2 x i8> %y, + %s = add <2 x i8> %l, %r + %e = extractelement <2 x i8> %s, i32 1 + %c = icmp eq i8 %e, 0 + ret i1 %c +} + define i1 @add3(i8 %x, i8 %y) { ; CHECK-LABEL: @add3( ; CHECK-NEXT: [[L:%.*]] = zext i8 [[X:%.*]] to i32 @@ -446,6 +471,23 @@ define i1 @addpowtwo(i32 %x, i32 %y) { ret i1 %c } +define i1 @addpowtwov(<2 x i32> %x, <2 x i32> %y) { +; CHECK-LABEL: @addpowtwov( +; CHECK-NEXT: [[L:%.*]] = lshr <2 x i32> [[X:%.*]], +; CHECK-NEXT: [[R:%.*]] = shl <2 x i32> , [[Y:%.*]] +; CHECK-NEXT: [[S:%.*]] = add <2 x i32> [[L]], [[R]] +; CHECK-NEXT: [[E:%.*]] = extractelement <2 x i32> [[S]], i32 0 +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[E]], 0 +; CHECK-NEXT: ret i1 [[C]] +; + %l = lshr <2 x i32> %x, + %r = shl <2 x i32> , %y + %s = add <2 x i32> %l, %r + %e = extractelement <2 x i32> %s, i32 0 + %c = icmp eq i32 %e, 0 + ret i1 %c +} + define i1 @or(i32 %x) { ; 
CHECK-LABEL: @or( ; CHECK-NEXT: ret i1 false @@ -698,6 +740,18 @@ define i1 @srem2(i16 %X, i32 %Y) { ret i1 %D } +define i1 @srem2v(<2 x i16> %X, <2 x i32> %Y) { +; CHECK-LABEL: @srem2v( +; CHECK-NEXT: ret i1 false +; + %A = zext <2 x i16> %X to <2 x i32> + %B = add nsw <2 x i32> %A, + %C = srem <2 x i32> %B, %Y + %D = extractelement <2 x i32> %C, i32 0 + %E = icmp slt i32 %D, 0 + ret i1 %E +} + define i1 @srem3(i16 %X, i32 %Y) { ; CHECK-LABEL: @srem3( ; CHECK-NEXT: ret i1 false @@ -710,6 +764,19 @@ define i1 @srem3(i16 %X, i32 %Y) { ret i1 %E } +define i1 @srem3v(<2 x i16> %X, <2 x i32> %Y) { +; CHECK-LABEL: @srem3v( +; CHECK-NEXT: ret i1 false +; + %A = zext <2 x i16> %X to <2 x i32> + %B = or <2 x i32> , %A + %C = sub nsw <2 x i32> , %B + %D = srem <2 x i32> %C, %Y + %E = extractelement <2 x i32> %C, i32 1 + %F = icmp slt i32 %E, 0 + ret i1 %F +} + define i1 @udiv2(i32 %Z) { ; CHECK-LABEL: @udiv2( ; CHECK-NEXT: ret i1 true @@ -795,33 +862,55 @@ define i1 @udiv8(i32 %X, i32 %Y) { ret i1 %C } +; Square of a non-zero number is non-zero if there is no overflow. define i1 @mul1(i32 %X) { ; CHECK-LABEL: @mul1( ; CHECK-NEXT: ret i1 false ; -; Square of a non-zero number is non-zero if there is no overflow. %Y = or i32 %X, 1 %M = mul nuw i32 %Y, %Y %C = icmp eq i32 %M, 0 ret i1 %C } +define i1 @mul1v(<2 x i32> %X) { +; CHECK-LABEL: @mul1v( +; CHECK-NEXT: ret i1 false +; + %Y = or <2 x i32> %X, + %M = mul nuw <2 x i32> %Y, %Y + %E = extractelement <2 x i32> %M, i32 0 + %C = icmp eq i32 %E, 0 + ret i1 %C +} + +; Square of a non-zero number is positive if there is no signed overflow. define i1 @mul2(i32 %X) { ; CHECK-LABEL: @mul2( ; CHECK-NEXT: ret i1 true ; -; Square of a non-zero number is positive if there is no signed overflow. %Y = or i32 %X, 1 %M = mul nsw i32 %Y, %Y %C = icmp sgt i32 %M, 0 ret i1 %C } +define i1 @mul2v(<2 x i32> %X) { +; CHECK-LABEL: @mul2v( +; CHECK-NEXT: ret i1 true +; + %Y = or <2 x i32> %X, + %M = mul nsw <2 x i32> %Y, %Y + %E = extractelement <2 x i32> %M, i32 1 + %C = icmp sgt i32 %E, 0 + ret i1 %C +} + +; Product of non-negative numbers is non-negative if there is no signed overflow. define i1 @mul3(i32 %X, i32 %Y) { ; CHECK-LABEL: @mul3( ; CHECK-NEXT: ret i1 true ; -; Product of non-negative numbers is non-negative if there is no signed overflow. 
%XX = mul nsw i32 %X, %X %YY = mul nsw i32 %Y, %Y %M = mul nsw i32 %XX, %YY @@ -829,6 +918,17 @@ define i1 @mul3(i32 %X, i32 %Y) { ret i1 %C } +define <2 x i1> @mul3v(<2 x i32> %X, <2 x i32> %Y) { +; CHECK-LABEL: @mul3v( +; CHECK-NEXT: ret <2 x i1> +; + %XX = mul nsw <2 x i32> %X, %X + %YY = mul nsw <2 x i32> %Y, %Y + %M = mul nsw <2 x i32> %XX, %YY + %C = icmp sge <2 x i32> %M, zeroinitializer + ret <2 x i1> %C +} + define <2 x i1> @vectorselect1(<2 x i1> %cond) { ; CHECK-LABEL: @vectorselect1( ; CHECK-NEXT: ret <2 x i1> [[COND:%.*]] @@ -1258,7 +1358,20 @@ define i1 @icmp_known_bits(i4 %x, i4 %y) { %add = add i4 %or1, %or2 %cmp = icmp eq i4 %add, 0 ret i1 %cmp +} +define i1 @icmp_known_bits_vec(<2 x i4> %x, <2 x i4> %y) { +; CHECK-LABEL: @icmp_known_bits_vec( +; CHECK-NEXT: ret i1 false +; + %and1 = and <2 x i4> %y, + %and2 = and <2 x i4> %x, + %or1 = or <2 x i4> %and1, + %or2 = or <2 x i4> %and2, + %add = add <2 x i4> %or1, %or2 + %ext = extractelement <2 x i4> %add,i32 0 + %cmp = icmp eq i4 %ext, 0 + ret i1 %cmp } define i1 @icmp_shl_nuw_1(i64 %a) { diff --git a/llvm/test/Transforms/InstSimplify/freeze.ll b/llvm/test/Transforms/InstSimplify/freeze.ll index e6085bf392bfc..e8950a8ad527b 100644 --- a/llvm/test/Transforms/InstSimplify/freeze.ll +++ b/llvm/test/Transforms/InstSimplify/freeze.ll @@ -19,6 +19,268 @@ define i32 @make_const() { ret i32 %x } +; TODO: This is not poison. + +define float @make_const2() { +; CHECK-LABEL: @make_const2( +; CHECK-NEXT: [[X:%.*]] = freeze float 1.000000e+01 +; CHECK-NEXT: ret float [[X]] +; + %x = freeze float 10.0 + ret float %x +} + +@glb = constant i32 0 + +define i32* @make_const_glb() { +; CHECK-LABEL: @make_const_glb( +; CHECK-NEXT: ret i32* @glb +; + %k = freeze i32* @glb + ret i32* %k +} + +; TODO: This is not poison. + +define i32()* @make_const_fn() { +; CHECK-LABEL: @make_const_fn( +; CHECK-NEXT: [[K:%.*]] = freeze i32 ()* @make_const +; CHECK-NEXT: ret i32 ()* [[K]] +; + %k = freeze i32()* @make_const + ret i32()* %k +} + +; TODO: This is not poison. 
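The TODOs in this file all rest on the same rule: freeze only has an observable effect when its operand is undef or poison; for any fully defined value it must return that value unchanged, which is why the plain integer constant and the address of @glb already fold. A floating-point literal, the address of a function, and the null pointer are equally well defined, so the anticipated fold for the test that follows would simply drop the freeze, along the lines of this illustrative sketch (the function name is not part of the patch):

define i32* @make_const_null_folded_sketch() {
  ; null is a defined pointer value, not undef or poison, so `freeze i32* null`
  ; can be replaced by null itself and the freeze instruction disappears.
  ret i32* null
}
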
+ +define i32* @make_const_null() { +; CHECK-LABEL: @make_const_null( +; CHECK-NEXT: [[K:%.*]] = freeze i32* null +; CHECK-NEXT: ret i32* [[K]] +; + %k = freeze i32* null + ret i32* %k +} + +define <2 x i32> @constvector() { +; CHECK-LABEL: @constvector( +; CHECK-NEXT: ret <2 x i32> +; + %x = freeze <2 x i32> + ret <2 x i32> %x +} + +define <3 x i5> @constvector_weird() { +; CHECK-LABEL: @constvector_weird( +; CHECK-NEXT: ret <3 x i5> +; + %x = freeze <3 x i5> + ret <3 x i5> %x +} + +define <2 x float> @constvector_FP() { +; CHECK-LABEL: @constvector_FP( +; CHECK-NEXT: ret <2 x float> +; + %x = freeze <2 x float> + ret <2 x float> %x +} + +; Negative test + +define <2 x i32> @constvector_noopt() { +; CHECK-LABEL: @constvector_noopt( +; CHECK-NEXT: [[X:%.*]] = freeze <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[X]] +; + %x = freeze <2 x i32> + ret <2 x i32> %x +} + +; Negative test + +define <3 x i5> @constvector_weird_noopt() { +; CHECK-LABEL: @constvector_weird_noopt( +; CHECK-NEXT: [[X:%.*]] = freeze <3 x i5> +; CHECK-NEXT: ret <3 x i5> [[X]] +; + %x = freeze <3 x i5> + ret <3 x i5> %x +} + +; Negative test + +define <2 x float> @constvector_FP_noopt() { +; CHECK-LABEL: @constvector_FP_noopt( +; CHECK-NEXT: [[X:%.*]] = freeze <2 x float> +; CHECK-NEXT: ret <2 x float> [[X]] +; + %x = freeze <2 x float> + ret <2 x float> %x +} + +@g = external global i16, align 1 + +; Negative test + +define float @constant_expr() { +; CHECK-LABEL: @constant_expr( +; CHECK-NEXT: [[R:%.*]] = freeze float bitcast (i32 ptrtoint (i16* @g to i32) to float) +; CHECK-NEXT: ret float [[R]] +; + %r = freeze float bitcast (i32 ptrtoint (i16* @g to i32) to float) + ret float %r +} + +; Negative test + +define <2 x i31> @vector_element_constant_expr() { +; CHECK-LABEL: @vector_element_constant_expr( +; CHECK-NEXT: [[R:%.*]] = freeze <2 x i31> +; CHECK-NEXT: ret <2 x i31> [[R]] +; + %r = freeze <2 x i31> + ret <2 x i31> %r +} + +define void @alloca() { +; CHECK-LABEL: @alloca( +; CHECK-NEXT: [[P:%.*]] = alloca i8 +; CHECK-NEXT: [[Y:%.*]] = freeze i8* [[P]] +; CHECK-NEXT: call void @f3(i8* [[Y]]) +; CHECK-NEXT: ret void +; + %p = alloca i8 + %y = freeze i8* %p + call void @f3(i8* %y) + ret void +} + +define i8* @gep() { +; CHECK-LABEL: @gep( +; CHECK-NEXT: [[P:%.*]] = alloca [4 x i8] +; CHECK-NEXT: [[Q:%.*]] = getelementptr [4 x i8], [4 x i8]* [[P]], i32 0, i32 6 +; CHECK-NEXT: [[Q2:%.*]] = freeze i8* [[Q]] +; CHECK-NEXT: ret i8* [[Q2]] +; + %p = alloca [4 x i8] + %q = getelementptr [4 x i8], [4 x i8]* %p, i32 0, i32 6 + %q2 = freeze i8* %q + ret i8* %q2 +} + +define i8* @gep_noopt(i32 %arg) { +; CHECK-LABEL: @gep_noopt( +; CHECK-NEXT: [[Q:%.*]] = getelementptr [4 x i8], [4 x i8]* null, i32 0, i32 [[ARG:%.*]] +; CHECK-NEXT: [[Q2:%.*]] = freeze i8* [[Q]] +; CHECK-NEXT: ret i8* [[Q2]] +; + %q = getelementptr [4 x i8], [4 x i8]* null, i32 0, i32 %arg + %q2 = freeze i8* %q + ret i8* %q2 +} + +define i8* @gep_inbounds() { +; CHECK-LABEL: @gep_inbounds( +; CHECK-NEXT: [[P:%.*]] = alloca [4 x i8] +; CHECK-NEXT: [[Q:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* [[P]], i32 0, i32 0 +; CHECK-NEXT: [[Q2:%.*]] = freeze i8* [[Q]] +; CHECK-NEXT: ret i8* [[Q2]] +; + %p = alloca [4 x i8] + %q = getelementptr inbounds [4 x i8], [4 x i8]* %p, i32 0, i32 0 + %q2 = freeze i8* %q + ret i8* %q2 +} + +define i8* @gep_inbounds_noopt(i32 %arg) { +; CHECK-LABEL: @gep_inbounds_noopt( +; CHECK-NEXT: [[P:%.*]] = alloca [4 x i8] +; CHECK-NEXT: [[Q:%.*]] = getelementptr inbounds [4 x i8], [4 x i8]* [[P]], i32 0, i32 [[ARG:%.*]] +; CHECK-NEXT: 
[[Q2:%.*]] = freeze i8* [[Q]] +; CHECK-NEXT: ret i8* [[Q2]] +; + %p = alloca [4 x i8] + %q = getelementptr inbounds [4 x i8], [4 x i8]* %p, i32 0, i32 %arg + %q2 = freeze i8* %q + ret i8* %q2 +} + +define i32* @gep_inbounds_null() { +; CHECK-LABEL: @gep_inbounds_null( +; CHECK-NEXT: [[K:%.*]] = freeze i32* null +; CHECK-NEXT: ret i32* [[K]] +; + %p = getelementptr inbounds i32, i32* null, i32 0 + %k = freeze i32* %p + ret i32* %k +} + +define i32* @gep_inbounds_null_noopt(i32* %p) { +; CHECK-LABEL: @gep_inbounds_null_noopt( +; CHECK-NEXT: [[K:%.*]] = freeze i32* [[P:%.*]] +; CHECK-NEXT: ret i32* [[K]] +; + %q = getelementptr inbounds i32, i32* %p, i32 0 + %k = freeze i32* %q + ret i32* %k +} + +define i1 @icmp(i32 %a, i32 %b) { +; CHECK-LABEL: @icmp( +; CHECK-NEXT: [[A_FR:%.*]] = freeze i32 [[A:%.*]] +; CHECK-NEXT: [[B_FR:%.*]] = freeze i32 [[B:%.*]] +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[A_FR]], [[B_FR]] +; CHECK-NEXT: ret i1 [[C]] +; + %a.fr = freeze i32 %a + %b.fr = freeze i32 %b + %c = icmp eq i32 %a.fr, %b.fr + %c.fr = freeze i1 %c + ret i1 %c.fr +} + +define i1 @icmp_noopt(i32 %a, i32 %b) { +; CHECK-LABEL: @icmp_noopt( +; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[C_FR:%.*]] = freeze i1 [[C]] +; CHECK-NEXT: ret i1 [[C_FR]] +; + %c = icmp eq i32 %a, %b + %c.fr = freeze i1 %c + ret i1 %c.fr +} + +define i1 @fcmp(float %x, float %y) { +; CHECK-LABEL: @fcmp( +; CHECK-NEXT: [[FX:%.*]] = freeze float [[X:%.*]] +; CHECK-NEXT: [[FY:%.*]] = freeze float [[Y:%.*]] +; CHECK-NEXT: [[C:%.*]] = fcmp oeq float [[FX]], [[FY]] +; CHECK-NEXT: [[FC:%.*]] = freeze i1 [[C]] +; CHECK-NEXT: ret i1 [[FC]] +; + %fx = freeze float %x + %fy = freeze float %y + %c = fcmp oeq float %fx, %fy + %fc = freeze i1 %c + ret i1 %fc +} + +define i1 @fcmp_noopt(float %x, float %y) { +; CHECK-LABEL: @fcmp_noopt( +; CHECK-NEXT: [[FX:%.*]] = freeze float [[X:%.*]] +; CHECK-NEXT: [[FY:%.*]] = freeze float [[Y:%.*]] +; CHECK-NEXT: [[C:%.*]] = fcmp nnan oeq float [[FX]], [[FY]] +; CHECK-NEXT: [[FC:%.*]] = freeze i1 [[C]] +; CHECK-NEXT: ret i1 [[FC]] +; + %fx = freeze float %x + %fy = freeze float %y + %c = fcmp nnan oeq float %fx, %fy + %fc = freeze i1 %c + ret i1 %fc +} + define i1 @brcond(i1 %c, i1 %c2) { ; CHECK-LABEL: @brcond( ; CHECK-NEXT: br i1 [[C:%.*]], label [[A:%.*]], label [[B:%.*]] @@ -40,6 +302,66 @@ B: ret i1 %f2 } +define i32 @phi(i1 %cond, i1 %cond2, i32 %a0, i32 %a1) { +; CHECK-LABEL: @phi( +; CHECK-NEXT: ENTRY: +; CHECK-NEXT: [[A0_FR:%.*]] = freeze i32 [[A0:%.*]] +; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: BB1: +; CHECK-NEXT: [[A1_FR:%.*]] = freeze i32 [[A1:%.*]] +; CHECK-NEXT: br i1 [[COND2:%.*]], label [[BB2]], label [[EXIT:%.*]] +; CHECK: BB2: +; CHECK-NEXT: [[PHI1:%.*]] = phi i32 [ [[A0_FR]], [[ENTRY:%.*]] ], [ [[A1_FR]], [[BB1]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: EXIT: +; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ [[A0_FR]], [[BB1]] ], [ [[PHI1]], [[BB2]] ] +; CHECK-NEXT: [[PHI2_FR:%.*]] = freeze i32 [[PHI2]] +; CHECK-NEXT: ret i32 [[PHI2_FR]] +; +ENTRY: + %a0.fr = freeze i32 %a0 + br i1 %cond, label %BB1, label %BB2 +BB1: + %a1.fr = freeze i32 %a1 + br i1 %cond2, label %BB2, label %EXIT +BB2: + %phi1 = phi i32 [%a0.fr, %ENTRY], [%a1.fr, %BB1] + br label %EXIT +EXIT: + %phi2 = phi i32 [%a0.fr, %BB1], [%phi1, %BB2] + %phi2.fr = freeze i32 %phi2 + ret i32 %phi2.fr +} + +define i32 @phi_noopt(i1 %cond, i1 %cond2, i32 %a0, i32 %a1) { +; CHECK-LABEL: @phi_noopt( +; CHECK-NEXT: ENTRY: +; CHECK-NEXT: [[A0_FR:%.*]] = freeze i32 [[A0:%.*]] 
+; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] +; CHECK: BB1: +; CHECK-NEXT: br i1 [[COND2:%.*]], label [[BB2]], label [[EXIT:%.*]] +; CHECK: BB2: +; CHECK-NEXT: [[PHI1:%.*]] = phi i32 [ [[A0_FR]], [[ENTRY:%.*]] ], [ [[A1:%.*]], [[BB1]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: EXIT: +; CHECK-NEXT: [[PHI2:%.*]] = phi i32 [ [[A0_FR]], [[BB1]] ], [ [[PHI1]], [[BB2]] ] +; CHECK-NEXT: [[PHI2_FR:%.*]] = freeze i32 [[PHI2]] +; CHECK-NEXT: ret i32 [[PHI2_FR]] +; +ENTRY: + %a0.fr = freeze i32 %a0 + br i1 %cond, label %BB1, label %BB2 +BB1: + br i1 %cond2, label %BB2, label %EXIT +BB2: + %phi1 = phi i32 [%a0.fr, %ENTRY], [%a1, %BB1] + br label %EXIT +EXIT: + %phi2 = phi i32 [%a0.fr, %BB1], [%phi1, %BB2] + %phi2.fr = freeze i32 %phi2 + ret i32 %phi2.fr +} + define i32 @brcond_switch(i32 %x) { ; CHECK-LABEL: @brcond_switch( ; CHECK-NEXT: switch i32 [[X:%.*]], label [[EXIT:%.*]] [ @@ -81,3 +403,4 @@ B: } declare void @f1(i1) declare void @f2() +declare void @f3(i8*) diff --git a/llvm/test/Transforms/InstSimplify/shift-knownbits.ll b/llvm/test/Transforms/InstSimplify/shift-knownbits.ll index 63b9b76fd22f7..66e987182190b 100644 --- a/llvm/test/Transforms/InstSimplify/shift-knownbits.ll +++ b/llvm/test/Transforms/InstSimplify/shift-knownbits.ll @@ -40,7 +40,7 @@ define i33 @ashr_amount_is_known_bogus(i33 %a, i33 %b) { define i16 @ashr_amount_is_zero(i16 %a, i16 %b) { ; CHECK-LABEL: @ashr_amount_is_zero( -; CHECK-NEXT: ret i16 %a +; CHECK-NEXT: ret i16 [[A:%.*]] ; %and = and i16 %b, 65520 ; 0xfff0 %shr = ashr i16 %a, %and @@ -49,7 +49,7 @@ define i16 @ashr_amount_is_zero(i16 %a, i16 %b) { define i300 @lshr_amount_is_zero(i300 %a, i300 %b) { ; CHECK-LABEL: @lshr_amount_is_zero( -; CHECK-NEXT: ret i300 %a +; CHECK-NEXT: ret i300 [[A:%.*]] ; %and = and i300 %b, 2048 %shr = lshr i300 %a, %and @@ -58,7 +58,7 @@ define i300 @lshr_amount_is_zero(i300 %a, i300 %b) { define i9 @shl_amount_is_zero(i9 %a, i9 %b) { ; CHECK-LABEL: @shl_amount_is_zero( -; CHECK-NEXT: ret i9 %a +; CHECK-NEXT: ret i9 [[A:%.*]] ; %and = and i9 %b, 496 ; 0x1f0 %shl = shl i9 %a, %and @@ -70,8 +70,8 @@ define i9 @shl_amount_is_zero(i9 %a, i9 %b) { define i9 @shl_amount_is_not_known_zero(i9 %a, i9 %b) { ; CHECK-LABEL: @shl_amount_is_not_known_zero( -; CHECK-NEXT: [[AND:%.*]] = and i9 %b, -8 -; CHECK-NEXT: [[SHL:%.*]] = shl i9 %a, [[AND]] +; CHECK-NEXT: [[AND:%.*]] = and i9 [[B:%.*]], -8 +; CHECK-NEXT: [[SHL:%.*]] = shl i9 [[A:%.*]], [[AND]] ; CHECK-NEXT: ret i9 [[SHL]] ; %and = and i9 %b, 504 ; 0x1f8 @@ -94,8 +94,8 @@ define <2 x i32> @ashr_vector_bogus(<2 x i32> %a, <2 x i32> %b) { ; FIXME: This is undef, but computeKnownBits doesn't handle the union. define <2 x i32> @shl_vector_bogus(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: @shl_vector_bogus( -; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> %b, -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i32> %a, [[OR]] +; CHECK-NEXT: [[OR:%.*]] = or <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i32> [[A:%.*]], [[OR]] ; CHECK-NEXT: ret <2 x i32> [[SHL]] ; %or = or <2 x i32> %b, @@ -105,7 +105,7 @@ define <2 x i32> @shl_vector_bogus(<2 x i32> %a, <2 x i32> %b) { define <2 x i32> @lshr_vector_zero(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: @lshr_vector_zero( -; CHECK-NEXT: ret <2 x i32> %a +; CHECK-NEXT: ret <2 x i32> [[A:%.*]] ; %and = and <2 x i32> %b, %shr = lshr <2 x i32> %a, %and @@ -115,7 +115,7 @@ define <2 x i32> @lshr_vector_zero(<2 x i32> %a, <2 x i32> %b) { ; Make sure that weird vector types work too. 
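The check-line updates above only modernize how the matched values are named; the fold they verify is worth spelling out. The mask constrains the shift amount, via known bits, to be either zero or at least the bit width, and an over-wide shift yields poison, so the only amount that matters is zero and the shift returns its first operand untouched. Below is a short annotated sketch using the numbers from @shl_amount_is_zero (the function name is illustrative, not part of the patch); the same argument applies lane-by-lane to the vector tests that follow.

define i9 @shift_amount_known_zero_sketch(i9 %a, i9 %b) {
  ; 496 = 0b111110000: the low four bits are clear, so the masked amount is
  ; either 0 or at least 16, and 16 already exceeds the i9 bit width of 9.
  %amt = and i9 %b, 496
  ; Shifting an i9 by 9 or more bits yields poison, so the only meaningful
  ; amount is 0 and the expression simplifies to %a.  With the mask 504 from
  ; @shl_amount_is_not_known_zero only the low three bits are clear, 8 is a
  ; legal in-range amount, and no fold is possible there.
  %shl = shl i9 %a, %amt
  ret i9 %shl
}
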
define <2 x i15> @shl_vector_zero(<2 x i15> %a, <2 x i15> %b) { ; CHECK-LABEL: @shl_vector_zero( -; CHECK-NEXT: ret <2 x i15> %a +; CHECK-NEXT: ret <2 x i15> [[A:%.*]] ; %and = and <2 x i15> %b, %shl = shl <2 x i15> %a, %and @@ -124,8 +124,8 @@ define <2 x i15> @shl_vector_zero(<2 x i15> %a, <2 x i15> %b) { define <2 x i32> @shl_vector_for_real(<2 x i32> %a, <2 x i32> %b) { ; CHECK-LABEL: @shl_vector_for_real( -; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> %b, -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i32> %a, [[AND]] +; CHECK-NEXT: [[AND:%.*]] = and <2 x i32> [[B:%.*]], +; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i32> [[A:%.*]], [[AND]] ; CHECK-NEXT: ret <2 x i32> [[SHL]] ; %and = and <2 x i32> %b, ; a necessary mask op @@ -139,7 +139,7 @@ define <2 x i32> @shl_vector_for_real(<2 x i32> %a, <2 x i32> %b) { define i1 @shl_i1(i1 %a, i1 %b) { ; CHECK-LABEL: @shl_i1( -; CHECK-NEXT: ret i1 %a +; CHECK-NEXT: ret i1 [[A:%.*]] ; %shl = shl i1 %a, %b ret i1 %shl @@ -179,6 +179,16 @@ define <2 x i8> @lshr_ctlz_zero_is_undef_splat_vec(<2 x i8> %x) { ret <2 x i8> %sh } +define i8 @lshr_ctlz_zero_is_undef_vec(<2 x i8> %x) { +; CHECK-LABEL: @lshr_ctlz_zero_is_undef_vec( +; CHECK-NEXT: ret i8 0 +; + %ct = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %x, i1 true) + %sh = lshr <2 x i8> %ct, + %ex = extractelement <2 x i8> %sh, i32 0 + ret i8 %ex +} + define <2 x i8> @lshr_cttz_zero_is_undef_splat_vec(<2 x i8> %x) { ; CHECK-LABEL: @lshr_cttz_zero_is_undef_splat_vec( ; CHECK-NEXT: ret <2 x i8> zeroinitializer @@ -188,3 +198,13 @@ define <2 x i8> @lshr_cttz_zero_is_undef_splat_vec(<2 x i8> %x) { ret <2 x i8> %sh } +define i8 @lshr_cttz_zero_is_undef_vec(<2 x i8> %x) { +; CHECK-LABEL: @lshr_cttz_zero_is_undef_vec( +; CHECK-NEXT: ret i8 0 +; + %ct = call <2 x i8> @llvm.cttz.v2i8(<2 x i8> %x, i1 true) + %sh = lshr <2 x i8> %ct, + %ex = extractelement <2 x i8> %sh, i32 0 + ret i8 %ex +} + diff --git a/llvm/test/Transforms/JumpThreading/PR44611-across-header-hang.ll b/llvm/test/Transforms/JumpThreading/PR44611-across-header-hang.ll new file mode 100644 index 0000000000000..d36a3a19f2fef --- /dev/null +++ b/llvm/test/Transforms/JumpThreading/PR44611-across-header-hang.ll @@ -0,0 +1,22 @@ +; RUN: opt -S < %s -jump-threading -jump-threading-across-loop-headers | FileCheck %s + +; CHECK-LABEL: @foo +; Just check that we don't hang on this test. 
+ +define void @foo(i32 %a) { +bb_entry: + br label %bb_header + +bb_header: + %b = phi i32 [ %c, %bb_header ], [ 0, %bb_body1 ], [ 2, %bb_body2 ], [ 0, %bb_entry ] + %c = add nuw nsw i32 %b, 1 + %d = icmp ult i32 %c, 6 + br i1 %d, label %bb_header, label %bb_body1 + +bb_body1: + %e = icmp eq i32 %a, 0 + br i1 %e, label %bb_body2, label %bb_header + +bb_body2: + br label %bb_header +} diff --git a/llvm/test/Transforms/MemCpyOpt/store-to-memset-is-nonzero-type.ll b/llvm/test/Transforms/MemCpyOpt/store-to-memset-is-nonzero-type.ll new file mode 100644 index 0000000000000..f75b63edef359 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/store-to-memset-is-nonzero-type.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S < %s -memcpyopt | FileCheck %s + +; Array + +define void @array_zero([0 x i8]* %p) { +; CHECK-LABEL: @array_zero( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8]* [[P:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP1]], i8 undef, i64 0, i1 false) +; CHECK-NEXT: ret void +; + store [0 x i8] zeroinitializer, [0 x i8]* %p + ret void +} + +define void @array_nonzero([1 x i8]* %p) { +; CHECK-LABEL: @array_nonzero( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [1 x i8]* [[P:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP1]], i8 0, i64 1, i1 false) +; CHECK-NEXT: ret void +; + store [1 x i8] zeroinitializer, [1 x i8]* %p + ret void +} + +; Structure + +define void @struct_zero({ }* %p) { +; CHECK-LABEL: @struct_zero( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast {}* [[P:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP1]], i8 undef, i64 0, i1 false) +; CHECK-NEXT: ret void +; + store { } zeroinitializer, { }* %p + ret void +} +define void @struct_nonzero({ i8 }* %p) { +; CHECK-LABEL: @struct_nonzero( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast { i8 }* [[P:%.*]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP1]], i8 0, i64 1, i1 false) +; CHECK-NEXT: ret void +; + store { i8 } zeroinitializer, { i8 }* %p + ret void +} + +; Vector + +; Test only non-zero vector. 
Zero element vector is illegal + +define void @vector_fixed_length_nonzero(<16 x i8>* %p) { +; CHECK-LABEL: @vector_fixed_length_nonzero( +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[P:%.*]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[P]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i8>* [[TMP0]] to i8* +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 16 [[TMP1]], i8 0, i64 32, i1 false) +; CHECK-NEXT: ret void +; + %tmp0 = getelementptr <16 x i8>, <16 x i8>* %p, i64 0 + store <16 x i8> zeroinitializer, <16 x i8>* %tmp0 + %tmp1 = getelementptr <16 x i8>, <16 x i8>* %p, i64 1 + store <16 x i8> zeroinitializer, <16 x i8>* %tmp1 + ret void +} + +define void @vector_scalable_nonzero(* %p) { +; CHECK-LABEL: @vector_scalable_nonzero( +; CHECK-NEXT: store zeroinitializer, * [[P:%.*]] +; CHECK-NEXT: ret void +; + store zeroinitializer, * %p + ret void +} diff --git a/llvm/test/Transforms/MemCpyOpt/vscale-memset.ll b/llvm/test/Transforms/MemCpyOpt/vscale-memset.ll new file mode 100644 index 0000000000000..256bd8518dc19 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/vscale-memset.ll @@ -0,0 +1,115 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -memcpyopt -dce -S | FileCheck %s + +; Negative test +; Check this test is not transformed into memset, or cause a compiler warning +; warning: Compiler has made implicit assumption that TypeSize is not scalable. This may or may not lead to broken code. + +define void @foo(i8* %p) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: [[A:%.*]] = bitcast i8* [[P:%.*]] to * +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr , * [[A]], i64 0 +; CHECK-NEXT: store zeroinitializer, * [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr , * [[A]], i64 1 +; CHECK-NEXT: store zeroinitializer, * [[TMP1]] +; CHECK-NEXT: ret void +; + %a = bitcast i8* %p to * + %tmp0 = getelementptr , * %a, i64 0 + store zeroinitializer, * %tmp0 + %tmp1 = getelementptr , * %a, i64 1 + store zeroinitializer, * %tmp1 + ret void +} + +; Positive test + +define void @memset_vscale_index_zero(i8* %p, i8 %z) { +; CHECK-LABEL: @memset_vscale_index_zero( +; CHECK-NEXT: [[A:%.*]] = bitcast i8* [[P:%.*]] to * +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr , * [[A]], i32 0, i32 0 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP0]], i8 [[Z:%.*]], i64 17, i1 false) +; CHECK-NEXT: ret void +; + %a = bitcast i8* %p to * + %tmp0 = getelementptr , * %a, i32 0, i32 0 + store i8 %z, i8* %tmp0 + %tmp1 = getelementptr , * %a, i32 0, i32 1 + store i8 %z, i8* %tmp1 + %tmp2 = getelementptr , * %a, i32 0, i32 2 + store i8 %z, i8* %tmp2 + %tmp3 = getelementptr , * %a, i32 0, i32 3 + store i8 %z, i8* %tmp3 + %tmp4 = getelementptr , * %a, i32 0, i32 4 + store i8 %z, i8* %tmp4 + %tmp5 = getelementptr , * %a, i32 0, i32 5 + store i8 %z, i8* %tmp5 + %tmp6 = getelementptr , * %a, i32 0, i32 6 + store i8 %z, i8* %tmp6 + %tmp7 = getelementptr , * %a, i32 0, i32 7 + store i8 %z, i8* %tmp7 + %tmp8 = getelementptr , * %a, i32 0, i32 8 + store i8 %z, i8* %tmp8 + %tmp9 = getelementptr , * %a, i32 0, i32 9 + store i8 %z, i8* %tmp9 + %tmp10 = getelementptr , * %a, i32 0, i32 10 + store i8 %z, i8* %tmp10 + %tmp11 = getelementptr , * %a, i32 0, i32 11 + store i8 %z, i8* %tmp11 + %tmp12 = getelementptr , * %a, i32 0, i32 12 + store i8 %z, i8* %tmp12 + %tmp13 = getelementptr , * %a, i32 0, i32 13 + store i8 %z, i8* %tmp13 + %tmp14 = getelementptr , * %a, i32 0, i32 14 + store i8 %z, i8* %tmp14 + %tmp15 = getelementptr , * 
%a, i32 0, i32 15 + store i8 %z, i8* %tmp15 + %tmp16 = getelementptr , * %a, i32 0, i32 16 + store i8 %z, i8* %tmp16 + ret void +} + +define void @memset_vscale_index_nonzero(i8* %p, i8 %z) { +; CHECK-LABEL: @memset_vscale_index_nonzero( +; CHECK-NEXT: [[A:%.*]] = bitcast i8* [[P:%.*]] to * +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr , * [[A]], i32 1, i32 0 +; CHECK-NEXT: call void @llvm.memset.p0i8.i64(i8* align 1 [[TMP0]], i8 [[Z:%.*]], i64 17, i1 false) +; CHECK-NEXT: ret void +; + %a = bitcast i8* %p to * + %tmp0 = getelementptr , * %a, i32 1, i32 0 + store i8 %z, i8* %tmp0 + %tmp1 = getelementptr , * %a, i32 1, i32 1 + store i8 %z, i8* %tmp1 + %tmp2 = getelementptr , * %a, i32 1, i32 2 + store i8 %z, i8* %tmp2 + %tmp3 = getelementptr , * %a, i32 1, i32 3 + store i8 %z, i8* %tmp3 + %tmp4 = getelementptr , * %a, i32 1, i32 4 + store i8 %z, i8* %tmp4 + %tmp5 = getelementptr , * %a, i32 1, i32 5 + store i8 %z, i8* %tmp5 + %tmp6 = getelementptr , * %a, i32 1, i32 6 + store i8 %z, i8* %tmp6 + %tmp7 = getelementptr , * %a, i32 1, i32 7 + store i8 %z, i8* %tmp7 + %tmp8 = getelementptr , * %a, i32 1, i32 8 + store i8 %z, i8* %tmp8 + %tmp9 = getelementptr , * %a, i32 1, i32 9 + store i8 %z, i8* %tmp9 + %tmp10 = getelementptr , * %a, i32 1, i32 10 + store i8 %z, i8* %tmp10 + %tmp11 = getelementptr , * %a, i32 1, i32 11 + store i8 %z, i8* %tmp11 + %tmp12 = getelementptr , * %a, i32 1, i32 12 + store i8 %z, i8* %tmp12 + %tmp13 = getelementptr , * %a, i32 1, i32 13 + store i8 %z, i8* %tmp13 + %tmp14 = getelementptr , * %a, i32 1, i32 14 + store i8 %z, i8* %tmp14 + %tmp15 = getelementptr , * %a, i32 1, i32 15 + store i8 %z, i8* %tmp15 + %tmp16 = getelementptr , * %a, i32 1, i32 16 + store i8 %z, i8* %tmp16 + ret void +} diff --git a/llvm/test/Transforms/OpenMP/rtf_type_checking.ll b/llvm/test/Transforms/OpenMP/rtf_type_checking.ll new file mode 100644 index 0000000000000..57c09bcc7e060 --- /dev/null +++ b/llvm/test/Transforms/OpenMP/rtf_type_checking.ll @@ -0,0 +1,63 @@ +; RUN: opt -S -openmpopt -stats < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +%struct.ident_t = type { i32, i32, i32, i32, i8* } + +@.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 +@0 = private unnamed_addr global %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 +@1 = private unnamed_addr global %struct.ident_t { i32 0, i32 322, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0) }, align 8 + +define i32 @main() { +entry: + + call void (%struct.ident_t*, void (i32*, i32*, ...)*, ...) @__kmpc_fork_call(%struct.ident_t* nonnull @0, void (i32*, i32*, ...)* bitcast (void (i32*, i32*)* @.omp_outlined. to void (i32*, i32*, ...)*)) + ret i32 0 +} + +; Only the last runtime call will be matched due that the rest of the "runtime function" calls +; have some type mismatch compared to the real runtime function. See the check at bottom. +define internal void @.omp_outlined.(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { +entry: + + call void @__kmpc_master(%struct.ident_t* nonnull @0) + call void @__kmpc_end_master(%struct.ident_t* nonnull @0, i32 0, i32 0) + call void @__kmpc_barrier(%struct.ident_t* nonnull @1, float 0.0) + call void @omp_get_thread_num() + call void @__kmpc_flush(%struct.ident_t* nonnull @0) + ret void +} +; Fewer arguments than expected in variadic function. 
+declare !callback !2 void @__kmpc_fork_call(%struct.ident_t*, void (i32*, i32*, ...)*, ...) + +; Fewer number of arguments in non variadic function. +declare void @__kmpc_master(%struct.ident_t*) + +; Bigger number of arguments in non variadic function. +declare void @__kmpc_end_master(%struct.ident_t*, i32, i32) + +; Different argument type than the expected. +declare void @__kmpc_barrier(%struct.ident_t*, float) + +; Proper use of runtime function. +declare void @__kmpc_flush(%struct.ident_t*) + +; Different return type. +declare void @omp_get_thread_num() + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang"} +!2 = !{!3} +!3 = !{i64 2, i64 -1, i64 -1, i1 true} +; ===-------------------------------------------------------------------------=== +; ... Statistics Collected ... +; ===-------------------------------------------------------------------------=== +; +; CHECK: 1 cgscc-passmgr - Maximum CGSCCPassMgr iterations on one SCC +; CHECK: 2 openmp-opt{{.*}}Number of OpenMP runtime functions identified +; +; There are two matches since the pass is run once per function. \ No newline at end of file diff --git a/llvm/test/Transforms/Reassociate/cse-pairs.ll b/llvm/test/Transforms/Reassociate/cse-pairs.ll new file mode 100644 index 0000000000000..33397ea050c41 --- /dev/null +++ b/llvm/test/Transforms/Reassociate/cse-pairs.ll @@ -0,0 +1,83 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -reassociate -early-cse -S < %s | FileCheck %s + +@num1 = local_unnamed_addr global i32 0, align 4 +@num2 = local_unnamed_addr global i32 0, align 4 +@num3 = local_unnamed_addr global i32 0, align 4 +@num4 = local_unnamed_addr global i32 0, align 4 + +define signext i32 @twoPairs(i32 signext %0, i32 signext %1, i32 signext %2, i32 signext %3, i32 signext %4) { +; CHECK-LABEL: @twoPairs( +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP2:%.*]], [[TMP0:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP1:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[TMP3:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP4:%.*]] +; CHECK-NEXT: store i32 [[TMP9]], i32* @num1, align 4 +; CHECK-NEXT: store i32 [[TMP6]], i32* @num2, align 4 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw i32 [[TMP3]], [[TMP1]] +; CHECK-NEXT: store i32 [[TMP10]], i32* @num3, align 4 +; CHECK-NEXT: ret i32 undef +; + %6 = add i32 %2, %0 + %7 = add i32 %6, %1 + %8 = add i32 %7, %3 + %9 = add i32 %8, %4 + store i32 %9, i32* @num1, align 4 + %10 = add nsw i32 %2, %0 + store i32 %10, i32* @num2, align 4 + %11 = add nsw i32 %3, %1 + store i32 %11, i32* @num3, align 4 + ret i32 undef +} + +define signext i32 @twoPairsAllOpInPairs(i32 signext %0, i32 signext %1, i32 signext %2, i32 signext %3) { +; CHECK-LABEL: @twoPairsAllOpInPairs( +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP2:%.*]], [[TMP1:%.*]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], [[TMP0:%.*]] +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[TMP3:%.*]] +; CHECK-NEXT: store i32 [[TMP7]], i32* @num1, align 4 +; CHECK-NEXT: store i32 [[TMP5]], i32* @num2, align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add nsw i32 [[TMP3]], [[TMP0]] +; CHECK-NEXT: store i32 [[TMP8]], i32* @num3, align 4 +; CHECK-NEXT: ret i32 undef +; + %5 = add nsw i32 %0, %1 + %6 = add nsw i32 %5, %2 + %7 = add nsw i32 %6, %3 + store i32 %7, i32* @num1, align 4 + %8 = add nsw i32 %1, %2 + store i32 %8, i32* @num2, align 4 + %9 = add nsw i32 %0, %3 + store i32 %9, i32* @num3, align 4 + ret i32 undef +} + +define signext i32 
@threePairsAllOpInPairs(i32 signext %0, i32 signext %1, i32 signext %2, i32 signext %3, i32 signext %4, i32 signext %5) { +; CHECK-LABEL: @threePairsAllOpInPairs( +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP3:%.*]], [[TMP2:%.*]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[TMP0:%.*]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP1:%.*]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP4:%.*]] +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP5:%.*]] +; CHECK-NEXT: store i32 [[TMP11]], i32* @num1, align 4 +; CHECK-NEXT: [[TMP12:%.*]] = add nsw i32 [[TMP5]], [[TMP0]] +; CHECK-NEXT: store i32 [[TMP12]], i32* @num2, align 4 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP4]], [[TMP1]] +; CHECK-NEXT: store i32 [[TMP13]], i32* @num3, align 4 +; CHECK-NEXT: store i32 [[TMP7]], i32* @num4, align 4 +; CHECK-NEXT: ret i32 undef +; + %7 = add nsw i32 %0, %1 + %8 = add nsw i32 %7, %2 + %9 = add nsw i32 %8, %3 + %10 = add nsw i32 %9, %4 + %11 = add nsw i32 %10, %5 + store i32 %11, i32* @num1, align 4 + %12 = add nsw i32 %0, %5 + store i32 %12, i32* @num2, align 4 + %13 = add nsw i32 %1, %4 + store i32 %13, i32* @num3, align 4 + %14 = add nsw i32 %2, %3 + store i32 %14, i32* @num4, align 4 + ret i32 undef +} diff --git a/llvm/test/Transforms/SCCP/apint-xor.ll b/llvm/test/Transforms/SCCP/apint-xor.ll new file mode 100644 index 0000000000000..26d31d77ac716 --- /dev/null +++ b/llvm/test/Transforms/SCCP/apint-xor.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -sccp -S | FileCheck %s + +; Test some XOR simplifications / range propagation. +define void@xor1(i1 %cmp) { +; CHECK-LABEL: @xor1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[CMP:%.*]], label [[IF_TRUE:%.*]], label [[END:%.*]] +; CHECK: if.true: +; CHECK-NEXT: br label [[END]] +; CHECK: end: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret void +; +entry: + br i1 %cmp, label %if.true, label %end + +if.true: + br label %end + +end: + %p = phi i32 [ 11, %entry ], [ 11, %if.true] + %xor.1 = xor i32 %p, %p + %c.1 = icmp eq i32 %xor.1, 0 + call void @use(i1 %c.1) + %c.2 = icmp eq i32 %xor.1, 10 + call void @use(i1 %c.2) + %xor.2 = xor i32 %p, 1 + %c.3 = icmp eq i32 %xor.2, 11 + call void @use(i1 %c.3) + %c.4 = icmp eq i32 %xor.2, 10 + call void @use(i1 %c.4) + ret void +} + +declare void @use(i1) diff --git a/llvm/test/Transforms/SCCP/binaryops-range-special-cases.ll b/llvm/test/Transforms/SCCP/binaryops-range-special-cases.ll index a354ae0d4d5d1..f7fdc1ed5e64b 100644 --- a/llvm/test/Transforms/SCCP/binaryops-range-special-cases.ll +++ b/llvm/test/Transforms/SCCP/binaryops-range-special-cases.ll @@ -7,16 +7,13 @@ define void @sdiv1_cmp_constants(i32 %x) { ; CHECK-NEXT: [[D:%.*]] = sdiv i32 1, [[X:%.*]] ; CHECK-NEXT: [[C_0:%.*]] = icmp slt i32 0, [[D]] ; CHECK-NEXT: call void @use(i1 [[C_0]]) -; CHECK-NEXT: [[C_1:%.*]] = icmp slt i32 1, [[D]] -; CHECK-NEXT: call void @use(i1 [[C_1]]) -; CHECK-NEXT: [[C_2:%.*]] = icmp slt i32 2, [[D]] -; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: call void @use(i1 false) ; CHECK-NEXT: [[C_3:%.*]] = icmp eq i32 1, [[D]] ; CHECK-NEXT: call void @use(i1 [[C_3]]) ; CHECK-NEXT: [[C_4:%.*]] = icmp eq i32 0, [[D]] ; CHECK-NEXT: call void @use(i1 [[C_4]]) -; CHECK-NEXT: [[C_5:%.*]] = icmp eq i32 2, [[D]] -; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: call void 
@use(i1 false) ; CHECK-NEXT: ret void ; %d = sdiv i32 1, %x @@ -47,8 +44,7 @@ define void @sdiv1_cmp_range_1(i32 %x, i1 %c) { ; CHECK: bb3: ; CHECK-NEXT: [[P:%.*]] = phi i32 [ 1, [[BB1]] ], [ 2, [[BB2]] ] ; CHECK-NEXT: [[D:%.*]] = sdiv i32 1, [[X:%.*]] -; CHECK-NEXT: [[C_0:%.*]] = icmp slt i32 [[P]], [[D]] -; CHECK-NEXT: call void @use(i1 [[C_0]]) +; CHECK-NEXT: call void @use(i1 false) ; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[P]], [[D]] ; CHECK-NEXT: call void @use(i1 [[C_1]]) ; CHECK-NEXT: ret void @@ -80,10 +76,8 @@ define void @sdiv1_cmp_range_2(i32 %x, i1 %c) { ; CHECK: bb3: ; CHECK-NEXT: [[P:%.*]] = phi i32 [ 3, [[BB1]] ], [ 2, [[BB2]] ] ; CHECK-NEXT: [[D:%.*]] = sdiv i32 1, [[X:%.*]] -; CHECK-NEXT: [[C_0:%.*]] = icmp slt i32 [[P]], [[D]] -; CHECK-NEXT: call void @use(i1 [[C_0]]) -; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[P]], [[D]] -; CHECK-NEXT: call void @use(i1 [[C_1]]) +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: call void @use(i1 false) ; CHECK-NEXT: ret void ; br i1 %c, label %bb1, label %bb2 diff --git a/llvm/test/Transforms/SCCP/conditions-iter-order.ll b/llvm/test/Transforms/SCCP/conditions-iter-order.ll new file mode 100644 index 0000000000000..5e7e4f31b4c2d --- /dev/null +++ b/llvm/test/Transforms/SCCP/conditions-iter-order.ll @@ -0,0 +1,79 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -ipsccp -S %s | FileCheck %s + +declare noalias i8* @malloc(i64) + +; Make sure we can eliminate `%tmp17 = icmp ult i32 %tmp10, 3`. + +declare void @use(i1) + +define internal i32* @spam(i32* %arg) { +; CHECK-LABEL: @spam( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[TMP:%.*]] = call i8* @malloc(i64 10368) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP]] to i32* +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[ARG:%.*]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[ARG]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne i32* [[TMP7]], null +; CHECK-NEXT: br i1 [[TMP10]], label [[BB17:%.*]], label [[BB13:%.*]] +; CHECK: bb13: +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[ARG]], i32 2 +; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 1 +; CHECK-NEXT: br label [[BB30:%.*]] +; CHECK: bb17: +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i32 [[TMP6]], [[TMP5]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[ARG]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = load i32, i32* [[TMP19]], align 8 +; CHECK-NEXT: br i1 [[TMP18]], label [[BB30]], label [[BB13]] +; CHECK: bb30: +; CHECK-NEXT: ret i32* [[TMP1]] +; +bb: + %tmp = call i8* @malloc(i64 10368) + %tmp1 = bitcast i8* %tmp to i32* + %tmp4 = getelementptr inbounds i32, i32* %arg, i32 0 + %tmp5 = load i32, i32* %tmp4, align 8 + %tmp6 = add i32 %tmp5, 1 + %tmp7 = getelementptr inbounds i32, i32* %arg, i32 1 + %tmp10 = icmp ne i32* %tmp7, null + br i1 %tmp10, label %bb17, label %bb13 + +bb13: + %tmp14 = getelementptr inbounds i32, i32* %arg, i32 2 + %tmp15 = load i32, i32* %tmp14, align 8 + %tmp16 = add i32 %tmp15, 1 + br label %bb30 + +bb17: + %tmp18 = icmp eq i32 %tmp6, %tmp5 + %tmp19 = getelementptr inbounds i32, i32* %arg, i32 3 + %tmp20 = load i32, i32* %tmp19, align 8 + br i1 %tmp18, label %bb30, label %bb13 + +bb30: + ret i32* %tmp1 +} + +define void @spam.1(i32* %arg) { +bb: + %tmp = alloca i8*, align 8 + %tmp4 = call i32* @spam(i32* %arg) + br label %bb6 + +bb6: 
; preds = %bb5 + %tmp7 = getelementptr inbounds i32, i32* %tmp4, i32 1 + %tmp10 = load i32, i32* %tmp7, align 8 + %tmp11 = icmp ne i32 %tmp10, 0 + br i1 %tmp11, label %bb6, label %bb15 + +bb15: ; preds = %bb12 + %tmp17 = icmp ult i32 %tmp10, 3 + call void @use(i1 %tmp17) + br i1 %tmp17, label %bb6, label %bb24 + +bb24: + ret void +} diff --git a/llvm/test/Transforms/SCCP/conditions-ranges.ll b/llvm/test/Transforms/SCCP/conditions-ranges.ll new file mode 100644 index 0000000000000..345e521d35c76 --- /dev/null +++ b/llvm/test/Transforms/SCCP/conditions-ranges.ll @@ -0,0 +1,712 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -ipsccp -S | FileCheck %s + +declare void @use(i1) + +define void @f1(i32 %a, i32 %b) { +; CHECK-LABEL: @f1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_255:%.*]] = and i32 [[A:%.*]], 255 +; CHECK-NEXT: [[A_2:%.*]] = add i32 [[A_255]], 20 +; CHECK-NEXT: [[BC:%.*]] = icmp ugt i32 [[B:%.*]], [[A_2]] +; CHECK-NEXT: br i1 [[BC]], label [[TRUE:%.*]], label [[FALSE:%.*]] +; CHECK: true: +; CHECK-NEXT: [[F_1:%.*]] = icmp eq i32 [[B]], 0 +; CHECK-NEXT: call void @use(i1 [[F_1]]) +; CHECK-NEXT: [[F_2:%.*]] = icmp eq i32 [[B]], 20 +; CHECK-NEXT: call void @use(i1 [[F_2]]) +; CHECK-NEXT: [[F_3:%.*]] = icmp ult i32 [[B]], 20 +; CHECK-NEXT: call void @use(i1 [[F_3]]) +; CHECK-NEXT: [[T_1:%.*]] = icmp ugt i32 [[B]], 5 +; CHECK-NEXT: call void @use(i1 [[T_1]]) +; CHECK-NEXT: [[T_2:%.*]] = icmp ne i32 [[B]], 20 +; CHECK-NEXT: call void @use(i1 [[T_2]]) +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[B]], 21 +; CHECK-NEXT: call void @use(i1 [[C_1]]) +; CHECK-NEXT: [[C_2:%.*]] = icmp ugt i32 [[B]], 21 +; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i32 [[B]], 255 +; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: ret void +; CHECK: false: +; CHECK-NEXT: [[F_4:%.*]] = icmp eq i32 [[B]], 276 +; CHECK-NEXT: call void @use(i1 [[F_4]]) +; CHECK-NEXT: [[F_5:%.*]] = icmp ugt i32 [[B]], 275 +; CHECK-NEXT: call void @use(i1 [[F_5]]) +; CHECK-NEXT: [[T_3:%.*]] = icmp ne i32 [[B]], 276 +; CHECK-NEXT: call void @use(i1 [[T_3]]) +; CHECK-NEXT: [[T_4:%.*]] = icmp ule i32 [[B]], 275 +; CHECK-NEXT: call void @use(i1 [[T_4]]) +; CHECK-NEXT: [[C_4:%.*]] = icmp eq i32 [[B]], 21 +; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: [[C_5:%.*]] = icmp eq i32 [[B]], 275 +; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: ret void +; +entry: + %a.255 = and i32 %a, 255 + %a.2 = add i32 %a.255, 20 + %bc = icmp ugt i32 %b, %a.2 + br i1 %bc, label %true, label %false + +true: ; %b in [21, 0) + ; Conditions below are false. + %f.1 = icmp eq i32 %b, 0 + call void @use(i1 %f.1) + %f.2 = icmp eq i32 %b, 20 + call void @use(i1 %f.2) + %f.3 = icmp ult i32 %b, 20 + call void @use(i1 %f.3) + + ; Conditions below are true. + %t.1 = icmp ugt i32 %b, 5 + call void @use(i1 %t.1) + %t.2 = icmp ne i32 %b, 20 + call void @use(i1 %t.2) + + ; Conditions below cannot be simplified. + %c.1 = icmp eq i32 %b, 21 + call void @use(i1 %c.1) + %c.2 = icmp ugt i32 %b, 21 + call void @use(i1 %c.2) + %c.3 = icmp ugt i32 %b, 255 + call void @use(i1 %c.3) + ret void + +false: ;%b in [0, 276) + ; Conditions below are false; + %f.4 = icmp eq i32 %b, 276 + call void @use(i1 %f.4) + %f.5 = icmp ugt i32 %b, 275 + call void @use(i1 %f.5) + + ; Conditions below are true; + %t.3 = icmp ne i32 %b, 276 + call void @use(i1 %t.3) + %t.4 = icmp ule i32 %b, 275 + call void @use(i1 %t.4) + + ; Conditions below cannot be simplified. 
+ %c.4 = icmp eq i32 %b, 21 + call void @use(i1 %c.4) + %c.5 = icmp eq i32 %b, 275 + call void @use(i1 %c.5) + ret void +} + +; TODO: Use information %a != 0 in false branch. +define void @f2_ptr(i8* %a, i8* %b) { +; CHECK-LABEL: @f2_ptr( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[BC:%.*]] = icmp eq i8* [[A:%.*]], null +; CHECK-NEXT: br i1 [[BC]], label [[TRUE:%.*]], label [[FALSE:%.*]] +; CHECK: true: +; CHECK-NEXT: call void @use(i1 false) +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8* null, [[B:%.*]] +; CHECK-NEXT: call void @use(i1 [[C_1]]) +; CHECK-NEXT: ret void +; CHECK: false: +; CHECK-NEXT: [[F_2:%.*]] = icmp eq i8* [[A]], null +; CHECK-NEXT: call void @use(i1 [[F_2]]) +; CHECK-NEXT: [[T_2:%.*]] = icmp ne i8* [[A]], null +; CHECK-NEXT: call void @use(i1 [[T_2]]) +; CHECK-NEXT: [[C_2:%.*]] = icmp eq i8* [[A]], [[B]] +; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: ret void +; +entry: + %bc = icmp eq i8* %a, null + br i1 %bc, label %true, label %false + +true: ; %a == 0 + %f.1 = icmp ne i8* %a, null + call void @use(i1 %f.1) + + %t.1 = icmp eq i8* %a, null + call void @use(i1 %t.1) + + %c.1 = icmp eq i8* %a, %b + call void @use(i1 %c.1) + ret void + +false: ; %a != 0 + %f.2 = icmp eq i8* %a, null + call void @use(i1 %f.2) + + %t.2 = icmp ne i8* %a, null + call void @use(i1 %t.2) + + %c.2 = icmp eq i8* %a, %b + call void @use(i1 %c.2) + ret void +} + +define i8* @f3(i8* %a, i8* %b, i1 %c) { +; CHECK-LABEL: @f3( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i8* [[A:%.*]], null +; CHECK-NEXT: br i1 [[C_1]], label [[TRUE:%.*]], label [[FALSE:%.*]] +; CHECK: true: +; CHECK-NEXT: br i1 [[C:%.*]], label [[TRUE_2:%.*]], label [[FALSE_2:%.*]] +; CHECK: true.2: +; CHECK-NEXT: br label [[EXIT_2:%.*]] +; CHECK: false.2: +; CHECK-NEXT: br label [[EXIT_2]] +; CHECK: exit.2: +; CHECK-NEXT: [[P:%.*]] = phi i8* [ null, [[TRUE_2]] ], [ [[B:%.*]], [[FALSE_2]] ] +; CHECK-NEXT: ret i8* [[P]] +; CHECK: false: +; CHECK-NEXT: ret i8* null +; +entry: + %c.1 = icmp eq i8* %a, null + br i1 %c.1, label %true, label %false + +true: + br i1 %c, label %true.2, label %false.2 + +true.2: + br label %exit.2 + +false.2: + br label %exit.2 + +exit.2: + %p = phi i8* [ %a, %true.2 ], [ %b, %false.2 ] + ret i8* %p + +false: + ret i8* null +} + +define i32 @f5(i64 %sz) { +; CHECK-LABEL: @f5( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 4088, [[SZ:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]] +; CHECK: cond.true: +; CHECK-NEXT: [[DIV:%.*]] = udiv i64 4088, [[SZ]] +; CHECK-NEXT: br label [[COND_END]] +; CHECK: cond.end: +; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[DIV]], [[COND_TRUE]] ], [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[COND]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; +entry: + %cmp = icmp ugt i64 4088, %sz + br i1 %cmp, label %cond.true, label %cond.end + +cond.true: ; preds = %entry + %div = udiv i64 4088, %sz + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %div, %cond.true ], [ 1, %entry ] + %conv = trunc i64 %cond to i32 + ret i32 %conv +} + +define void @f6(i32 %b) { +; CHECK-LABEL: @f6( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C_1:%.*]] = icmp ugt i32 [[B:%.*]], 20 +; CHECK-NEXT: br i1 [[C_1]], label [[TRUE:%.*]], label [[FALSE:%.*]] +; CHECK: true: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret void +; CHECK: false: +; CHECK-NEXT: ret void +; +entry: + %a = add i32 10, 10 + %c.1 = icmp ugt i32 %b, %a + br i1 %c.1, label %true, label %false + 
+true: + %c.2 = icmp eq i32 %a, 20 + call void @use(i1 %c.2) + ret void + +false: + ret void +} + +define void @loop.1() { +entry: + br label %for.cond + +for.cond: ; preds = %for.cond.cleanup13, %if.then + %i.0 = phi i32 [ 0, %entry ], [ %inc27, %for.cond.cleanup13 ] + %cmp9 = icmp sle i32 %i.0, 3 + br i1 %cmp9, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + ret void + +for.body: ; preds = %for.cond + br label %for.cond11 + +for.cond11: ; preds = %arrayctor.cont21, %for.body + br label %for.cond.cleanup13 + +for.cond.cleanup13: ; preds = %for.cond11 + %inc27 = add nsw i32 %i.0, 1 + br label %for.cond +} + + +define void @loop() { +; CHECK-LABEL: @loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_COND:%.*]] +; CHECK: for.cond: +; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC27:%.*]], [[FOR_COND_CLEANUP13:%.*]] ] +; CHECK-NEXT: [[CMP9:%.*]] = icmp sle i32 [[I_0]], 3 +; CHECK-NEXT: br i1 [[CMP9]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: br label [[FOR_COND11:%.*]] +; CHECK: for.cond11: +; CHECK-NEXT: [[J_0:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[INC:%.*]], [[FOR_BODY14:%.*]] ] +; CHECK-NEXT: [[CMP12:%.*]] = icmp slt i32 [[J_0]], 2 +; CHECK-NEXT: br i1 [[CMP12]], label [[FOR_BODY14]], label [[FOR_COND_CLEANUP13]] +; CHECK: for.cond.cleanup13: +; CHECK-NEXT: [[INC27]] = add nsw i32 [[I_0]], 1 +; CHECK-NEXT: br label [[FOR_COND]] +; CHECK: for.body14: +; CHECK-NEXT: [[INC]] = add nsw i32 [[J_0]], 1 +; CHECK-NEXT: br label [[FOR_COND11]] +; +entry: + br label %for.cond + +for.cond: ; preds = %for.cond.cleanup13, %if.then + %i.0 = phi i32 [ 0, %entry ], [ %inc27, %for.cond.cleanup13 ] + %cmp9 = icmp sle i32 %i.0, 3 + br i1 %cmp9, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + ret void + +for.body: ; preds = %for.cond + br label %for.cond11 + +for.cond11: ; preds = %arrayctor.cont21, %for.body + %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.body14 ] + %cmp12 = icmp slt i32 %j.0, 2 + br i1 %cmp12, label %for.body14, label %for.cond.cleanup13 + +for.cond.cleanup13: ; preds = %for.cond11 + %inc27 = add nsw i32 %i.0, 1 + br label %for.cond + +for.body14: + %inc = add nsw i32 %j.0, 1 + br label %for.cond11 +} + +define i32 @udiv_1(i64 %sz) { +; CHECK-LABEL: @udiv_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 4088, [[SZ:%.*]] +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]] +; CHECK: cond.true: +; CHECK-NEXT: [[DIV:%.*]] = udiv i64 4088, [[SZ]] +; CHECK-NEXT: br label [[COND_END]] +; CHECK: cond.end: +; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[DIV]], [[COND_TRUE]] ], [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[COND]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; +entry: + %cmp = icmp ugt i64 4088, %sz + br i1 %cmp, label %cond.true, label %cond.end + +cond.true: ; preds = %entry + %div = udiv i64 4088, %sz + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %div, %cond.true ], [ 1, %entry ] + %conv = trunc i64 %cond to i32 + ret i32 %conv +} + +; Same as @udiv_1, but with the condition switched. 
+define i32 @udiv_2(i64 %sz) { +; CHECK-LABEL: @udiv_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[SZ:%.*]], 4088 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_END:%.*]] +; CHECK: cond.true: +; CHECK-NEXT: [[DIV:%.*]] = udiv i64 4088, [[SZ]] +; CHECK-NEXT: br label [[COND_END]] +; CHECK: cond.end: +; CHECK-NEXT: [[COND:%.*]] = phi i64 [ [[DIV]], [[COND_TRUE]] ], [ 1, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CONV:%.*]] = trunc i64 [[COND]] to i32 +; CHECK-NEXT: ret i32 [[CONV]] +; +entry: + %cmp = icmp ugt i64 %sz, 4088 + br i1 %cmp, label %cond.true, label %cond.end + +cond.true: ; preds = %entry + %div = udiv i64 4088, %sz + br label %cond.end + +cond.end: ; preds = %entry, %cond.true + %cond = phi i64 [ %div, %cond.true ], [ 1, %entry ] + %conv = trunc i64 %cond to i32 + ret i32 %conv +} + +; Test with 2 unrelated nested conditions. +define void @f7_nested_conds(i32* %a, i32 %b) { +; CHECK-LABEL: @f7_nested_conds( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_V:%.*]] = load i32, i32* [[A:%.*]] +; CHECK-NEXT: [[C_1:%.*]] = icmp ne i32 [[A_V]], 0 +; CHECK-NEXT: br i1 [[C_1]], label [[TRUE:%.*]], label [[FALSE:%.*]] +; CHECK: false: +; CHECK-NEXT: br i1 true, label [[TRUE_2:%.*]], label [[TRUE]] +; CHECK: true.2: +; CHECK-NEXT: call void @use(i1 true) +; CHECK-NEXT: ret void +; CHECK: true: +; CHECK-NEXT: store i32 [[B:%.*]], i32* [[A]] +; CHECK-NEXT: ret void +; +entry: + %a.v = load i32, i32* %a + %c.1 = icmp ne i32 %a.v, 0 + br i1 %c.1, label %true, label %false + +false: + %c.2 = icmp ult i32 %a.v, 3 + br i1 %c.2, label %true.2, label %true + +true.2: + %c.3 = icmp eq i32 %a.v, 0 + call void @use(i1 %c.3) + ret void + +true: + store i32 %b, i32* %a + ret void +} + +; Test with 2 related nested conditions (%b > [20, 276) && %b < 255). 
+define void @f8_nested_conds(i32 %a, i32 %b) { +; CHECK-LABEL: @f8_nested_conds( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A_255:%.*]] = and i32 [[A:%.*]], 255 +; CHECK-NEXT: [[A_2:%.*]] = add i32 [[A_255]], 20 +; CHECK-NEXT: [[BC_1:%.*]] = icmp ugt i32 [[B:%.*]], [[A_2]] +; CHECK-NEXT: br i1 [[BC_1]], label [[TRUE:%.*]], label [[FALSE:%.*]] +; CHECK: true: +; CHECK-NEXT: [[BC_2:%.*]] = icmp ult i32 [[B]], 255 +; CHECK-NEXT: br i1 [[BC_2]], label [[TRUE_2:%.*]], label [[FALSE_2:%.*]] +; CHECK: true.2: +; CHECK-NEXT: [[F_1:%.*]] = icmp eq i32 [[B]], 0 +; CHECK-NEXT: call void @use(i1 [[F_1]]) +; CHECK-NEXT: [[F_2:%.*]] = icmp eq i32 [[B]], 20 +; CHECK-NEXT: call void @use(i1 [[F_2]]) +; CHECK-NEXT: [[F_3:%.*]] = icmp ult i32 [[B]], 20 +; CHECK-NEXT: call void @use(i1 [[F_3]]) +; CHECK-NEXT: [[F_4:%.*]] = icmp eq i32 [[B]], 255 +; CHECK-NEXT: call void @use(i1 [[F_4]]) +; CHECK-NEXT: [[F_5:%.*]] = icmp ugt i32 [[B]], 255 +; CHECK-NEXT: call void @use(i1 [[F_5]]) +; CHECK-NEXT: [[T_1:%.*]] = icmp ugt i32 [[B]], 5 +; CHECK-NEXT: call void @use(i1 [[T_1]]) +; CHECK-NEXT: [[T_2:%.*]] = icmp ne i32 [[B]], 20 +; CHECK-NEXT: call void @use(i1 [[T_2]]) +; CHECK-NEXT: [[T_3:%.*]] = icmp ult i32 [[B]], 255 +; CHECK-NEXT: call void @use(i1 [[T_3]]) +; CHECK-NEXT: [[T_4:%.*]] = icmp ne i32 [[B]], 300 +; CHECK-NEXT: call void @use(i1 [[T_4]]) +; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[B]], 21 +; CHECK-NEXT: call void @use(i1 [[C_1]]) +; CHECK-NEXT: [[C_2:%.*]] = icmp ugt i32 [[B]], 21 +; CHECK-NEXT: call void @use(i1 [[C_2]]) +; CHECK-NEXT: [[C_3:%.*]] = icmp ugt i32 [[B]], 34 +; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: ret void +; CHECK: false.2: +; CHECK-NEXT: [[F_6:%.*]] = icmp eq i32 [[B]], 254 +; CHECK-NEXT: call void @use(i1 [[F_6]]) +; CHECK-NEXT: [[F_7:%.*]] = icmp ult i32 [[B]], 255 +; CHECK-NEXT: call void @use(i1 [[F_7]]) +; CHECK-NEXT: [[T_5:%.*]] = icmp ne i32 [[B]], 254 +; CHECK-NEXT: call void @use(i1 [[T_5]]) +; CHECK-NEXT: [[T_6:%.*]] = icmp uge i32 [[B]], 255 +; CHECK-NEXT: call void @use(i1 [[T_6]]) +; CHECK-NEXT: [[C_4:%.*]] = icmp eq i32 [[B]], 255 +; CHECK-NEXT: call void @use(i1 [[C_4]]) +; CHECK-NEXT: [[C_5:%.*]] = icmp ne i32 [[B]], 275 +; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: ret void +; CHECK: false: +; CHECK-NEXT: ret void +; +entry: + %a.255 = and i32 %a, 255 + %a.2 = add i32 %a.255, 20 + %bc.1 = icmp ugt i32 %b, %a.2 + br i1 %bc.1, label %true, label %false + +true: ; %b in [21, 0) + %bc.2 = icmp ult i32 %b, 255 + br i1 %bc.2, label %true.2, label %false.2 + +true.2: ; %b in [21, 255) + ; Conditions below are false. + %f.1 = icmp eq i32 %b, 0 + call void @use(i1 %f.1) + %f.2 = icmp eq i32 %b, 20 + call void @use(i1 %f.2) + %f.3 = icmp ult i32 %b, 20 + call void @use(i1 %f.3) + %f.4 = icmp eq i32 %b, 255 + call void @use(i1 %f.4) + %f.5 = icmp ugt i32 %b, 255 + call void @use(i1 %f.5) + + + ; Conditions below are true. + %t.1 = icmp ugt i32 %b, 5 + call void @use(i1 %t.1) + %t.2 = icmp ne i32 %b, 20 + call void @use(i1 %t.2) + %t.3 = icmp ult i32 %b, 255 + call void @use(i1 %t.3) + %t.4 = icmp ne i32 %b, 300 + call void @use(i1 %t.4) + + ; Conditions below cannot be simplified. 
+  %c.1 = icmp eq i32 %b, 21
+  call void @use(i1 %c.1)
+  %c.2 = icmp ugt i32 %b, 21
+  call void @use(i1 %c.2)
+  %c.3 = icmp ugt i32 %b, 34
+  call void @use(i1 %c.3)
+  ret void
+
+false.2: ; %b in [255, 0)
+  ; Conditions below are false.
+  %f.6 = icmp eq i32 %b, 254
+  call void @use(i1 %f.6)
+  %f.7 = icmp ult i32 %b, 255
+  call void @use(i1 %f.7)
+
+  ; Conditions below are true.
+  %t.5 = icmp ne i32 %b, 254
+  call void @use(i1 %t.5)
+  %t.6 = icmp uge i32 %b, 255
+  call void @use(i1 %t.6)
+
+  ; Conditions below cannot be simplified.
+  %c.4 = icmp eq i32 %b, 255
+  call void @use(i1 %c.4)
+  %c.5 = icmp ne i32 %b, 275
+  call void @use(i1 %c.5)
+  ret void
+
+false:
+  ret void
+}
+
+; Test with nested conditions where the second condition is more limiting than the first one.
+define void @f9_nested_conds(i32 %a, i32 %b) {
+; CHECK-LABEL: @f9_nested_conds(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[BC_1:%.*]] = icmp ugt i32 [[B:%.*]], 10
+; CHECK-NEXT: br i1 [[BC_1]], label [[TRUE:%.*]], label [[FALSE:%.*]]
+; CHECK: true:
+; CHECK-NEXT: [[F_1:%.*]] = icmp eq i32 [[B]], 0
+; CHECK-NEXT: call void @use(i1 [[F_1]])
+; CHECK-NEXT: [[F_2:%.*]] = icmp eq i32 [[B]], 10
+; CHECK-NEXT: call void @use(i1 [[F_2]])
+; CHECK-NEXT: [[T_1:%.*]] = icmp ugt i32 [[B]], 5
+; CHECK-NEXT: call void @use(i1 [[T_1]])
+; CHECK-NEXT: [[T_2:%.*]] = icmp ne i32 [[B]], 10
+; CHECK-NEXT: call void @use(i1 [[T_2]])
+; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[B]], 11
+; CHECK-NEXT: call void @use(i1 [[C_1]])
+; CHECK-NEXT: [[C_2:%.*]] = icmp ugt i32 [[B]], 11
+; CHECK-NEXT: call void @use(i1 [[C_2]])
+; CHECK-NEXT: [[BC_2:%.*]] = icmp ugt i32 [[B]], 20
+; CHECK-NEXT: br i1 [[BC_2]], label [[TRUE_2:%.*]], label [[FALSE_2:%.*]]
+; CHECK: true.2:
+; CHECK-NEXT: [[F_3:%.*]] = icmp eq i32 [[B]], 11
+; CHECK-NEXT: call void @use(i1 [[F_3]])
+; CHECK-NEXT: [[F_4:%.*]] = icmp eq i32 [[B]], 20
+; CHECK-NEXT: call void @use(i1 [[F_4]])
+; CHECK-NEXT: [[T_3:%.*]] = icmp ugt i32 [[B]], 11
+; CHECK-NEXT: call void @use(i1 [[T_3]])
+; CHECK-NEXT: [[T_4:%.*]] = icmp ne i32 [[B]], 20
+; CHECK-NEXT: call void @use(i1 [[T_4]])
+; CHECK-NEXT: [[C_3:%.*]] = icmp eq i32 [[B]], 21
+; CHECK-NEXT: call void @use(i1 [[C_3]])
+; CHECK-NEXT: [[C_4:%.*]] = icmp ugt i32 [[B]], 21
+; CHECK-NEXT: call void @use(i1 [[C_4]])
+; CHECK-NEXT: [[C_5:%.*]] = icmp ugt i32 [[B]], 34
+; CHECK-NEXT: call void @use(i1 [[C_5]])
+; CHECK-NEXT: ret void
+; CHECK: false.2:
+; CHECK-NEXT: [[F_5:%.*]] = icmp eq i32 [[B]], 21
+; CHECK-NEXT: call void @use(i1 [[F_5]])
+; CHECK-NEXT: [[F_6:%.*]] = icmp ugt i32 [[B]], 21
+; CHECK-NEXT: call void @use(i1 [[F_6]])
+; CHECK-NEXT: [[F_7:%.*]] = icmp ne i32 [[B]], 5
+; CHECK-NEXT: call void @use(i1 [[F_7]])
+; CHECK-NEXT: [[T_5:%.*]] = icmp ne i32 [[B]], 21
+; CHECK-NEXT: call void @use(i1 [[T_5]])
+; CHECK-NEXT: [[T_6:%.*]] = icmp ult i32 [[B]], 21
+; CHECK-NEXT: call void @use(i1 [[T_6]])
+; CHECK-NEXT: [[T_7:%.*]] = icmp ne i32 [[B]], 5
+; CHECK-NEXT: call void @use(i1 [[T_7]])
+; CHECK-NEXT: [[C_6:%.*]] = icmp eq i32 [[B]], 11
+; CHECK-NEXT: call void @use(i1 [[C_6]])
+; CHECK-NEXT: [[C_7:%.*]] = icmp ne i32 [[B]], 15
+; CHECK-NEXT: call void @use(i1 [[C_7]])
+; CHECK-NEXT: ret void
+; CHECK: false:
+; CHECK-NEXT: ret void
+;
+entry:
+  %bc.1 = icmp ugt i32 %b, 10
+  br i1 %bc.1, label %true, label %false
+
+true: ; %b in [11, 0)
+  ; Conditions below are false.
+  %f.1 = icmp eq i32 %b, 0
+  call void @use(i1 %f.1)
+  %f.2 = icmp eq i32 %b, 10
+  call void @use(i1 %f.2)
+
+  ; Conditions below are true.
+  %t.1 = icmp ugt i32 %b, 5
+  call void @use(i1 %t.1)
+  %t.2 = icmp ne i32 %b, 10
+  call void @use(i1 %t.2)
+
+  ; Conditions below cannot be simplified.
+  %c.1 = icmp eq i32 %b, 11
+  call void @use(i1 %c.1)
+  %c.2 = icmp ugt i32 %b, 11
+  call void @use(i1 %c.2)
+
+  %bc.2 = icmp ugt i32 %b, 20
+  br i1 %bc.2, label %true.2, label %false.2
+
+true.2: ; %b in [21, 0)
+  ; Conditions below are false.
+  %f.3 = icmp eq i32 %b, 11
+  call void @use(i1 %f.3)
+  %f.4 = icmp eq i32 %b, 20
+  call void @use(i1 %f.4)
+
+  ; Conditions below are true.
+  %t.3 = icmp ugt i32 %b, 11
+  call void @use(i1 %t.3)
+  %t.4 = icmp ne i32 %b, 20
+  call void @use(i1 %t.4)
+
+  ; Conditions below cannot be simplified.
+  %c.3 = icmp eq i32 %b, 21
+  call void @use(i1 %c.3)
+  %c.4 = icmp ugt i32 %b, 21
+  call void @use(i1 %c.4)
+  %c.5 = icmp ugt i32 %b, 34
+  call void @use(i1 %c.5)
+  ret void
+
+false.2: ; %b in [11, 21)
+  ; Conditions below are false.
+  %f.5 = icmp eq i32 %b, 21
+  call void @use(i1 %f.5)
+  %f.6 = icmp ugt i32 %b, 21
+  call void @use(i1 %f.6)
+  %f.7 = icmp ne i32 %b, 5
+  call void @use(i1 %f.7)
+
+  ; Conditions below are true.
+  %t.5 = icmp ne i32 %b, 21
+  call void @use(i1 %t.5)
+  %t.6 = icmp ult i32 %b, 21
+  call void @use(i1 %t.6)
+  %t.7 = icmp ne i32 %b, 5
+  call void @use(i1 %t.7)
+
+  ; Conditions below cannot be simplified.
+  %c.6 = icmp eq i32 %b, 11
+  call void @use(i1 %c.6)
+  %c.7 = icmp ne i32 %b, 15
+  call void @use(i1 %c.7)
+  ret void
+
+false:
+  ret void
+}
+
+
+; Test with a nested condition that does not further restrict the range of %b.255, which is already [0, 256) from the 'and'.
+define void @f10_cond_does_not_restrict_range(i32 %a, i32 %b) {
+; CHECK-LABEL: @f10_cond_does_not_restrict_range(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[B_255:%.*]] = and i32 [[B:%.*]], 255
+; CHECK-NEXT: br label [[TRUE:%.*]]
+; CHECK: true:
+; CHECK-NEXT: [[F_1:%.*]] = icmp eq i32 [[B_255]], 256
+; CHECK-NEXT: call void @use(i1 [[F_1]])
+; CHECK-NEXT: [[F_2:%.*]] = icmp eq i32 [[B_255]], 300
+; CHECK-NEXT: call void @use(i1 [[F_2]])
+; CHECK-NEXT: [[T_1:%.*]] = icmp ult i32 [[B_255]], 256
+; CHECK-NEXT: call void @use(i1 [[T_1]])
+; CHECK-NEXT: [[T_2:%.*]] = icmp ult i32 [[B_255]], 300
+; CHECK-NEXT: call void @use(i1 [[T_2]])
+; CHECK-NEXT: [[T_3:%.*]] = icmp ne i32 [[B_255]], 256
+; CHECK-NEXT: call void @use(i1 [[T_3]])
+; CHECK-NEXT: [[T_4:%.*]] = icmp ne i32 [[B_255]], 300
+; CHECK-NEXT: call void @use(i1 [[T_4]])
+; CHECK-NEXT: [[C_1:%.*]] = icmp eq i32 [[B_255]], 11
+; CHECK-NEXT: call void @use(i1 [[C_1]])
+; CHECK-NEXT: [[C_2:%.*]] = icmp ugt i32 [[B_255]], 30
+; CHECK-NEXT: call void @use(i1 [[C_2]])
+; CHECK-NEXT: ret void
+;
+entry:
+  %b.255 = and i32 %b, 255
+  %bc.1 = icmp ult i32 %b.255, 300
+  br i1 %bc.1, label %true, label %false
+
+true: ; %b in [0, 256)
+  ; Conditions below are false.
+  %f.1 = icmp eq i32 %b.255, 256
+  call void @use(i1 %f.1)
+  %f.2 = icmp eq i32 %b.255, 300
+  call void @use(i1 %f.2)
+
+  ; Conditions below are true.
+  %t.1 = icmp ult i32 %b.255, 256
+  call void @use(i1 %t.1)
+  %t.2 = icmp ult i32 %b.255, 300
+  call void @use(i1 %t.2)
+  %t.3 = icmp ne i32 %b.255, 256
+  call void @use(i1 %t.3)
+  %t.4 = icmp ne i32 %b.255, 300
+  call void @use(i1 %t.4)
+
+  ; Conditions below cannot be simplified.
+ %c.1 = icmp eq i32 %b.255, 11 + call void @use(i1 %c.1) + %c.2 = icmp ugt i32 %b.255, 30 + call void @use(i1 %c.2) + ret void + +false: + ret void +} diff --git a/llvm/test/Transforms/SCCP/ip-constant-ranges.ll b/llvm/test/Transforms/SCCP/ip-constant-ranges.ll index dbaedaa739bf4..891bebf105b82 100644 --- a/llvm/test/Transforms/SCCP/ip-constant-ranges.ll +++ b/llvm/test/Transforms/SCCP/ip-constant-ranges.ll @@ -59,12 +59,9 @@ entry: ret i32 %res.2 } -; x is overdefined, because constant ranges are only used for parameter -; values. ; CHECK-LABEL: f3 -; CHECK: %cmp = icmp sgt i32 %x, 300 -; CHECK: %res = select i1 %cmp, i32 1, i32 2 -; CHECK: ret i32 %res +; CHECK-LABEL: entry: +; CHECK: ret i32 undef define internal i32 @f3(i32 %x) { entry: %cmp = icmp sgt i32 %x, 300 @@ -83,7 +80,7 @@ if.true: end: %res = phi i32 [ 0, %entry], [ 1, %if.true ] %call1 = tail call i32 @f3(i32 %res) - ret i32 %call1 + ret i32 2 } ; CHECK-LABEL: f4 diff --git a/llvm/test/Transforms/SCCP/ip-ranges-binaryops.ll b/llvm/test/Transforms/SCCP/ip-ranges-binaryops.ll new file mode 100644 index 0000000000000..cef41bbdb584b --- /dev/null +++ b/llvm/test/Transforms/SCCP/ip-ranges-binaryops.ll @@ -0,0 +1,134 @@ +; RUN: opt < %s -ipsccp -S | FileCheck %s + +; x = [10, 21), y = [100, 201) +; x + y = [110, 221) +define internal i1 @f.add(i32 %x, i32 %y) { +; CHECK-LABEL: define internal i1 @f.add(i32 %x, i32 %y) { +; CHECK-NEXT: %a.1 = add i32 %x, %y +; CHECK-NEXT: %c.2 = icmp sgt i32 %a.1, 219 +; CHECK-NEXT: %c.4 = icmp slt i32 %a.1, 111 +; CHECK-NEXT: %c.5 = icmp eq i32 %a.1, 150 +; CHECK-NEXT: %c.6 = icmp slt i32 %a.1, 150 +; CHECK-NEXT: %res.1 = add i1 false, %c.2 +; CHECK-NEXT: %res.2 = add i1 %res.1, false +; CHECK-NEXT: %res.3 = add i1 %res.2, %c.4 +; CHECK-NEXT: %res.4 = add i1 %res.3, %c.5 +; CHECK-NEXT: %res.5 = add i1 %res.4, %c.6 +; CHECK-NEXT: ret i1 %res.5 +; + %a.1 = add i32 %x, %y + %c.1 = icmp sgt i32 %a.1, 220 + %c.2 = icmp sgt i32 %a.1, 219 + %c.3 = icmp slt i32 %a.1, 110 + %c.4 = icmp slt i32 %a.1, 111 + %c.5 = icmp eq i32 %a.1, 150 + %c.6 = icmp slt i32 %a.1, 150 + %res.1 = add i1 %c.1, %c.2 + %res.2 = add i1 %res.1, %c.3 + %res.3 = add i1 %res.2, %c.4 + %res.4 = add i1 %res.3, %c.5 + %res.5 = add i1 %res.4, %c.6 + ret i1 %res.5 +} + +define i1 @caller.add() { +; CHECK-LABEL: define i1 @caller.add() { +; CHECK-NEXT: %call.1 = tail call i1 @f.add(i32 10, i32 100) +; CHECK-NEXT: %call.2 = tail call i1 @f.add(i32 20, i32 200) +; CHECK-NEXT: %res = and i1 %call.1, %call.2 +; CHECK-NEXT: ret i1 %res +; + %call.1 = tail call i1 @f.add(i32 10, i32 100) + %call.2 = tail call i1 @f.add(i32 20, i32 200) + %res = and i1 %call.1, %call.2 + ret i1 %res +} + + +; x = [10, 21), y = [100, 201) +; x - y = [-190, -79) +define internal i1 @f.sub(i32 %x, i32 %y) { +; CHECK-LABEL: define internal i1 @f.sub(i32 %x, i32 %y) { +; CHECK-NEXT: %a.1 = sub i32 %x, %y +; CHECK-NEXT: %c.2 = icmp sgt i32 %a.1, -81 +; CHECK-NEXT: %c.4 = icmp slt i32 %a.1, -189 +; CHECK-NEXT: %c.5 = icmp eq i32 %a.1, -150 +; CHECK-NEXT: %c.6 = icmp slt i32 %a.1, -150 +; CHECK-NEXT: %res.1 = add i1 false, %c.2 +; CHECK-NEXT: %res.2 = add i1 %res.1, false +; CHECK-NEXT: %res.3 = add i1 %res.2, %c.4 +; CHECK-NEXT: %res.4 = add i1 %res.3, %c.5 +; CHECK-NEXT: %res.5 = add i1 %res.4, %c.6 +; CHECK-NEXT: ret i1 %res.5 +; + %a.1 = sub i32 %x, %y + %c.1 = icmp sgt i32 %a.1, -80 + %c.2 = icmp sgt i32 %a.1, -81 + %c.3 = icmp slt i32 %a.1, -190 + %c.4 = icmp slt i32 %a.1, -189 + %c.5 = icmp eq i32 %a.1, -150 + %c.6 = icmp slt i32 %a.1, -150 + %res.1 = add i1 
%c.1, %c.2 + %res.2 = add i1 %res.1, %c.3 + %res.3 = add i1 %res.2, %c.4 + %res.4 = add i1 %res.3, %c.5 + %res.5 = add i1 %res.4, %c.6 + ret i1 %res.5 +} + +define i1 @caller.sub() { +; CHECK-LABEL: define i1 @caller.sub() { +; CHECK-NEXT: %call.1 = tail call i1 @f.sub(i32 10, i32 100) +; CHECK-NEXT: %call.2 = tail call i1 @f.sub(i32 20, i32 200) +; CHECK-NEXT: %res = and i1 %call.1, %call.2 +; CHECK-NEXT: ret i1 %res +; + %call.1 = tail call i1 @f.sub(i32 10, i32 100) + %call.2 = tail call i1 @f.sub(i32 20, i32 200) + %res = and i1 %call.1, %call.2 + ret i1 %res +} + +; x = [10, 21), y = [100, 201) +; x * y = [1000, 4001) +define internal i1 @f.mul(i32 %x, i32 %y) { +; CHECK-LABEL: define internal i1 @f.mul(i32 %x, i32 %y) { +; CHECK-NEXT: %a.1 = mul i32 %x, %y +; CHECK-NEXT: %c.2 = icmp sgt i32 %a.1, 3999 +; CHECK-NEXT: %c.4 = icmp slt i32 %a.1, 1001 +; CHECK-NEXT: %c.5 = icmp eq i32 %a.1, 1500 +; CHECK-NEXT: %c.6 = icmp slt i32 %a.1, 1500 +; CHECK-NEXT: %res.1 = add i1 false, %c.2 +; CHECK-NEXT: %res.2 = add i1 %res.1, false +; CHECK-NEXT: %res.3 = add i1 %res.2, %c.4 +; CHECK-NEXT: %res.4 = add i1 %res.3, %c.5 +; CHECK-NEXT: %res.5 = add i1 %res.4, %c.6 +; CHECK-NEXT: ret i1 %res.5 +; + %a.1 = mul i32 %x, %y + %c.1 = icmp sgt i32 %a.1, 4000 + %c.2 = icmp sgt i32 %a.1, 3999 + %c.3 = icmp slt i32 %a.1, 1000 + %c.4 = icmp slt i32 %a.1, 1001 + %c.5 = icmp eq i32 %a.1, 1500 + %c.6 = icmp slt i32 %a.1, 1500 + %res.1 = add i1 %c.1, %c.2 + %res.2 = add i1 %res.1, %c.3 + %res.3 = add i1 %res.2, %c.4 + %res.4 = add i1 %res.3, %c.5 + %res.5 = add i1 %res.4, %c.6 + ret i1 %res.5 +} + +define i1 @caller.mul() { +; CHECK-LABEL: define i1 @caller.mul() { +; CHECK-NEXT: %call.1 = tail call i1 @f.mul(i32 10, i32 100) +; CHECK-NEXT: %call.2 = tail call i1 @f.mul(i32 20, i32 200) +; CHECK-NEXT: %res = and i1 %call.1, %call.2 +; CHECK-NEXT: ret i1 %res +; + %call.1 = tail call i1 @f.mul(i32 10, i32 100) + %call.2 = tail call i1 @f.mul(i32 20, i32 200) + %res = and i1 %call.1, %call.2 + ret i1 %res +} diff --git a/llvm/test/Transforms/SCCP/ip-ranges-phis.ll b/llvm/test/Transforms/SCCP/ip-ranges-phis.ll new file mode 100644 index 0000000000000..a4a59d9c0f816 --- /dev/null +++ b/llvm/test/Transforms/SCCP/ip-ranges-phis.ll @@ -0,0 +1,215 @@ +; RUN: opt < %s -ipsccp -S | FileCheck %s + +define internal i32 @f1(i32 %x) { +; CHECK-LABEL: define internal i32 @f1( +; CHECK-NEXT: ret i32 undef +; + %cmp = icmp sgt i32 %x, 300 + %res = select i1 %cmp, i32 1, i32 2 + ret i32 %res +} + +; %res is a constant range [0, 2) from a PHI node. 
+define i32 @caller1(i1 %cmp) { +; CHECK-LABEL: define i32 @caller1( +; CHECK-LABEL: entry: +; CHECK-NEXT: br i1 %cmp, label %if.true, label %end + +; CHECK-LABEL: if.true: +; CHECK-NEXT: br label %end + +; CHECK-LABEL: end: +; CHECK-NEXT: %res = phi i32 [ 0, %entry ], [ 1, %if.true ] +; CHECK-NEXT: %call1 = tail call i32 @f1(i32 %res) +; CHECK-NEXT: ret i32 2 +; +entry: + br i1 %cmp, label %if.true, label %end + +if.true: + br label %end + +end: + %res = phi i32 [ 0, %entry], [ 1, %if.true ] + %call1 = tail call i32 @f1(i32 %res) + ret i32 %call1 +} + +define internal i32 @f2(i32 %x, i32 %y, i32 %z, i1 %cmp.1, i1 %cmp.2) { +; CHECK-LABEL: define internal i32 @f2( +; CHECK-LABEL: entry: +; CHECK-NEXT: br i1 %cmp.1, label %if.true.1, label %end + +; CHECK-LABEL: if.true.1: +; CHECK-NEXT: br i1 %cmp.2, label %if.true.2, label %end + +; CHECK-LABEL: if.true.2: +; CHECK-NEXT: br label %end + +; CHECK-LABEL: end: +; CHECK-NEXT: %p = phi i32 [ %x, %entry ], [ %y, %if.true.1 ], [ %z, %if.true.2 ] +; CHECK-NEXT: %c.1 = icmp sgt i32 %p, 5 +; CHECK-NEXT: %c.2 = icmp eq i32 %p, 0 +; CHECK-NEXT: %c.3 = icmp slt i32 %p, 0 +; CHECK-NEXT: %v.1 = select i1 %c.1, i32 10, i32 100 +; CHECK-NEXT: %v.2 = select i1 %c.2, i32 20, i32 200 +; CHECK-NEXT: %v.3 = select i1 %c.3, i32 30, i32 300 +; CHECK-NEXT: %r.1 = add i32 %v.1, %v.2 +; CHECK-NEXT: %r.2 = add i32 %r.1, %v.3 +; CHECK-NEXT: %r.3 = add i32 %r.2, 400 +; CHECK-NEXT: %r.4 = add i32 %r.3, 50 +; CHECK-NEXT: %r.5 = add i32 %r.4, 60 +; CHECK-NEXT: %r.6 = add i32 %r.4, 700 +; CHECK-NEXT: ret i32 %r.6 +; +entry: + br i1 %cmp.1, label %if.true.1, label %end + +if.true.1: + br i1 %cmp.2, label %if.true.2, label %end + +if.true.2: + br label %end + +end: + %p = phi i32 [ %x, %entry ], [ %y, %if.true.1 ], [ %z, %if.true.2 ] + %c.1 = icmp sgt i32 %p, 5 + %c.2 = icmp eq i32 %p, 0 + %c.3 = icmp slt i32 %p, 0 + %c.4 = icmp sgt i32 %p, 10 + %c.5 = icmp sle i32 %p, 10 + %c.6 = icmp sgt i32 %p, -11 + %c.7 = icmp slt i32 %p, -11 + %v.1 = select i1 %c.1, i32 10, i32 100 + %v.2 = select i1 %c.2, i32 20, i32 200 + %v.3 = select i1 %c.3, i32 30, i32 300 + %v.4 = select i1 %c.4, i32 40, i32 400 + %v.5 = select i1 %c.5, i32 50, i32 500 + %v.6 = select i1 %c.6, i32 60, i32 600 + %v.7 = select i1 %c.7, i32 70, i32 700 + %r.1 = add i32 %v.1, %v.2 + %r.2 = add i32 %r.1, %v.3 + %r.3 = add i32 %r.2, %v.4 + %r.4 = add i32 %r.3, %v.5 + %r.5 = add i32 %r.4, %v.6 + %r.6 = add i32 %r.4, %v.7 + ret i32 %r.6 +} + +define i32 @caller2(i1 %cmp.1, i1 %cmp.2) { +; CHECK-LABEL: define i32 @caller2(i1 %cmp.1, i1 %cmp.2) { +; CHECK-LABEL: entry: +; CHECK-NEXT: br i1 %cmp.1, label %if.true, label %end + +; CHECK-LABEL: if.true: ; preds = %entry +; CHECK-NEXT: br label %end + +; CHECK-LABEL: end: ; preds = %if.true, %entry +; CHECK-NEXT: %p1 = phi i32 [ 0, %entry ], [ 1, %if.true ] +; CHECK-NEXT: %p2 = phi i32 [ 1, %entry ], [ -10, %if.true ] +; CHECK-NEXT: %p3 = phi i32 [ 1, %entry ], [ 10, %if.true ] +; CHECK-NEXT: %call1 = tail call i32 @f2(i32 %p1, i32 %p2, i32 %p3, i1 %cmp.1, i1 %cmp.2) +; CHECK-NEXT: ret i32 %call1 +; + +entry: + br i1 %cmp.1, label %if.true, label %end + +if.true: + br label %end + +end: + %p1 = phi i32 [ 0, %entry], [ 1, %if.true ] + %p2 = phi i32 [ 1, %entry], [ -10, %if.true ] + %p3 = phi i32 [ 1, %entry], [ 10, %if.true ] + %call1 = tail call i32 @f2(i32 %p1, i32 %p2, i32 %p3, i1 %cmp.1, i1 %cmp.2) + ret i32 %call1 +} + +define internal i32 @f3(i32 %x, i32 %y, i1 %cmp.1) { +; CHECK-LABEL: define internal i32 @f3(i32 %x, i32 %y, i1 %cmp.1) { +; CHECK-LABEL: entry: +; 
CHECK-NEXT: br i1 %cmp.1, label %if.true.1, label %end + +; CHECK-LABEL: if.true.1: ; preds = %entry +; CHECK-NEXT: br label %end + +; CHECK-LABEL: end: ; preds = %if.true.1, %entry +; CHECK-NEXT: %p = phi i32 [ %x, %entry ], [ %y, %if.true.1 ] +; CHECK-NEXT: %c.1 = icmp sgt i32 %p, 5 +; CHECK-NEXT: %c.2 = icmp eq i32 %p, 0 +; CHECK-NEXT: %c.3 = icmp slt i32 %p, 0 +; CHECK-NEXT: %c.4 = icmp sgt i32 %p, 10 +; CHECK-NEXT: %c.5 = icmp sle i32 %p, 10 +; CHECK-NEXT: %c.6 = icmp sgt i32 %p, -11 +; CHECK-NEXT: %c.7 = icmp slt i32 %p, -11 +; CHECK-NEXT: %v.1 = select i1 %c.1, i32 10, i32 100 +; CHECK-NEXT: %v.2 = select i1 %c.2, i32 20, i32 200 +; CHECK-NEXT: %v.3 = select i1 %c.3, i32 30, i32 300 +; CHECK-NEXT: %v.4 = select i1 %c.4, i32 40, i32 400 +; CHECK-NEXT: %v.5 = select i1 %c.5, i32 50, i32 500 +; CHECK-NEXT: %v.6 = select i1 %c.6, i32 60, i32 600 +; CHECK-NEXT: %v.7 = select i1 %c.7, i32 70, i32 700 +; CHECK-NEXT: %r.1 = add i32 %v.1, %v.2 +; CHECK-NEXT: %r.2 = add i32 %r.1, %v.3 +; CHECK-NEXT: %r.3 = add i32 %r.2, %v.4 +; CHECK-NEXT: %r.4 = add i32 %r.3, %v.5 +; CHECK-NEXT: %r.5 = add i32 %r.4, %v.6 +; CHECK-NEXT: %r.6 = add i32 %r.4, %v.7 +; CHECK-NEXT: ret i32 %r.6 +; +entry: + br i1 %cmp.1, label %if.true.1, label %end + +if.true.1: + br label %end + +end: + %p = phi i32 [ %x, %entry ], [ %y, %if.true.1 ] + %c.1 = icmp sgt i32 %p, 5 + %c.2 = icmp eq i32 %p, 0 + %c.3 = icmp slt i32 %p, 0 + %c.4 = icmp sgt i32 %p, 10 + %c.5 = icmp sle i32 %p, 10 + %c.6 = icmp sgt i32 %p, -11 + %c.7 = icmp slt i32 %p, -11 + %v.1 = select i1 %c.1, i32 10, i32 100 + %v.2 = select i1 %c.2, i32 20, i32 200 + %v.3 = select i1 %c.3, i32 30, i32 300 + %v.4 = select i1 %c.4, i32 40, i32 400 + %v.5 = select i1 %c.5, i32 50, i32 500 + %v.6 = select i1 %c.6, i32 60, i32 600 + %v.7 = select i1 %c.7, i32 70, i32 700 + %r.1 = add i32 %v.1, %v.2 + %r.2 = add i32 %r.1, %v.3 + %r.3 = add i32 %r.2, %v.4 + %r.4 = add i32 %r.3, %v.5 + %r.5 = add i32 %r.4, %v.6 + %r.6 = add i32 %r.4, %v.7 + ret i32 %r.6 +} + +define i32 @caller3(i32 %y, i1 %cmp.1) { +; CHECK-LABEL: define i32 @caller3(i32 %y, i1 %cmp.1) { +; CHECK-LABEL: entry: +; CHECK-NEXT: br i1 %cmp.1, label %if.true, label %end + +; CHECK-LABEL: if.true: +; CHECK-NEXT: br label %end + +; CHECK-LABEL: end: +; CHECK-NEXT: %p1 = phi i32 [ 0, %entry ], [ 5, %if.true ] +; CHECK-NEXT: %call1 = tail call i32 @f3(i32 %p1, i32 %y, i1 %cmp.1) +; CHECK-NEXT: ret i32 %call1 +; +entry: + br i1 %cmp.1, label %if.true, label %end + +if.true: + br label %end + +end: + %p1 = phi i32 [ 0, %entry], [ 5, %if.true ] + %call1 = tail call i32 @f3(i32 %p1, i32 %y, i1 %cmp.1) + ret i32 %call1 +} diff --git a/llvm/test/Transforms/SCCP/phis.ll b/llvm/test/Transforms/SCCP/phis.ll new file mode 100644 index 0000000000000..dac8273ab2d1d --- /dev/null +++ b/llvm/test/Transforms/SCCP/phis.ll @@ -0,0 +1,81 @@ +; RUN: opt < %s -sccp -S | FileCheck %s + +define i1 @float.1(i1 %cmp) { +; CHECK-LABEL: define i1 @float.1(i1 %cmp) { + +; CHECK-LABEL: end: +; CHECK-NEXT: ret i1 true +; +entry: + br i1 %cmp, label %if.true, label %end + +if.true: + br label %end + +end: + %p = phi float [ 1.0, %entry ], [ 1.0, %if.true] + %c = fcmp ueq float %p, 1.0 + ret i1 %c +} + +define i1 @float.2(i1 %cmp) { +; CHECK-LABEL: define i1 @float.2(i1 %cmp) { + +; CHECK-LABEL: end: +; CHECK-NEXT: %p = phi float [ 1.000000e+00, %entry ], [ 2.000000e+00, %if.true ] +; CHECK-NEXT: %c = fcmp ueq float %p, 1.000000e+00 +; CHECK-NEXT: ret i1 %c +; +entry: + br i1 %cmp, label %if.true, label %end + +if.true: + br label %end + +end: + 
%p = phi float [ 1.0, %entry ], [ 2.0, %if.true] + %c = fcmp ueq float %p, 1.0 + ret i1 %c +} + +define i1 @float.3(float %f, i1 %cmp) { +; CHECK-LABEL: define i1 @float.3(float %f, i1 %cmp) + +; CHECK-LABEL: end: +; CHECK-NEXT: %p = phi float [ 1.000000e+00, %entry ], [ %f, %if.true ] +; CHECK-NEXT: %c = fcmp ueq float %p, 1.000000e+00 +; CHECK-NEXT: ret i1 %c +; +entry: + br i1 %cmp, label %if.true, label %end + +if.true: + br label %end + +end: + %p = phi float [ 1.0, %entry ], [ %f, %if.true] + %c = fcmp ueq float %p, 1.0 + ret i1 %c +} + + +define i1 @float.4_unreachable(float %f, i1 %cmp) { +; CHECK-LABEL: define i1 @float.4_unreachable(float %f, i1 %cmp) + +; CHECK-LABEL: end: +; CHECK-NEXT: ret i1 false +; +entry: + br i1 %cmp, label %if.true, label %end + +if.true: + br label %end + +dead: + br label %end + +end: + %p = phi float [ 1.0, %entry ], [ 1.0, %if.true], [ %f, %dead ] + %c = fcmp une float %p, 1.0 + ret i1 %c +} diff --git a/llvm/test/Transforms/SCCP/range-and.ll b/llvm/test/Transforms/SCCP/range-and.ll index e948274dd8f14..3b349692db704 100644 --- a/llvm/test/Transforms/SCCP/range-and.ll +++ b/llvm/test/Transforms/SCCP/range-and.ll @@ -8,16 +8,13 @@ define void @and_range_limit(i64 %a) { ; CHECK-NEXT: [[R:%.*]] = and i64 [[A:%.*]], 255 ; CHECK-NEXT: [[C_0:%.*]] = icmp slt i64 [[R]], 15 ; CHECK-NEXT: call void @use(i1 [[C_0]]) -; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[R]], 256 -; CHECK-NEXT: call void @use(i1 [[C_1]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: [[C_2:%.*]] = icmp eq i64 [[R]], 100 ; CHECK-NEXT: call void @use(i1 [[C_2]]) -; CHECK-NEXT: [[C_3:%.*]] = icmp eq i64 [[R]], 300 -; CHECK-NEXT: call void @use(i1 [[C_3]]) +; CHECK-NEXT: call void @use(i1 false) ; CHECK-NEXT: [[C_4:%.*]] = icmp ne i64 [[R]], 100 ; CHECK-NEXT: call void @use(i1 [[C_4]]) -; CHECK-NEXT: [[C_5:%.*]] = icmp ne i64 [[R]], 300 -; CHECK-NEXT: call void @use(i1 [[C_5]]) +; CHECK-NEXT: call void @use(i1 true) ; CHECK-NEXT: ret void ; %r = and i64 %a, 255 @@ -144,8 +141,7 @@ define i1 @constant_range_and_255_100(i1 %cond, i64 %a) { ; CHECK: bb3: ; CHECK-NEXT: [[P:%.*]] = phi i64 [ [[R_1]], [[BB1]] ], [ [[R_2]], [[BB2]] ] ; CHECK-NEXT: [[P_AND:%.*]] = and i64 [[P]], 512 -; CHECK-NEXT: [[C:%.*]] = icmp ult i64 [[P_AND]], 256 -; CHECK-NEXT: ret i1 [[C]] +; CHECK-NEXT: ret i1 true ; entry: br i1 %cond, label %bb1, label %bb2 diff --git a/llvm/test/Transforms/SCCP/vector-bitcast.ll b/llvm/test/Transforms/SCCP/vector-bitcast.ll index b032085083c60..35312034c65b8 100644 --- a/llvm/test/Transforms/SCCP/vector-bitcast.ll +++ b/llvm/test/Transforms/SCCP/vector-bitcast.ll @@ -2,7 +2,8 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" -; CHECK: store volatile <2 x i64> zeroinitializer, <2 x i64>* %p +; FIXME: Add back support for handling special values of vector/fp types. 
+; CHECK: store volatile <2 x i64> %and.i119.i, <2 x i64>* %p ; rdar://11324230 define void @foo(<2 x i64>* %p) nounwind { diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll new file mode 100644 index 0000000000000..84d25fe79a27c --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/round.ll @@ -0,0 +1,38 @@ +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX7 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -slp-vectorizer %s | FileCheck -check-prefixes=GCN,GFX8 %s + +; GCN-LABEL: @round_v2f16( +; GFX7: call half @llvm.round.f16( +; GFX7: call half @llvm.round.f16( + +; GFX8: call <2 x half> @llvm.round.v2f16( +define <2 x half> @round_v2f16(<2 x half> %arg) { +bb: + %tmp = extractelement <2 x half> %arg, i64 0 + %tmp1 = tail call half @llvm.round.half(half %tmp) + %tmp2 = insertelement <2 x half> undef, half %tmp1, i64 0 + %tmp3 = extractelement <2 x half> %arg, i64 1 + %tmp4 = tail call half @llvm.round.half(half %tmp3) + %tmp5 = insertelement <2 x half> %tmp2, half %tmp4, i64 1 + ret <2 x half> %tmp5 +} + +; TODO: Should probably not really be vectorizing this +; GCN-LABEL: @round_v2f32( +; GCN: call <2 x float> @llvm.round.v2f32 +define <2 x float> @round_v2f32(<2 x float> %arg) { +bb: + %tmp = extractelement <2 x float> %arg, i64 0 + %tmp1 = tail call float @llvm.round.f32(float %tmp) + %tmp2 = insertelement <2 x float> undef, float %tmp1, i64 0 + %tmp3 = extractelement <2 x float> %arg, i64 1 + %tmp4 = tail call float @llvm.round.f32(float %tmp3) + %tmp5 = insertelement <2 x float> %tmp2, float %tmp4, i64 1 + ret <2 x float> %tmp5 +} + +declare half @llvm.round.half(half) #0 +declare float @llvm.round.f32(float) #0 + +attributes #0 = { nounwind readnone speculatable willreturn } diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll index 7fdc953922357..dd1a210824d5d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_unrolled.ll @@ -69,31 +69,51 @@ entry: ; } define i32 @test_mul(i32* nocapture readonly %p) { -; CHECK-LABEL: @test_mul( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 -; CHECK-NEXT: [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 -; CHECK-NEXT: [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]] -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 -; CHECK-NEXT: [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]] -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4 -; CHECK-NEXT: [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]] -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4 -; CHECK-NEXT: [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]] -; CHECK-NEXT: 
[[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4 -; CHECK-NEXT: [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]] -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4 -; CHECK-NEXT: [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]] -; CHECK-NEXT: ret i32 [[MUL_714]] +; AVX-LABEL: @test_mul( +; AVX-NEXT: entry: +; AVX-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1 +; AVX-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; AVX-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; AVX-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; AVX-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; AVX-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; AVX-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; AVX-NEXT: [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>* +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4 +; AVX-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX:%.*]] = mul <8 x i32> [[TMP1]], [[RDX_SHUF]] +; AVX-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX2:%.*]] = mul <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; AVX-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; AVX-NEXT: [[BIN_RDX4:%.*]] = mul <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; AVX-NEXT: ret i32 [[TMP2]] +; +; SSE-LABEL: @test_mul( +; SSE-NEXT: entry: +; SSE-NEXT: [[TMP0:%.*]] = load i32, i32* [[P:%.*]], align 4 +; SSE-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 1 +; SSE-NEXT: [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX_1]], align 4 +; SSE-NEXT: [[MUL_18:%.*]] = mul i32 [[TMP1]], [[TMP0]] +; SSE-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2 +; SSE-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX_2]], align 4 +; SSE-NEXT: [[MUL_29:%.*]] = mul i32 [[TMP2]], [[MUL_18]] +; SSE-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3 +; SSE-NEXT: [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX_3]], align 4 +; SSE-NEXT: [[MUL_310:%.*]] = mul i32 [[TMP3]], [[MUL_29]] +; SSE-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4 +; SSE-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX_4]], align 4 +; SSE-NEXT: [[MUL_411:%.*]] = mul i32 [[TMP4]], [[MUL_310]] +; SSE-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5 +; SSE-NEXT: [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_5]], align 4 +; SSE-NEXT: [[MUL_512:%.*]] = mul i32 [[TMP5]], [[MUL_411]] +; SSE-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6 +; SSE-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_6]], align 4 +; SSE-NEXT: [[MUL_613:%.*]] = mul i32 [[TMP6]], [[MUL_512]] +; SSE-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7 +; SSE-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX_7]], align 4 +; SSE-NEXT: [[MUL_714:%.*]] = mul i32 [[TMP7]], [[MUL_613]] +; SSE-NEXT: ret i32 [[MUL_714]] ; entry: %0 = load i32, i32* %p, align 4 diff --git a/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll b/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll 
index e72413e8b308b..93fe8a2019079 100644 --- a/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll +++ b/llvm/test/Transforms/SimplifyCFG/PowerPC/prefer-fma.ll @@ -11,14 +11,15 @@ define double @_Z3fooRdS_S_S_(double* dereferenceable(8) %x, double* dereference ; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq double [[TMP0]], 0.000000e+00 ; CHECK-NEXT: [[TMP1:%.*]] = load double, double* [[X:%.*]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[A:%.*]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = fmul fast double [[TMP1]], [[TMP2]] ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; CHECK: if.then: +; CHECK-NEXT: [[TMP3:%.*]] = fmul fast double [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[MUL:%.*]] = fadd fast double 1.000000e+00, [[TMP3]] ; CHECK-NEXT: store double [[MUL]], double* [[Y]], align 8 ; CHECK-NEXT: br label [[IF_END:%.*]] ; CHECK: if.else: -; CHECK-NEXT: [[SUB1:%.*]] = fsub fast double [[TMP3]], [[TMP0]] +; CHECK-NEXT: [[MUL1:%.*]] = fmul fast double [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[SUB1:%.*]] = fsub fast double [[MUL1]], [[TMP0]] ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr double, double* [[Y]], i32 1 ; CHECK-NEXT: store double [[SUB1]], double* [[GEP1]], align 8 ; CHECK-NEXT: br label [[IF_END]] diff --git a/llvm/test/Transforms/SimplifyCFG/sink-common-code.ll b/llvm/test/Transforms/SimplifyCFG/sink-common-code.ll index 8254a49e28fd0..00329ae5d640f 100644 --- a/llvm/test/Transforms/SimplifyCFG/sink-common-code.ll +++ b/llvm/test/Transforms/SimplifyCFG/sink-common-code.ll @@ -291,12 +291,12 @@ entry: if.then: %dummy = add i32 %w, 5 - %sv1 = call i32 @llvm.ctlz.i32(i32 %x) + %sv1 = call i32 @llvm.ctlz.i32(i32 %x, i1 false) br label %if.end if.else: %dummy1 = add i32 %w, 6 - %sv2 = call i32 @llvm.cttz.i32(i32 %x) + %sv2 = call i32 @llvm.cttz.i32(i32 %x, i1 false) br label %if.end if.end: @@ -304,8 +304,8 @@ if.end: ret i32 1 } -declare i32 @llvm.ctlz.i32(i32 %x) readnone -declare i32 @llvm.cttz.i32(i32 %x) readnone +declare i32 @llvm.ctlz.i32(i32 %x, i1 immarg) readnone +declare i32 @llvm.cttz.i32(i32 %x, i1 immarg) readnone ; CHECK-LABEL: test12 ; CHECK: call i32 @llvm.ctlz @@ -769,6 +769,120 @@ if.end: ; CHECK-NOT: exact ; CHECK: } + +; FIXME: Should turn into select +; CHECK-LABEL: @allow_intrinsic_remove_constant( +; CHECK: %sv1 = call float @llvm.fma.f32(float %dummy, float 2.000000e+00, float 1.000000e+00) +; CHECK: %sv2 = call float @llvm.fma.f32(float 2.000000e+00, float %dummy1, float 1.000000e+00) +define float @allow_intrinsic_remove_constant(i1 zeroext %flag, float %w, float %x, float %y) { +entry: + br i1 %flag, label %if.then, label %if.else + +if.then: + %dummy = fadd float %w, 4.0 + %sv1 = call float @llvm.fma.f32(float %dummy, float 2.0, float 1.0) + br label %if.end + +if.else: + %dummy1 = fadd float %w, 8.0 + %sv2 = call float @llvm.fma.f32(float 2.0, float %dummy1, float 1.0) + br label %if.end + +if.end: + %p = phi float [ %sv1, %if.then ], [ %sv2, %if.else ] + ret float %p +} + +declare float @llvm.fma.f32(float, float, float) + +; CHECK-LABEL: @no_remove_constant_immarg( +; CHECK: call i32 @llvm.ctlz.i32(i32 %x, i1 true) +; CHECK: call i32 @llvm.ctlz.i32(i32 %x, i1 false) +define i32 @no_remove_constant_immarg(i1 zeroext %flag, i32 %w, i32 %x, i32 %y) { +entry: + br i1 %flag, label %if.then, label %if.else + +if.then: + %dummy = add i32 %w, 5 + %sv1 = call i32 @llvm.ctlz.i32(i32 %x, i1 true) + br label %if.end + +if.else: + %dummy1 = add i32 %w, 6 + %sv2 = call i32 @llvm.ctlz.i32(i32 %x, i1 false) + br label %if.end + +if.end: + %p = phi i32 
[ %sv1, %if.then ], [ %sv2, %if.else ] + ret i32 1 +} + +declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i1) + +; Make sure a memcpy size isn't replaced with a variable +; CHECK-LABEL: @no_replace_memcpy_size( +; CHECK: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i1 false) +; CHECK: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 4096, i1 false) +define void @no_replace_memcpy_size(i1 zeroext %flag, i8 addrspace(1)* %dst, i8 addrspace(1)* %src) { +entry: + br i1 %flag, label %if.then, label %if.else + +if.then: + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i1 false) + br label %if.end + +if.else: + call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 4096, i1 false) + br label %if.end + +if.end: + ret void +} + +declare void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i1) + +; Make sure a memmove size isn't replaced with a variable +; CHECK-LABEL: @no_replace_memmove_size( +; CHECK: call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i1 false) +; CHECK: call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 4096, i1 false) +define void @no_replace_memmove_size(i1 zeroext %flag, i8 addrspace(1)* %dst, i8 addrspace(1)* %src) { +entry: + br i1 %flag, label %if.then, label %if.else + +if.then: + call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i1 false) + br label %if.end + +if.else: + call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 4096, i1 false) + br label %if.end + +if.end: + ret void +} + +declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i1) + +; Make sure a memset size isn't replaced with a variable +; CHECK-LABEL: @no_replace_memset_size( +; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 0, i64 1024, i1 false) +; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 0, i64 4096, i1 false) +define void @no_replace_memset_size(i1 zeroext %flag, i8 addrspace(1)* %dst) { +entry: + br i1 %flag, label %if.then, label %if.else + +if.then: + call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 0, i64 1024, i1 false) + br label %if.end + +if.else: + call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 0, i64 4096, i1 false) + br label %if.end + +if.end: + ret void +} + ; Check that simplifycfg doesn't sink and merge inline-asm instructions. 
define i32 @test_inline_asm1(i32 %c, i32 %r6) { @@ -913,7 +1027,6 @@ if.end: declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) - ; CHECK: ![[$TBAA]] = !{![[TYPE:[0-9]]], ![[TYPE]], i64 0} ; CHECK: ![[TYPE]] = !{!"float", ![[TEXT:[0-9]]]} ; CHECK: ![[TEXT]] = !{!"an example type tree"} diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll index 56635bdff8c5f..cc3d35ff9d71a 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll @@ -415,3 +415,71 @@ define float @ext14_ext15_fmul_v16f32(<16 x float> %x) { %r = fadd float %e0, %e1 ret float %r } + +define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @ins_bo_ext_ext( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[TMP3]], i32 3 +; CHECK-NEXT: ret <4 x float> [[V3]] +; + %a2 = extractelement <4 x float> %a, i32 2 + %a3 = extractelement <4 x float> %a, i32 3 + %a23 = fadd float %a2, %a3 + %v3 = insertelement <4 x float> %b, float %a23, i32 3 + ret <4 x float> %v3 +} + +define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @ins_bo_ext_ext_uses( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[A]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; CHECK-NEXT: call void @use_f32(float [[TMP3]]) +; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[TMP3]], i32 3 +; CHECK-NEXT: ret <4 x float> [[V3]] +; + %a2 = extractelement <4 x float> %a, i32 2 + %a3 = extractelement <4 x float> %a, i32 3 + %a23 = fadd float %a2, %a3 + call void @use_f32(float %a23) + %v3 = insertelement <4 x float> %b, float %a23, i32 3 + ret <4 x float> %v3 +} + +define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: @PR34724( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[A]], [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP4]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[B]], [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = fadd <4 x float> [[B]], [[TMP3]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[TMP8]], i32 2 +; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> undef, float [[TMP5]], i32 1 +; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[TMP7]], i32 2 +; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[TMP9]], i32 3 +; CHECK-NEXT: ret <4 x float> [[V3]] +; + %a0 = extractelement <4 x float> %a, i32 0 + %a1 = extractelement <4 x float> %a, i32 1 + %a2 = extractelement <4 x float> %a, i32 2 + %a3 = extractelement <4 x float> %a, i32 3 + + %b0 = extractelement <4 x float> %b, i32 0 + %b1 = extractelement <4 x float> %b, i32 1 + %b2 = 
extractelement <4 x float> %b, i32 2 + %b3 = extractelement <4 x float> %b, i32 3 + + %a23 = fadd float %a2, %a3 + %b01 = fadd float %b0, %b1 + %b23 = fadd float %b2, %b3 + + %v1 = insertelement <4 x float> undef, float %a23, i32 1 + %v2 = insertelement <4 x float> %v1, float %b01, i32 2 + %v3 = insertelement <4 x float> %v2, float %b23, i32 3 + ret <4 x float> %v3 +} diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll index 65d886c7bca95..807bb800e9976 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll @@ -151,3 +151,40 @@ define i1 @cmp12_v4i32(<4 x i32> %x, <4 x i32> %y) { %cmp = icmp sgt i32 %x1, %y2 ret i1 %cmp } + +define <4 x i1> @ins_fcmp_ext_ext(<4 x float> %a, <4 x i1> %b) { +; SSE-LABEL: @ins_fcmp_ext_ext( +; SSE-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A:%.*]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 +; SSE-NEXT: [[A21:%.*]] = fcmp ugt float [[A2]], [[A1]] +; SSE-NEXT: [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[A21]], i32 2 +; SSE-NEXT: ret <4 x i1> [[R]] +; +; AVX-LABEL: @ins_fcmp_ext_ext( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = fcmp ugt <4 x float> [[TMP1]], [[A]] +; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i64 1 +; AVX-NEXT: [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[TMP3]], i32 2 +; AVX-NEXT: ret <4 x i1> [[R]] +; + %a1 = extractelement <4 x float> %a, i32 1 + %a2 = extractelement <4 x float> %a, i32 2 + %a21 = fcmp ugt float %a2, %a1 + %r = insertelement <4 x i1> %b, i1 %a21, i32 2 + ret <4 x i1> %r +} + +define <4 x i1> @ins_icmp_ext_ext(<4 x i32> %a, <4 x i1> %b) { +; CHECK-LABEL: @ins_icmp_ext_ext( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <4 x i32> [[A]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP2]], i32 2 +; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[TMP3]], i32 3 +; CHECK-NEXT: ret <4 x i1> [[R]] +; + %a3 = extractelement <4 x i32> %a, i32 3 + %a2 = extractelement <4 x i32> %a, i32 2 + %a23 = icmp ule i32 %a2, %a3 + %r = insertelement <4 x i1> %b, i1 %a23, i32 3 + ret <4 x i1> %r +} diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll new file mode 100644 index 0000000000000..78d628615766f --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX + +define <16 x i8> @bitcast_shuf_narrow_element(<4 x i32> %v) { +; CHECK-LABEL: @bitcast_shuf_narrow_element( +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8> +; CHECK-NEXT: ret <16 x i8> [[R]] +; + %shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> + %r = bitcast <4 x i32> %shuf to <16 x i8> + ret <16 x i8> %r +} + +define <4 x float> @bitcast_shuf_same_size(<4 x i32> %v) { +; CHECK-LABEL: @bitcast_shuf_same_size( +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x 
i32> +; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <4 x float> +; CHECK-NEXT: ret <4 x float> [[R]] +; + %shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> + %r = bitcast <4 x i32> %shuf to <4 x float> + ret <4 x float> %r +} + +define <4 x i32> @bitcast_shuf_wide_element(<8 x i16> %v) { +; CHECK-LABEL: @bitcast_shuf_wide_element( +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <8 x i16> [[V:%.*]], <8 x i16> undef, <8 x i32> +; CHECK-NEXT: [[R:%.*]] = bitcast <8 x i16> [[SHUF]] to <4 x i32> +; CHECK-NEXT: ret <4 x i32> [[R]] +; + %shuf = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> + %r = bitcast <8 x i16> %shuf to <4 x i32> + ret <4 x i32> %r +} + +declare void @use(<4 x i32>) + +define <16 x i8> @bitcast_shuf_uses(<4 x i32> %v) { +; CHECK-LABEL: @bitcast_shuf_uses( +; CHECK-NEXT: [[SHUF:%.*]] = shufflevector <4 x i32> [[V:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: call void @use(<4 x i32> [[SHUF]]) +; CHECK-NEXT: [[R:%.*]] = bitcast <4 x i32> [[SHUF]] to <16 x i8> +; CHECK-NEXT: ret <16 x i8> [[R]] +; + %shuf = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> + call void @use(<4 x i32> %shuf) + %r = bitcast <4 x i32> %shuf to <16 x i8> + ret <16 x i8> %r +} + +define <2 x i64> @PR35454_1(<2 x i64> %v) { +; CHECK-LABEL: @PR35454_1( +; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> +; CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <16 x i8> +; CHECK-NEXT: [[ADD:%.*]] = shl <16 x i8> [[BC1]], +; CHECK-NEXT: [[BC2:%.*]] = bitcast <16 x i8> [[ADD]] to <4 x i32> +; CHECK-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[BC3]] +; + %bc = bitcast <2 x i64> %v to <4 x i32> + %permil = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32> + %bc1 = bitcast <4 x i32> %permil to <16 x i8> + %add = shl <16 x i8> %bc1, + %bc2 = bitcast <16 x i8> %add to <4 x i32> + %permil1 = shufflevector <4 x i32> %bc2, <4 x i32> undef, <4 x i32> + %bc3 = bitcast <4 x i32> %permil1 to <2 x i64> + ret <2 x i64> %bc3 +} + +define <2 x i64> @PR35454_2(<2 x i64> %v) { +; CHECK-LABEL: @PR35454_2( +; CHECK-NEXT: [[BC:%.*]] = bitcast <2 x i64> [[V:%.*]] to <4 x i32> +; CHECK-NEXT: [[PERMIL:%.*]] = shufflevector <4 x i32> [[BC]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BC1:%.*]] = bitcast <4 x i32> [[PERMIL]] to <8 x i16> +; CHECK-NEXT: [[ADD:%.*]] = shl <8 x i16> [[BC1]], +; CHECK-NEXT: [[BC2:%.*]] = bitcast <8 x i16> [[ADD]] to <4 x i32> +; CHECK-NEXT: [[PERMIL1:%.*]] = shufflevector <4 x i32> [[BC2]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[BC3:%.*]] = bitcast <4 x i32> [[PERMIL1]] to <2 x i64> +; CHECK-NEXT: ret <2 x i64> [[BC3]] +; + %bc = bitcast <2 x i64> %v to <4 x i32> + %permil = shufflevector <4 x i32> %bc, <4 x i32> undef, <4 x i32> + %bc1 = bitcast <4 x i32> %permil to <8 x i16> + %add = shl <8 x i16> %bc1, + %bc2 = bitcast <8 x i16> %add to <4 x i32> + %permil1 = shufflevector <4 x i32> %bc2, <4 x i32> undef, <4 x i32> + %bc3 = bitcast <4 x i32> %permil1 to <2 x i64> + ret <2 x i64> %bc3 +} diff --git a/llvm/test/Verifier/bswap.ll b/llvm/test/Verifier/bswap.ll new file mode 100644 index 0000000000000..ad37e79061d5e --- /dev/null +++ b/llvm/test/Verifier/bswap.ll @@ -0,0 +1,53 @@ +; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s +; Check handling of bswap with unsupported sizes. 
+ +declare i8 @llvm.bswap.i8(i8) +declare <2 x i8> @llvm.bswap.v2i8(<2 x i8>) + +declare i12 @llvm.bswap.i12(i12) +declare <2 x i12> @llvm.bswap.v2i12(<2 x i12>) + +declare i18 @llvm.bswap.i18(i18) +declare <2 x i18> @llvm.bswap.v2i18(<2 x i18>) + +define i8 @bswap_i8(i8 %arg) { +; CHECK: bswap must be an even number of bytes +; CHECK-NEXT: %res = call i8 @llvm.bswap.i8(i8 %arg) + %res = call i8 @llvm.bswap.i8(i8 %arg) + ret i8 %res +} + +define <2 x i8> @bswap_v2i8(<2 x i8> %arg) { +; CHECK: bswap must be an even number of bytes +; CHECK-NEXT: %res = call <2 x i8> @llvm.bswap.v2i8(<2 x i8> %arg) + %res = call <2 x i8> @llvm.bswap.v2i8(<2 x i8> %arg) + ret <2 x i8> %res +} + +define i12 @bswap_i12(i12 %arg) { +; CHECK: bswap must be an even number of bytes +; CHECK-NEXT: %res = call i12 @llvm.bswap.i12(i12 %arg) + %res = call i12 @llvm.bswap.i12(i12 %arg) + ret i12 %res +} + +define <2 x i12> @bswap_v2i12(<2 x i12> %arg) { +; CHECK: bswap must be an even number of bytes +; CHECK-NEXT: %res = call <2 x i12> @llvm.bswap.v2i12(<2 x i12> %arg) + %res = call <2 x i12> @llvm.bswap.v2i12(<2 x i12> %arg) + ret <2 x i12> %res +} + +define i18 @bswap_i18(i18 %arg) { +; CHECK: bswap must be an even number of bytes +; CHECK-NEXT: %res = call i18 @llvm.bswap.i18(i18 %arg) + %res = call i18 @llvm.bswap.i18(i18 %arg) + ret i18 %res +} + +define <2 x i18> @bswap_v2i18(<2 x i18> %arg) { +; CHECK: bswap must be an even number of bytes +; CHECK-NEXT: %res = call <2 x i18> @llvm.bswap.v2i18(<2 x i18> %arg) + %res = call <2 x i18> @llvm.bswap.v2i18(<2 x i18> %arg) + ret <2 x i18> %res +} diff --git a/llvm/test/Verifier/vp-intrinsics.ll b/llvm/test/Verifier/vp-intrinsics.ll new file mode 100644 index 0000000000000..0e9f4e01561d9 --- /dev/null +++ b/llvm/test/Verifier/vp-intrinsics.ll @@ -0,0 +1,34 @@ +; RUN: opt --verify %s + +define void @test_vp_int(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) { + %r0 = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r1 = call <8 x i32> @llvm.vp.sub.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r2 = call <8 x i32> @llvm.vp.mul.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r3 = call <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r4 = call <8 x i32> @llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r5 = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r6 = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r7 = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r8 = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r9 = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rA = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rB = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rC = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + ret void +} + +; integer arith +declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x 
i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +; bit arith +declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) diff --git a/llvm/test/tools/dsymutil/X86/Inputs/tail-call.cpp b/llvm/test/tools/dsymutil/X86/Inputs/tail-call.cpp new file mode 100644 index 0000000000000..57e512e7009d9 --- /dev/null +++ b/llvm/test/tools/dsymutil/X86/Inputs/tail-call.cpp @@ -0,0 +1,28 @@ +/* + * This file is used to test dsymutil support for call site entries with tail + * calls (DW_AT_call_pc). + * + * Instructions for regenerating binaries (on Darwin/x86_64): + * + * 1. Copy the source to a top-level directory to work around having absolute + * paths in the symtab's OSO entries. + * + * mkdir -p /Inputs/ && cp tail-call.c /Inputs && cd /Inputs + * + * 2. Compile with call site info enabled. -O2 is used to get tail call + * promotion. + * + * clang -g -O2 tail-call.c -c -o tail-call.macho.x86_64.o + * clang tail-call.macho.x86_64.o -o tail-call.macho.x86_64 + * + * 3. Copy the binaries back into the repo's Inputs directory. You'll need + * -oso-prepend-path=%p to link. + */ + +volatile int x; + +__attribute__((disable_tail_calls, noinline)) void func2() { x++; } + +__attribute__((noinline)) void func1() { func2(); /* tail */ } + +__attribute__((disable_tail_calls)) int main() { func1(); /* regular */ } diff --git a/llvm/test/tools/dsymutil/X86/Inputs/tail-call.macho.x86_64 b/llvm/test/tools/dsymutil/X86/Inputs/tail-call.macho.x86_64 new file mode 100755 index 0000000000000..d6098d0de5e4b Binary files /dev/null and b/llvm/test/tools/dsymutil/X86/Inputs/tail-call.macho.x86_64 differ diff --git a/llvm/test/tools/dsymutil/X86/Inputs/tail-call.macho.x86_64.o b/llvm/test/tools/dsymutil/X86/Inputs/tail-call.macho.x86_64.o new file mode 100644 index 0000000000000..1d5726d12e34c Binary files /dev/null and b/llvm/test/tools/dsymutil/X86/Inputs/tail-call.macho.x86_64.o differ diff --git a/llvm/test/tools/dsymutil/X86/object-prefix-path.test b/llvm/test/tools/dsymutil/X86/object-prefix-path.test new file mode 100644 index 0000000000000..16956e0f94521 --- /dev/null +++ b/llvm/test/tools/dsymutil/X86/object-prefix-path.test @@ -0,0 +1,11 @@ +RUN: rm -rf %t.dir && mkdir %t.dir && mkdir %t.dir/ModuleCacheRenamed +RUN: cp %p/../Inputs/module-warnings/1.o %t.dir +RUN: cp %p/../Inputs/module-warnings/Foo.pcm %t.dir/ModuleCacheRenamed + +RUN: dsymutil -verify -f -oso-prepend-path=%t.dir -y \ +RUN: %p/dummy-debug-map.map -o %t \ +RUN: -object-prefix-map=/ModuleCache=/ModuleCacheRenamed \ +RUN: 2>&1 | FileCheck %s + +CHECK: warning: {{.*}}Bar.pcm: +CHECK-NOT: warning: {{.*}}Foo.pcm: diff --git a/llvm/test/tools/dsymutil/X86/tail-call-linking.test b/llvm/test/tools/dsymutil/X86/tail-call-linking.test new file mode 100644 index 0000000000000..29ae2cc544cf6 --- /dev/null +++ b/llvm/test/tools/dsymutil/X86/tail-call-linking.test @@ -0,0 +1,4 @@ +RUN: dsymutil -oso-prepend-path=%p %p/Inputs/tail-call.macho.x86_64 -o %t.dSYM +RUN: llvm-dwarfdump %t.dSYM | FileCheck %s -implicit-check-not=DW_AT_call_pc + +CHECK: DW_AT_call_pc (0x0000000100000f95) diff --git 
a/llvm/test/tools/dsymutil/cmdline.test b/llvm/test/tools/dsymutil/cmdline.test index fc3f00b369fde..701de29637dd8 100644 --- a/llvm/test/tools/dsymutil/cmdline.test +++ b/llvm/test/tools/dsymutil/cmdline.test @@ -12,6 +12,7 @@ HELP: -no-odr HELP: -no-output HELP: -no-swiftmodule-timestamp HELP: -num-threads +HELP: -object-prefix-map HELP: -oso-prepend-path HELP: -o HELP: -papertrail diff --git a/llvm/test/tools/llvm-ar/lto-kind-from-triple.test b/llvm/test/tools/llvm-ar/lto-kind-from-triple.test new file mode 100644 index 0000000000000..92b194c6abf10 --- /dev/null +++ b/llvm/test/tools/llvm-ar/lto-kind-from-triple.test @@ -0,0 +1,26 @@ +## Ensure that we generate a GNU style archive if the first input is a bitcode +## file with a GNU target triple (absence of __.SYMDEF in the archive). + +# RUN: echo -e 'target triple = "x86_64-unknown-linux-gnu" \n define void @_Z3foov() { ret void }' > %t.gnu.ll +# RUN: llvm-as -o %t.gnu.o %t.gnu.ll + +# RUN: rm -f %t.ar +# RUN: llvm-ar crs %t.ar %t.gnu.o +# RUN: not grep -q __.SYMDEF %t.ar + +## Ensure that we generate a MachO style archive if the first input is a +## bitcode file with a MachO target triple (presence of __.SYMDEF in the +## archive). + +# RUN: echo -e 'target triple = "x86_64-apple-macosx10.9" \n define void @_Z3foov() { ret void }' > %t.macho.ll +# RUN: llvm-as -o %t.macho.o %t.macho.ll + +# RUN: rm -f %t.ar +# RUN: llvm-ar crs %t.ar %t.macho.o +# RUN: grep -q __.SYMDEF %t.ar + +## Verify that archive format is based on the first input's target triple. + +# RUN: rm -f %t.ar +# RUN: llvm-ar crs %t.ar %t.gnu.o %t.macho.o +# RUN: not grep -q __.SYMDEF %t.ar diff --git a/llvm/test/tools/llvm-dwarfdump/X86/debug_line_many_files_v5.s b/llvm/test/tools/llvm-dwarfdump/X86/debug_line_many_files_v5.s new file mode 100644 index 0000000000000..280ec6df1f343 --- /dev/null +++ b/llvm/test/tools/llvm-dwarfdump/X86/debug_line_many_files_v5.s @@ -0,0 +1,67 @@ +## An object with many files and directories in a single debug_line contribution +## meant to test the handling of directory_count and file_name_count fields. 
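The assembly that follows hand-encodes a DWARF v5 line-table prologue with 300 directories and 300 file entries, so the interesting fields are the ULEB128-encoded directory_count and file_name_count. As a worked aside, here is a minimal ULEB128 decoder sketch (an illustration of the encoding, not LLVM's DataExtractor API):

    #include <cstddef>
    #include <cstdint>

    // Each ULEB128 byte carries 7 payload bits; the high bit says whether
    // another byte follows. The ".uleb128 300" directives below encode as the
    // two bytes 0xAC 0x02: 0x2C + (0x02 << 7) == 300.
    static uint64_t decodeULEB128(const uint8_t *Buf, size_t &Offset) {
      uint64_t Value = 0;
      unsigned Shift = 0;
      uint8_t Byte;
      do {
        Byte = Buf[Offset++];
        Value |= uint64_t(Byte & 0x7f) << Shift;
        Shift += 7;
      } while (Byte & 0x80);
      return Value;
    }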
+ +# RUN: llvm-mc -triple x86_64-pc-linux -filetype=obj %s -o %t +# RUN: llvm-dwarfdump -debug-line %t | FileCheck %s + +# CHECK: include_directories[ 0] = "/d000" +# CHECK: include_directories[299] = "/d299" +# CHECK: file_names[ 0]: +# CHECK-NEXT: name: "000.c" +# CHECK-NEXT: dir_index: 0 +# CHECK: file_names[299]: +# CHECK-NEXT: name: "299.c" +# CHECK-NEXT: dir_index: 299 + +.section .debug_line,"",@progbits +.long .Lunit_end0-.Lunit_start0 # Length of Unit +.Lunit_start0: +.short 5 # DWARF version number +.byte 8 # Address Size +.byte 0 # Segment Selector Size +.long .Lunit_header_end0 - .Lunit_params0 # Length of Prologue (invalid) +.Lunit_params0: +.byte 1 # Minimum Instruction Length +.byte 1 # Maximum Operations per Instruction +.byte 1 # Default is_stmt +.byte -5 # Line Base +.byte 14 # Line Range +.byte 13 # Opcode Base +.byte 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 # Standard Opcode Lengths + +# Directory table format +.byte 1 # One element per directory entry +.byte 1 # DW_LNCT_path +.byte 0x08 # DW_FORM_string + +# Directory table entries +.uleb128 300 # 300 directories +.irpc a,012 +.irpc b,0123456789 +.irpc c,0123456789 +.byte '/', 'd', '0'+\a, '0'+\b, '0'+\c, 0 +.endr +.endr +.endr + +# File table format +.byte 2 # 2 elements per file entry +.byte 1 # DW_LNCT_path +.byte 0x08 # DW_FORM_string +.byte 2 # DW_LNCT_directory_index +.byte 0x05 # DW_FORM_data2 + +# File table entries +.uleb128 300 # 300 files +.irpc a,012 +.irpc b,0123456789 +.irpc c,0123456789 +.byte '0'+\a, '0'+\b, '0'+\c, '.', 'c', 0 # File name +.word \a*100+\b*10+\c # Dir index +.endr +.endr +.endr + +.Lunit_header_end0: +.byte 0, 1, 1 # DW_LNE_end_sequence +.Lunit_end0: diff --git a/llvm/test/tools/llvm-dwp/X86/duplicate.test b/llvm/test/tools/llvm-dwp/X86/duplicate.test index 43266a24b6014..de5f1fdd4231f 100644 --- a/llvm/test/tools/llvm-dwp/X86/duplicate.test +++ b/llvm/test/tools/llvm-dwp/X86/duplicate.test @@ -18,10 +18,10 @@ RUN: | FileCheck --check-prefix=DWO1DWP %s Build from a, b, and c.c all containing a single void() func by the name of the file. 
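The checks that follow verify the wording of the duplicate-DWO-ID diagnostics: each split-DWARF unit carries a 64-bit DWO ID, and merging two units with the same ID into one .dwp is an error. A minimal sketch of that kind of bookkeeping (illustrative only; the helper name is made up and this is not the llvm-dwp implementation):

    #include <cstdint>
    #include <map>
    #include <string>

    // Remember which unit first claimed each DWO ID; a second claimant is a
    // duplicate and produces an error naming both units, as checked below.
    static bool recordDwoId(std::map<uint64_t, std::string> &Seen, uint64_t Id,
                            const std::string &Unit, std::string &Err) {
      auto Ins = Seen.emplace(Id, Unit);
      if (!Ins.second) {
        Err = "duplicate DWO ID in '" + Ins.first->second + "' and '" + Unit + "'";
        return false;
      }
      return true;
    }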
-DWOS: error: Duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c'{{$}} -1DWP: error: Duplicate DWO ID ({{.*}}) in 'c.c' (from '{{.*}}ac.dwp') and 'c.c'{{$}} -2DWP: error: Duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c' (from '{{.*}}bc.dwp'){{$}} +DWOS: error: duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c'{{$}} +1DWP: error: duplicate DWO ID ({{.*}}) in 'c.c' (from '{{.*}}ac.dwp') and 'c.c'{{$}} +2DWP: error: duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c' (from '{{.*}}bc.dwp'){{$}} -DWODWOS: error: Duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c'{{$}} -DWO1DWP: error: Duplicate DWO ID ({{.*}}) in 'c.c' (from 'c.dwo' in '{{.*}}ac.dwp') and 'c.c'{{$}} -DWO2DWP: error: Duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c' (from 'c.dwo' in '{{.*}}bc.dwp'){{$}} +DWODWOS: error: duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c'{{$}} +DWO1DWP: error: duplicate DWO ID ({{.*}}) in 'c.c' (from 'c.dwo' in '{{.*}}ac.dwp') and 'c.c'{{$}} +DWO2DWP: error: duplicate DWO ID ({{.*}}) in 'c.c' and 'c.c' (from 'c.dwo' in '{{.*}}bc.dwp'){{$}} diff --git a/llvm/test/tools/llvm-dwp/X86/gcc_type.test b/llvm/test/tools/llvm-dwp/X86/gcc_type.test index 7bb1a64cd2df0..eb8f2ba9fd374 100644 --- a/llvm/test/tools/llvm-dwp/X86/gcc_type.test +++ b/llvm/test/tools/llvm-dwp/X86/gcc_type.test @@ -5,4 +5,4 @@ CHECK: Type Unit CHECK: Type Unit // Check that llvm-dwp can parse DW_FORM_string for CU name -DUP: Duplicate DWO ID ({{.*}}) in 'a.cpp' and 'a.cpp' +DUP: duplicate DWO ID ({{.*}}) in 'a.cpp' and 'a.cpp' diff --git a/llvm/test/tools/llvm-dwp/X86/invalid_cu_index.test b/llvm/test/tools/llvm-dwp/X86/invalid_cu_index.test index 6fc96b2a1a301..92c9a12cb91e4 100644 --- a/llvm/test/tools/llvm-dwp/X86/invalid_cu_index.test +++ b/llvm/test/tools/llvm-dwp/X86/invalid_cu_index.test @@ -1,3 +1,3 @@ RUN: not llvm-dwp %p/../Inputs/invalid_cu_index/x.dwp -o %t 2>&1 | FileCheck %s -CHECK: error: Failed to parse cu_index +CHECK: error: failed to parse cu_index diff --git a/llvm/test/tools/llvm-dwp/X86/missing_tu_index.test b/llvm/test/tools/llvm-dwp/X86/missing_tu_index.test index 99f5253e8b323..b84ed7c0765c3 100644 --- a/llvm/test/tools/llvm-dwp/X86/missing_tu_index.test +++ b/llvm/test/tools/llvm-dwp/X86/missing_tu_index.test @@ -1,3 +1,3 @@ RUN: not llvm-dwp %p/../Inputs/missing_tu_index/x.dwp -o %t 2>&1 | FileCheck %s -CHECK: error: Failed to parse tu_index +CHECK: error: failed to parse tu_index diff --git a/llvm/test/tools/llvm-locstats/locstats.ll b/llvm/test/tools/llvm-locstats/locstats.ll index f16635d2e8e42..fd28679f3ec13 100644 --- a/llvm/test/tools/llvm-locstats/locstats.ll +++ b/llvm/test/tools/llvm-locstats/locstats.ll @@ -9,9 +9,9 @@ ; LOCSTATS: [10%,20%) 0 0% ; LOCSTATS: [20%,30%) 1 11% ; LOCSTATS: [30%,40%) 0 0% -; LOCSTATS: [40%,50%) 1 11% -; LOCSTATS: [50%,60%) 1 11% -; LOCSTATS: [60%,70%) 1 11% +; LOCSTATS: [40%,50%) 0 0% +; LOCSTATS: [50%,60%) 0 0% +; LOCSTATS: [60%,70%) 3 33% ; LOCSTATS: [70%,80%) 0 0% ; LOCSTATS: [80%,90%) 2 22% ; LOCSTATS: [90%,100%) 1 11% diff --git a/llvm/test/tools/llvm-objcopy/ELF/Inputs/partitions.elf.test b/llvm/test/tools/llvm-objcopy/ELF/Inputs/partitions.elf.test index 179673848161a..16be495db9d41 100644 --- a/llvm/test/tools/llvm-objcopy/ELF/Inputs/partitions.elf.test +++ b/llvm/test/tools/llvm-objcopy/ELF/Inputs/partitions.elf.test @@ -1,6 +1,5 @@ // partitions.elf was generated by running this test in lld: -// REQUIRES: x86 // RUN: llvm-mc %s -o %t.o -filetype=obj --triple=x86_64-unknown-linux // RUN: ld.lld %t.o -o %t --export-dynamic --gc-sections diff --git a/llvm/test/tools/llvm-objcopy/tool-name.test 
b/llvm/test/tools/llvm-objcopy/tool-name.test new file mode 100644 index 0000000000000..a273375f109e8 --- /dev/null +++ b/llvm/test/tools/llvm-objcopy/tool-name.test @@ -0,0 +1,33 @@ +## Don't make symlinks on Windows. +# UNSUPPORTED: system-windows + +# RUN: rm -rf %t +# RUN: mkdir %t + +# RUN: ln -s llvm-objcopy %t/llvm-objcopy-11.exe +# RUN: ln -s llvm-objcopy %t/powerpc64-unknown-freebsd13-objcopy + +# RUN: llvm-objcopy --help | FileCheck --check-prefix=OBJCOPY %s +# RUN: %t/llvm-objcopy-11.exe --help | FileCheck --check-prefix=OBJCOPY %s +# RUN: %t/powerpc64-unknown-freebsd13-objcopy --help | FileCheck --check-prefix=OBJCOPY %s + +# OBJCOPY: OVERVIEW: llvm-objcopy tool + +# RUN: ln -s llvm-strip %t/strip.exe +# RUN: ln -s llvm-strip %t/gnu-llvm-strip-10 + +# RUN: llvm-strip --help | FileCheck --check-prefix=STRIP %s +# RUN: %t/strip.exe --help | FileCheck --check-prefix=STRIP %s +# RUN: %t/gnu-llvm-strip-10 --help | FileCheck --check-prefix=STRIP %s + +# STRIP: OVERVIEW: llvm-strip tool + +## This driver emulates install_name_tool on macOS. +# RUN: ln -s llvm-install-name-tool %t/llvm-install-name-tool-10 +# RUN: ln -s llvm-install-name-tool %t/install_name_tool.exe + +# RUN: llvm-install-name-tool --help | FileCheck --check-prefix=INSTALL %s +# RUN: %t/llvm-install-name-tool-10 --help | FileCheck --check-prefix=INSTALL %s +# RUN: %t/install_name_tool.exe --help | FileCheck --check-prefix=INSTALL %s + +# INSTALL: OVERVIEW: llvm-install-name-tool tool diff --git a/llvm/test/tools/llvm-objdump/ELF/PowerPC/branch-offset.s b/llvm/test/tools/llvm-objdump/ELF/PowerPC/branch-offset.s index 38588a3774cb1..e78e4c16eab55 100644 --- a/llvm/test/tools/llvm-objdump/ELF/PowerPC/branch-offset.s +++ b/llvm/test/tools/llvm-objdump/ELF/PowerPC/branch-offset.s @@ -1,43 +1,35 @@ -# RUN: llvm-mc -triple=powerpc64le-unknown-linux -filetype=obj %s -o %t.o -# RUN: llvm-objdump -d %t.o | FileCheck %s - -# RUN: llvm-mc -triple=powerpc64-unknown-linux -filetype=obj %s -o %t.o -# RUN: llvm-objdump -d %t.o | FileCheck %s - -# RUN: llvm-mc -triple=powerpc-unknown-linux -filetype=obj %s -o %t.o -# RUN: llvm-objdump -d %t.o | FileCheck %s - -# CHECK: {{0*}}00000000 : -# CHECK: 18: {{.*}} bl .-24 -# CHECK: 20: {{.*}} bl .+16 -# CHECK: {{0*}}00000030 : - - .text - .global caller - .type caller,@function - .type callee_forward,@function - .type callee_back,@function - - .p2align 4 -callee_back: - li 3, 55 - blr - - .p2align 4 -caller: -.Lgep: - addis 2, 12, .TOC.-.Lgep@ha - addi 2, 2, .TOC.-.Lgep@l -.Llep: - .localentry caller, .Llep-.Lgep - bl callee_back - mr 31, 3 - bl callee_forward - add 3, 3, 31 - blr - - .p2align 4 -callee_forward: - li 3, 66 - blr +# RUN: llvm-mc -triple=powerpc -filetype=obj %s -o %t.32.o +# RUN: llvm-objdump -d --no-show-raw-insn %t.32.o | FileCheck --check-prefixes=ELF32,CHECK %s +# RUN: llvm-mc -triple=powerpc64le -filetype=obj %s -o %t.64.o +# RUN: llvm-objdump -d --no-show-raw-insn %t.64.o | FileCheck --check-prefixes=ELF64,CHECK %s + +# RUN: llvm-mc -triple=powerpc64 -filetype=obj %s -o %t.64.o +# RUN: llvm-objdump -d --no-show-raw-insn %t.64.o | FileCheck --check-prefixes=ELF64,CHECK %s + +# CHECK-LABEL: : +# ELF32-NEXT: bl .-4 +# ELF64-NEXT: bl .-4 +# CHECK-NEXT: bl .+0 +# CHECK-NEXT: bl .+4 + +bl: + bl .-4 + bl . + bl .+4 + +# CHECK-LABEL: : +# CHECK-NEXT: b .+67108860 +# CHECK-NEXT: b .+0 +# CHECK-NEXT: b .+4 + +b: + b .-4 + b . 
+ b .+4 + +# CHECK-LABEL: : +# CHECK-NEXT: bt 2, .+65532 + +bt: + bt 2, .-4 diff --git a/llvm/test/tools/llvm-readobj/ELF/all.test b/llvm/test/tools/llvm-readobj/ELF/all.test index 39783613e789c..501c480f23df8 100644 --- a/llvm/test/tools/llvm-readobj/ELF/all.test +++ b/llvm/test/tools/llvm-readobj/ELF/all.test @@ -112,4 +112,5 @@ ProgramHeaders: Sections: - Section: .note.gnu.build-id Symbols: [] -DynamicSymbols: [] +DynamicSymbols: + - Name: foo diff --git a/llvm/test/tools/llvm-readobj/ELF/dyn-symbols-size-from-hash-table.test b/llvm/test/tools/llvm-readobj/ELF/dyn-symbols-size-from-hash-table.test new file mode 100644 index 0000000000000..80cb8e3e9fa4c --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/dyn-symbols-size-from-hash-table.test @@ -0,0 +1,314 @@ +## This test shows how llvm-readobj uses the hash table section to derive the +## size of a dynamic symbol table. This allows dumping of the dynamic symbol +## table in the event of an object without section headers. + +## Case 1a) Table size is derived from hash table, with DT_SYMTAB before DT_HASH. +# RUN: yaml2obj --docnum=1 %s -o %t1a-64 -DBITS=64 \ +# RUN: -DTAG1=DT_SYMTAB -DTAG2=DT_HASH -DVAL1=0x400 -DVAL2=0x600 +# RUN: llvm-strip --strip-sections %t1a-64 +# RUN: llvm-readobj --dyn-symbols %t1a-64 2>&1 | \ +# RUN: FileCheck %s --check-prefixes=LLVM1,STRIP --implicit-check-not=warning: +# RUN: llvm-readelf --dyn-symbols %t1a-64 2>&1 | \ +# RUN: FileCheck %s --check-prefixes=GNU1,GNU1-STRIP --implicit-check-not=warning: +# RUN: yaml2obj --docnum=1 %s -o %t1a-32 -DBITS=32 \ +# RUN: -DTAG1=DT_SYMTAB -DTAG2=DT_HASH -DVAL1=0x400 -DVAL2=0x600 +# RUN: llvm-strip --strip-sections %t1a-32 +# RUN: llvm-readobj --dyn-symbols %t1a-32 2>&1 | \ +# RUN: FileCheck %s --check-prefixes=LLVM1,STRIP --implicit-check-not=warning: +# RUN: llvm-readelf --dyn-symbols %t1a-32 2>&1 | \ +# RUN: FileCheck %s --check-prefixes=GNU1,GNU1-STRIP --implicit-check-not=warning: + +## 1b) Table size is derived from hash table, with DT_HASH before DT_SYMTAB. +## We don't bother testing 32 and 64-bit here. The above cases show that reading +## the nchain value is correct for all formats, and other tests show the basic +## printing behaviour. 
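Cases 1a and 1b rely on the size derivation this patch adds to ELFDumper.cpp: when section headers have been stripped, the number of dynamic symbols is taken from the DT_HASH table, whose nchain field equals the symbol count per the ELF gABI. A minimal sketch of that arithmetic (field and function names are illustrative):

    #include <cstdint>

    // With no section headers, the dynamic symbol table size is derived from
    // the DT_HASH nchain value. For the tests above nchain == 3 (the null
    // symbol, foo and bar), so the region is 3 * sizeof(Elf64_Sym) == 72 bytes.
    static uint64_t dynSymTableBytes(uint64_t NChain, uint64_t SymEntSize) {
      return NChain * SymEntSize;
    }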
+# RUN: yaml2obj --docnum=1 %s -o %t1b-64 -DBITS=64 \ +# RUN: -DTAG1=DT_HASH -DTAG2=DT_SYMTAB -DVAL1=0x600 -DVAL2=0x400 +# RUN: llvm-strip --strip-sections %t1b-64 +# RUN: llvm-readobj --dyn-symbols %t1b-64 2>&1 | \ +# RUN: FileCheck %s --check-prefixes=LLVM1,STRIP --implicit-check-not=warning: +# RUN: llvm-readelf --dyn-symbols %t1b-64 2>&1 | \ +# RUN: FileCheck %s --check-prefixes=GNU1,GNU1-STRIP --implicit-check-not=warning: + +# LLVM1: DynamicSymbols [ +# LLVM1-NEXT: Symbol { +# LLVM1-NEXT: Name: (0) +# LLVM1-NEXT: Value: 0x0 +# LLVM1-NEXT: Size: 0 +# LLVM1-NEXT: Binding: Local (0x0) +# LLVM1-NEXT: Type: None (0x0) +# LLVM1-NEXT: Other: 0 +# LLVM1-NEXT: Section: Undefined (0x0) +# LLVM1-NEXT: } +# LLVM1-NEXT: Symbol { +# LLVM1-NEXT: Name: foo (5) +# LLVM1-NEXT: Value: 0x100 +# LLVM1-NEXT: Size: 0 +# LLVM1-NEXT: Binding: Local (0x0) +# LLVM1-NEXT: Type: Function (0x2) +# LLVM1-NEXT: Other: 0 +# STRIP-NEXT: Section: (0x1) +# NOSTRIP-NEXT: Section: .text (0x1) +# LLVM1-NEXT: } +# LLVM1-NEXT: Symbol { +# LLVM1-NEXT: Name: bar (1) +# LLVM1-NEXT: Value: 0x200 +# LLVM1-NEXT: Size: 0 +# LLVM1-NEXT: Binding: Local (0x0) +# LLVM1-NEXT: Type: Object (0x1) +# LLVM1-NEXT: Other: 0 +# STRIP-NEXT: Section: (0x2) +# NOSTRIP-NEXT: Section: .data (0x2) +# LLVM1-NEXT: } +# LLVM1-NEXT: ] + +# GNU1-STRIP: Symbol table for image contains 3 entries: +# GNU1-NOSTRIP: Symbol table '.dynsym' contains 3 entries: +# GNU1-NEXT: Num: Value Size Type Bind Vis Ndx Name +# GNU1-NEXT: 0: {{0*}}00000000 0 NOTYPE LOCAL DEFAULT UND +# GNU1-NEXT: 1: {{0*}}00000100 0 FUNC LOCAL DEFAULT 1 foo +# GNU1-NEXT: 2: {{0*}}00000200 0 OBJECT LOCAL DEFAULT 2 bar +# GNU1-EMPTY: + +--- !ELF +FileHeader: + Class: ELFCLASS[[BITS]] + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + - Name: .data + Type: SHT_PROGBITS + - Name: .dynsym + Type: SHT_DYNSYM + Flags: [ SHF_ALLOC ] + Address: 0x400 + AddressAlign: 0x400 + - Name: .hash + Type: SHT_HASH + Flags: [ SHF_ALLOC ] + Address: 0x600 + AddressAlign: 0x200 + Bucket: [ 1 ] + Chain: [ 1, 2, 3 ] + - Name: .dynstr + Type: SHT_STRTAB + Flags: [ SHF_ALLOC ] + Address: 0x800 + AddressAlign: 0x200 + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_ALLOC ] + Address: 0xA00 + AddressAlign: 0x200 + Entries: + - Tag: DT_STRTAB + Value: 0x800 + - Tag: DT_STRSZ + Value: 9 + - Tag: [[TAG1]] + Value: [[VAL1]] + - Tag: [[TAG2]] + Value: [[VAL2]] + - Tag: DT_NULL + Value: 0 +DynamicSymbols: + - Name: foo + Type: STT_FUNC + Section: .text + Value: 0x100 + - Name: bar + Type: STT_OBJECT + Section: .data + Value: 0x200 +ProgramHeaders: + - Type: PT_LOAD + VAddr: 0 + Sections: + - Section: .text + - Section: .data + - Type: PT_LOAD + VAddr: 0x400 + Sections: + - Section: .dynsym + - Section: .hash + - Section: .dynstr + - Section: .dynamic + - Type: PT_DYNAMIC + VAddr: 0xA00 + Sections: + - Section: .dynamic + +## Case 2: Table size from DT_HASH does not match size from section header. 
+# RUN: yaml2obj --docnum=2 %s -o %t2-smaller -DCHAIN="[1, 2]" +# RUN: llvm-readobj --dyn-symbols %t2-smaller 2>&1 | \ +# RUN: FileCheck %s --check-prefixes=LLVM2,WARN \ +# RUN: --implicit-check-not=warning: -DNCHAIN=2 +# RUN: llvm-readelf --dyn-symbols %t2-smaller 2>&1 | \ +# RUN: FileCheck %s --check-prefixes=GNU2,WARN \ +# RUN: --implicit-check-not=warning: -DNCHAIN=2 + +# RUN: yaml2obj --docnum=2 %s -o %t2-larger -DCHAIN="[1, 2, 3, 4]" +# RUN: llvm-readobj --dyn-symbols %t2-larger 2>&1 | \ +# RUN: FileCheck %s --check-prefixes=LLVM2,LLVM2-MORE,LLVM2-ALL,WARN \ +# RUN: --implicit-check-not=warning: -DNCHAIN=4 +# RUN: llvm-readelf --dyn-symbols %t2-larger 2>&1 | \ +# RUN: FileCheck %s --check-prefixes=GNU2,GNU2-MORE,GNU2-ALL,WARN \ +# RUN: --implicit-check-not=warning: -DNCHAIN=4 + +## Show there's no warning if the sizes match +# RUN: yaml2obj --docnum=2 %s -o %t2-same -DCHAIN="[1, 2, 3]" +# RUN: llvm-readobj --dyn-symbols %t2-same 2>&1 | \ +# RUN: FileCheck %s --check-prefixes=LLVM2,LLVM2-MORE --implicit-check-not=warning: +# RUN: llvm-readelf --dyn-symbols %t2-same 2>&1 | \ +# RUN: FileCheck %s --check-prefixes=GNU2,GNU2-MORE \ +# RUN: --implicit-check-not=warning: -DNCHAIN=3 + +# WARN: warning: '{{.*}}2-{{.*}}': hash table nchain ([[NCHAIN]]) differs from symbol count derived from SHT_DYNSYM section header (3) + +# LLVM2: DynamicSymbols [ +# LLVM2-NEXT: Symbol { +# LLVM2-NEXT: Name: (0) +# LLVM2-NEXT: Value: 0x0 +# LLVM2-NEXT: Size: 0 +# LLVM2-NEXT: Binding: Local (0x0) +# LLVM2-NEXT: Type: None (0x0) +# LLVM2-NEXT: Other: 0 +# LLVM2-NEXT: Section: Undefined (0x0) +# LLVM2-NEXT: } +# LLVM2-NEXT: Symbol { +# LLVM2-NEXT: Name: foo (9) +# LLVM2-NEXT: Value: 0x100 +# LLVM2-NEXT: Size: 0 +# LLVM2-NEXT: Binding: Local (0x0) +# LLVM2-NEXT: Type: Function (0x2) +# LLVM2-NEXT: Other: 0 +# LLVM2-NEXT: Section: .text (0x1) +# LLVM2-NEXT: } +# LLVM2-MORE-NEXT: Symbol { +# LLVM2-MORE-NEXT: Name: bar (5) +# LLVM2-MORE-NEXT: Value: 0x200 +# LLVM2-MORE-NEXT: Size: 0 +# LLVM2-MORE-NEXT: Binding: Local (0x0) +# LLVM2-MORE-NEXT: Type: Object (0x1) +# LLVM2-MORE-NEXT: Other: 0 +# LLVM2-MORE-NEXT: Section: .data (0x2) +# LLVM2-MORE-NEXT: } +# LLVM2-ALL-NEXT: Symbol { +# LLVM2-ALL-NEXT: Name: baz (1) +# LLVM2-ALL-NEXT: Value: 0x300 +# LLVM2-ALL-NEXT: Size: 0 +# LLVM2-ALL-NEXT: Binding: Local (0x0) +# LLVM2-ALL-NEXT: Type: Object (0x1) +# LLVM2-ALL-NEXT: Other: 0 +# LLVM2-ALL-NEXT: Section: .data (0x2) +# LLVM2-ALL-NEXT: } +# LLVM2-NEXT: ] + +# GNU2: Symbol table '.dynsym' contains [[NCHAIN]] entries: +# GNU2-NEXT: Num: Value Size Type Bind Vis Ndx Name +# GNU2-NEXT: 0: {{0*}}00000000 0 NOTYPE LOCAL DEFAULT UND +# GNU2-NEXT: 1: {{0*}}00000100 0 FUNC LOCAL DEFAULT 1 foo +# GNU2-MORE-NEXT: 2: {{0*}}00000200 0 OBJECT LOCAL DEFAULT 2 bar +# GNU2-ALL-NEXT: 3: {{0*}}00000300 0 OBJECT LOCAL DEFAULT 2 baz +# GNU2-EMPTY: + +## In this YAML, we define 4 dynamic symbols (including the null symbol), but +## constrain the .dynsym section header to say there are only 3. This means that +## when a size of 4 is derived from the hash table, we still have a valid symbol +## to dump. 
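Concretely, the constrained header gives sh_size / sh_entsize == 0x48 / 0x18 == 3 symbols, while the hash chain in the -smaller and -larger variants implies 2 or 4, which is what triggers the warning checked above. A small sketch of that consistency check (illustrative, not the ELFDumper code):

    #include <cstdint>

    // The symbol count implied by the SHT_DYNSYM header should match the
    // DT_HASH nchain value; when it does not, a warning is emitted and the
    // nchain-derived size wins. E.g. 0x48 / 0x18 == 3, but nchain == 4.
    static bool nchainMatchesHeader(uint64_t NChain, uint64_t ShSize,
                                    uint64_t ShEntSize) {
      return ShEntSize != 0 && NChain == ShSize / ShEntSize;
    }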
+--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .text + Type: SHT_PROGBITS + - Name: .data + Type: SHT_PROGBITS + - Name: .dynsym + Type: SHT_DYNSYM + Flags: [ SHF_ALLOC ] + ShSize: 0x48 + Address: 0x400 + AddressAlign: 0x400 + - Name: .hash + Type: SHT_HASH + Flags: [ SHF_ALLOC ] + Address: 0x600 + AddressAlign: 0x200 + Bucket: [ 1 ] + Chain: [[CHAIN]] + - Name: .dynstr + Type: SHT_STRTAB + Flags: [ SHF_ALLOC ] + Address: 0x800 + AddressAlign: 0x200 + - Name: .dynamic + Type: SHT_DYNAMIC + Flags: [ SHF_ALLOC ] + Address: 0xA00 + AddressAlign: 0x200 + Entries: + - Tag: DT_SYMTAB + Value: 0x400 + - Tag: DT_HASH + Value: 0x600 + - Tag: DT_STRTAB + Value: 0x800 + - Tag: DT_STRSZ + Value: 13 + - Tag: DT_NULL + Value: 0 +DynamicSymbols: + - Name: foo + Type: STT_FUNC + Section: .text + Value: 0x100 + - Name: bar + Type: STT_OBJECT + Section: .data + Value: 0x200 + - Name: baz + Type: STT_OBJECT + Section: .data + Value: 0x300 +ProgramHeaders: + - Type: PT_LOAD + VAddr: 0 + Sections: + - Section: .text + - Section: .data + - Type: PT_LOAD + VAddr: 0x400 + Sections: + - Section: .dynsym + - Section: .hash + - Section: .dynstr + - Section: .dynamic + - Type: PT_DYNAMIC + VAddr: 0xA00 + Sections: + - Section: .dynamic + +## Case 3: DT_HASH is missing. +## Show that no warning occurs if there are section headers. +# RUN: yaml2obj --docnum=1 %s -o %t3 -DTAG1=DT_SYMTAB -DVAL1=0x400 -DTAG2=DT_NULL -DVAL2=0 -DBITS=64 +# RUN: llvm-readobj --dyn-symbols %t3 2>&1 | \ +# RUN: FileCheck %s --check-prefixes=LLVM1,NOSTRIP --implicit-check-not=warning: +# RUN: llvm-readelf --dyn-symbols %t3 2>&1 | \ +# RUN: FileCheck %s --check-prefixes=GNU1,GNU1-NOSTRIP --implicit-check-not=warning: + +## Show that size is treated as zero, if no section headers are present. +# RUN: llvm-strip --strip-sections %t3 +# RUN: llvm-readobj --dyn-symbols %t3 2>&1 | \ +# RUN: FileCheck %s --check-prefix=LLVM3 --implicit-check-not=warning: +# RUN: llvm-readelf --dyn-symbols %t3 2>&1 | \ +# RUN: FileCheck %s --implicit-check-not={{.}} --allow-empty + +# LLVM3: DynamicSymbols [ +# LLVM3: ] diff --git a/llvm/test/tools/llvm-readobj/ELF/reloc-addends.test b/llvm/test/tools/llvm-readobj/ELF/reloc-addends.test new file mode 100644 index 0000000000000..d64d2935c1621 --- /dev/null +++ b/llvm/test/tools/llvm-readobj/ELF/reloc-addends.test @@ -0,0 +1,155 @@ +## Check how llvm-readobj and llvm-readelf tools dump addends of relocations. + +# RUN: yaml2obj --docnum=1 -DENCODE=LSB -DTYPE=SHT_RELA %s -o %t.le64.rela +# RUN: llvm-readobj -r %t.le64.rela | FileCheck %s --check-prefix=LLVM-RELA64 +# RUN: llvm-readelf -r %t.le64.rela | FileCheck %s --check-prefix=GNU-RELA64 + +# LLVM-RELA64: R_X86_64_NONE - 0x0{{$}} +# LLVM-RELA64-NEXT: R_X86_64_NONE - 0x1{{$}} +# LLVM-RELA64-NEXT: R_X86_64_NONE - 0xFFFFFFFFFFFFFFFF{{$}} +# LLVM-RELA64-NEXT: R_X86_64_NONE - 0x7FFFFFFFFFFFFFFF{{$}} +# LLVM-RELA64-NEXT: R_X86_64_NONE - 0x8000000000000000{{$}} +# LLVM-RELA64-NEXT: R_X86_64_NONE - 0xFFFFFFFFFFFFCFC7{{$}} +# LLVM-RELA64-NEXT: R_X86_64_NONE - 0x12345{{$}} + +## FIXME: GNU readelf prints addends differently. +## See https://bugs.llvm.org/show_bug.cgi?id=45235. 
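The LLVM-style checks below print each addend as its full-width two's-complement value in hex, so -12345 appears as 0xFFFFFFFFFFFFCFC7, while GNU readelf formats addends differently (the FIXME above). A small worked example of that representation (a standalone illustration, not llvm-readobj's printing code):

    #include <cstdint>
    #include <cstdio>

    // Reinterpreting the signed addend as an unsigned 64-bit value gives the
    // strings the LLVM-RELA64 checks expect.
    int main() {
      int64_t Addends[] = {-12345, 0x12345};
      for (int64_t A : Addends)
        std::printf("0x%llX\n", (unsigned long long)(uint64_t)A);
      // prints 0xFFFFFFFFFFFFCFC7 and then 0x12345
      return 0;
    }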
+# GNU-RELA64: Type Symbol's Value Symbol's Name + Addend +# GNU-RELA64-NEXT: R_X86_64_NONE 0{{$}} +# GNU-RELA64-NEXT: R_X86_64_NONE 1{{$}} +# GNU-RELA64-NEXT: R_X86_64_NONE ffffffffffffffff{{$}} +# GNU-RELA64-NEXT: R_X86_64_NONE 7fffffffffffffff{{$}} +# GNU-RELA64-NEXT: R_X86_64_NONE 8000000000000000{{$}} +# GNU-RELA64-NEXT: R_X86_64_NONE ffffffffffffcfc7{{$}} +# GNU-RELA64-NEXT: R_X86_64_NONE 12345{{$}} + +# RUN: yaml2obj --docnum=1 -DENCODE=MSB -DTYPE=SHT_RELA %s -o %t.be64.rela +# RUN: llvm-readobj -r %t.be64.rela | FileCheck %s --check-prefix=LLVM-RELA64 +# RUN: llvm-readelf -r %t.be64.rela | FileCheck %s --check-prefix=GNU-RELA64 + +# RUN: yaml2obj --docnum=1 -DENCODE=LSB -DTYPE=SHT_REL %s -o %t.le64.rel +# RUN: llvm-readobj -r %t.le64.rel | FileCheck %s --check-prefix=LLVM-REL64 +# RUN: llvm-readelf -r %t.le64.rel | FileCheck %s --check-prefix=GNU-REL64 + +## FIXME: We either should not dump an addend or should read it from a +## destination location for a SHT_REL case. +## See https://bugs.llvm.org/show_bug.cgi?id=44257. +# LLVM-REL64-COUNT-7: R_X86_64_NONE - 0x0{{$}} +# LLVM-REL64-NOT: R_ + +# GNU-REL64: Type Symbol's Value Symbol's Name{{$}} +# GNU-REL64-COUNT-7: R_X86_64_NONE {{$}} +# GNU-REL64-NOT: R_ + +# RUN: yaml2obj --docnum=1 -DENCODE=MSB -DTYPE=SHT_REL %s -o %t.be64.rel +# RUN: llvm-readobj -r %t.be64.rel | FileCheck %s --check-prefix=LLVM-REL64 +# RUN: llvm-readelf -r %t.be64.rel | FileCheck %s --check-prefix=GNU-REL64 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2[[ENCODE]] + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .foo + Type: [[TYPE]] + Relocations: +## Addend == 0. + - Addend: 0x0 + Type: R_X86_64_NONE +## Addend == first positive int64/uint64 == 1. + - Addend: 0x1 + Type: R_X86_64_NONE +## Addend == first negative int64 == -1. + - Addend: 0xffffffffffffffff + Type: R_X86_64_NONE +## Addend == max possible int64 == 0x7FFFFFFFFFFFFFFF. + - Addend: 0x7FFFFFFFFFFFFFFF + Type: R_X86_64_NONE +## Addend == min possible int64 == 0x8000000000000000. + - Addend: 0x8000000000000000 + Type: R_X86_64_NONE +## Addend == an arbitrary negative number. + - Addend: 0xFFFFFFFFFFFFCFC7 ## -12345 + Type: R_X86_64_NONE +## Addend == an arbitrary positive number. + - Addend: 0x12345 + Type: R_X86_64_NONE + +## Starting from here we check ELFCLASS32 objects. + +# RUN: yaml2obj --docnum=2 -DENCODE=LSB -DTYPE=SHT_RELA %s -o %t.le32.rela +# RUN: llvm-readobj -r %t.le32.rela | FileCheck %s --check-prefix=LLVM-RELA32 +# RUN: llvm-readelf -r %t.le32.rela | FileCheck %s --check-prefix=GNU-RELA32 + +# LLVM-RELA32: R_386_NONE - 0x0{{$}} +# LLVM-RELA32-NEXT: R_386_NONE - 0x1{{$}} +# LLVM-RELA32-NEXT: R_386_NONE - 0xFFFFFFFF{{$}} +# LLVM-RELA32-NEXT: R_386_NONE - 0x7FFFFFFF{{$}} +# LLVM-RELA32-NEXT: R_386_NONE - 0x80000000{{$}} +# LLVM-RELA32-NEXT: R_386_NONE - 0xFFFFCFC7{{$}} + +## FIXME: GNU readelf prints addends differently. +## See https://bugs.llvm.org/show_bug.cgi?id=45235. +# GNU-RELA32: Type Sym. 
Value Symbol's Name + Addend +# GNU-RELA32-NEXT: R_386_NONE 0{{$}} +# GNU-RELA32-NEXT: R_386_NONE 1{{$}} +# GNU-RELA32-NEXT: R_386_NONE ffffffffffffffff{{$}} +# GNU-RELA32-NEXT: R_386_NONE 7fffffff{{$}} +# GNU-RELA32-NEXT: R_386_NONE ffffffff80000000{{$}} +# GNU-RELA32-NEXT: R_386_NONE ffffffffffffcfc7{{$}} + +# RUN: yaml2obj --docnum=2 -DENCODE=MSB -DTYPE=SHT_RELA %s -o %t.be32.rela +# RUN: llvm-readobj -r %t.be32.rela | FileCheck %s --check-prefix=LLVM-RELA32 +# RUN: llvm-readelf -r %t.be32.rela | FileCheck %s --check-prefix=GNU-RELA32 + +# RUN: yaml2obj --docnum=2 -DENCODE=LSB -DTYPE=SHT_REL %s -o %t.le32.rel +# RUN: llvm-readobj -r %t.le32.rel | FileCheck %s --check-prefix=LLVM-REL32 +# RUN: llvm-readelf -r %t.le32.rel | FileCheck %s --check-prefix=GNU-REL32 + +## FIXME: We either should not dump an addend or should read it from a +## destination location for a SHT_REL case. +## See https://bugs.llvm.org/show_bug.cgi?id=44257. +# LLVM-REL32-COUNT-7: R_386_NONE - 0x0{{$}} +# LLVM-REL32-NOT: R_ + +# GNU-REL32: Type +# GNU-REL32-COUNT-7: R_386_NONE {{$}} +# GNU-REL32-NOT: R_ + +# RUN: yaml2obj --docnum=2 -DENCODE=MSB -DTYPE=SHT_REL %s -o %t.be32.rel +# RUN: llvm-readobj -r %t.be32.rel | FileCheck %s --check-prefix=LLVM-REL32 +# RUN: llvm-readelf -r %t.be32.rel | FileCheck %s --check-prefix=GNU-REL32 + +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2[[ENCODE]] + Type: ET_EXEC + Machine: EM_386 +Sections: + - Name: .foo + Type: [[TYPE]] + Relocations: +## Addend == 0. + - Addend: 0x0 + Type: R_386_NONE +## Addend == first positive int32/uint32 == 1. + - Addend: 0x1 + Type: R_386_NONE +## Addend == first negative int32 == -1. + - Addend: 0xffffffff + Type: R_386_NONE +## Addend == max possible int32 == 0x7FFFFFFF. + - Addend: 0x7FFFFFFF + Type: R_386_NONE +## Addend == min possible int32 == 0x80000000. + - Addend: 0x80000000 + Type: R_386_NONE +## Addend == an arbitrary negative number. + - Addend: 0xFFFFCFC7 ## -12345 + Type: R_386_NONE +## Addend == an arbitrary positive number. + - Addend: 0x12345 + Type: R_386_NONE diff --git a/llvm/test/tools/llvm-readobj/ELF/stack-sizes.test b/llvm/test/tools/llvm-readobj/ELF/stack-sizes.test index d1f6dabdbf08c..c315749ba4251 100644 --- a/llvm/test/tools/llvm-readobj/ELF/stack-sizes.test +++ b/llvm/test/tools/llvm-readobj/ELF/stack-sizes.test @@ -92,27 +92,35 @@ Symbols: Binding: STB_GLOBAL ## Check that we correctly report the stack sizes in an executable (non-relocatable) -## object file. +## object file. This also shows that the sh_link field is ignored in this situation +## without warning. 
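In a non-relocatable object, each .stack_sizes record is a target address followed by a ULEB128 stack size, and with this change the owning function is found by address across all sections instead of via sh_link. A sketch of one decoded record (illustrative of the decoded values, not the on-disk layout, since the size field is variable length):

    #include <cstdint>

    // For docnum 2 below the records decode to (0x0, 0x10), (0x10, 0x20) and
    // (0x20, 0x30), so "other", "other_end" and "bar" are matched purely by
    // symbol value, ignoring the section's sh_link.
    struct StackSizeEntry {
      uint64_t SymValue;  // function address, matched against STT_FUNC symbols
      uint64_t StackSize; // ULEB128-encoded in the section; the Size column
    };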
# RUN: yaml2obj --docnum=2 %s -o %t02 -# RUN: llvm-readelf --stack-sizes %t02 \ -# RUN: | FileCheck %s --check-prefix=EXEC-GNU --strict-whitespace --match-full-lines -# RUN: llvm-readobj --stack-sizes %t02 | FileCheck %s --check-prefix=EXEC-LLVM +# RUN: llvm-readelf --stack-sizes %t02 2>&1 \ +# RUN: | FileCheck %s --check-prefix=EXEC-GNU --strict-whitespace \ +# RUN: --match-full-lines --implicit-check-not=warning: +# RUN: llvm-readobj --stack-sizes %t02 2>&1 \ +# RUN: | FileCheck %s --check-prefix=EXEC-LLVM --implicit-check-not=warning: # EXEC-GNU: Size Function -# EXEC-GNU-NEXT: 16 foo -# EXEC-GNU-NEXT: 32 bar +# EXEC-GNU-NEXT: 16 other +# EXEC-GNU-NEXT: 32 other_end +# EXEC-GNU-NEXT: 48 bar # EXEC-GNU-NOT:{{.}} # EXEC-LLVM: StackSizes [ # EXEC-LLVM-NEXT: Entry { -# EXEC-LLVM-NEXT: Function: foo +# EXEC-LLVM-NEXT: Function: other # EXEC-LLVM-NEXT: Size: 0x10 # EXEC-LLVM-NEXT: } # EXEC-LLVM-NEXT: Entry { -# EXEC-LLVM-NEXT: Function: bar +# EXEC-LLVM-NEXT: Function: other_end # EXEC-LLVM-NEXT: Size: 0x20 # EXEC-LLVM-NEXT: } +# EXEC-LLVM-NEXT: Entry { +# EXEC-LLVM-NEXT: Function: bar +# EXEC-LLVM-NEXT: Size: 0x30 +# EXEC-LLVM-NEXT: } # EXEC-LLVM-NEXT: ] --- !ELF @@ -126,22 +134,45 @@ Sections: Type: SHT_PROGBITS Flags: [SHF_ALLOC] Size: 16 + - Name: .text2 + Type: SHT_PROGBITS + Flags: [SHF_ALLOC] + Size: 16 - Name: .stack_sizes Type: SHT_PROGBITS Entries: - - Address: 0x10 + - Address: 0x0 Size: 0x10 - - Address: 0x20 + - Address: 0x10 Size: 0x20 - Link: .text + - Address: 0x20 + Size: 0x30 + Link: .text2 Symbols: - - Name: foo + ## Undefined symbols are ignored. + - Name: undefined + Type: STT_FUNC + Binding: STB_GLOBAL + ## sh_link of .stack_sizes is ignored for non-reloctable objects. + - Name: other + Section: .text + Value: 0 + Type: STT_FUNC + Binding: STB_GLOBAL + ## If two symbols have the same value, the first is picked, regardless of + ## the sh_link value of the .stack_sizes section. + - Name: other_end Section: .text Value: 0x10 Type: STT_FUNC Binding: STB_GLOBAL + - Name: foo + Section: .text2 + Value: 0x10 + Type: STT_FUNC + Binding: STB_GLOBAL - Name: bar - Section: .text + Section: .text2 Value: 0x20 Type: STT_FUNC Binding: STB_GLOBAL @@ -184,7 +215,8 @@ Symbols: Binding: STB_GLOBAL ## Check that we warn about a function symbol that is not in the section -## that is referenced by the stack sizes section's sh_link. +## that is referenced by the stack sizes section's sh_link, for relocatable +## output. 
# RUN: yaml2obj --docnum=4 %s -o %t04 # RUN: llvm-readelf --stack-sizes %t04 2> %t04-gnu.err | FileCheck %s --check-prefix=WRONGSECTION-GNU @@ -437,18 +469,23 @@ Sections: # MULTIPLE-GNU-EMPTY: # MULTIPLE-GNU-NEXT:Stack Sizes: # MULTIPLE-GNU-NEXT: Size Function -# MULTIPLE-GNU-NEXT: 16 foo -# MULTIPLE-GNU-NEXT: 32 bar +# MULTIPLE-GNU-NEXT: 16 other +# MULTIPLE-GNU-NEXT: 32 other_end +# MULTIPLE-GNU-NEXT: 48 bar # MULTIPLE-LLVM: StackSizes [ # MULTIPLE-LLVM-NEXT: Entry { -# MULTIPLE-LLVM-NEXT: Function: foo +# MULTIPLE-LLVM-NEXT: Function: other # MULTIPLE-LLVM-NEXT: Size: 0x10 # MULTIPLE-LLVM-NEXT: } # MULTIPLE-LLVM-NEXT: Entry { -# MULTIPLE-LLVM-NEXT: Function: bar +# MULTIPLE-LLVM-NEXT: Function: other_end # MULTIPLE-LLVM-NEXT: Size: 0x20 # MULTIPLE-LLVM-NEXT: } +# MULTIPLE-LLVM-NEXT: Entry { +# MULTIPLE-LLVM-NEXT: Function: bar +# MULTIPLE-LLVM-NEXT: Size: 0x30 +# MULTIPLE-LLVM-NEXT: } # MULTIPLE-LLVM-NEXT: ] ## Check that we do not consider symbols that are not function symbols, even though diff --git a/llvm/test/tools/obj2yaml/dynamic-section.test b/llvm/test/tools/obj2yaml/dynamic-section.test index 28066dd16adfb..65eb335746a81 100644 --- a/llvm/test/tools/obj2yaml/dynamic-section.test +++ b/llvm/test/tools/obj2yaml/dynamic-section.test @@ -1,16 +1,15 @@ -# RUN: yaml2obj %s -o %t -# RUN: obj2yaml %t | FileCheck %s +## Check we can use obj2yaml to yamalize the object containing .dynamic +## section. Check that resulting section has the proper attributes and +## dynamic tags and that we do not dump the default sh_entsize. -## Check we can use obj2yaml to yamalize the object containing -## .dynamic section. Check that resulting section has the -## proper attributes and dynamic tags. +# RUN: yaml2obj -DENTSIZE=0x10 %s -o %t1 +# RUN: obj2yaml %t1 | FileCheck %s # CHECK: Sections: # CHECK-NEXT: - Name: .dynamic # CHECK-NEXT: Type: SHT_DYNAMIC # CHECK-NEXT: Address: 0x0000000000001000 # CHECK-NEXT: AddressAlign: 0x0000000000002000 -# CHECK-NEXT: EntSize: 0x0000000000000010 # CHECK-NEXT: Entries: # CHECK-NEXT: - Tag: DT_NULL # CHECK-NEXT: Value: 0x0000000000000000 @@ -136,7 +135,7 @@ Sections: Type: SHT_DYNAMIC Address: 0x0000000000001000 AddressAlign: 0x0000000000002000 - EntSize: 0x0000000000000010 + EntSize: [[ENTSIZE]] Entries: - Tag: DT_NULL Value: 0x0000000000000000 @@ -250,3 +249,11 @@ Sections: Value: 0x0000000000000036 - Tag: DT_USED Value: 0x0000000000000001 + +## Test the behavior when sh_entsize is invalid. +## Here we use 0xFE as an arbitrary broken value instead of expected 0x16. 
+ +# RUN: yaml2obj -DENTSIZE=0xff %s -o %t2 +# RUN: not obj2yaml %t2 2>&1 | FileCheck %s --check-prefix=ENTSIZE + +# ENTSIZE: section [index 1] has an invalid sh_entsize: 255 diff --git a/llvm/test/tools/obj2yaml/elf-ppc64-relocations.yaml b/llvm/test/tools/obj2yaml/elf-ppc64-relocations.yaml index 512b71ea4d833..e476242eb1c67 100644 --- a/llvm/test/tools/obj2yaml/elf-ppc64-relocations.yaml +++ b/llvm/test/tools/obj2yaml/elf-ppc64-relocations.yaml @@ -10,9 +10,8 @@ # CHECK-NEXT: Type: ET_REL # CHECK-NEXT: Machine: EM_PPC64 # CHECK-NEXT: Sections: -# CHECK-NEXT: - Name: .rela.text -# CHECK-NEXT: Type: SHT_RELA -# CHECK-NEXT: EntSize: 0x0000000000000018 +# CHECK-NEXT: - Name: .rela.text +# CHECK-NEXT: Type: SHT_RELA # CHECK-NEXT: Relocations: # CHECK-NEXT: - Type: R_PPC64_NONE # CHECK-NEXT: - Type: R_PPC64_ADDR32 diff --git a/llvm/test/tools/obj2yaml/elf-reladyn-section-shinfo.yaml b/llvm/test/tools/obj2yaml/elf-reladyn-section-shinfo.yaml index 0f2906470f3de..8863ac8c9e9bd 100644 --- a/llvm/test/tools/obj2yaml/elf-reladyn-section-shinfo.yaml +++ b/llvm/test/tools/obj2yaml/elf-reladyn-section-shinfo.yaml @@ -19,11 +19,11 @@ # CHECK-NEXT: AddressAlignment: # CHECK-NEXT: EntrySize: -# YAML: - Name: .rela.dyn -# YAML-NEXT: Type: SHT_RELA -# YAML-NEXT: Flags: [ SHF_ALLOC ] -# YAML-NEXT: Link: .dynsym -# YAML-NEXT: EntSize: 0x0000000000000018 +# YAML: - Name: .rela.dyn +# YAML-NEXT: Type: SHT_RELA +# YAML-NEXT: Flags: [ SHF_ALLOC ] +# YAML-NEXT: Link: .dynsym +# YAML-NEXT: - Name: --- !ELF FileHeader: @@ -37,7 +37,6 @@ Sections: Type: SHT_RELA Flags: [ SHF_ALLOC ] Link: .dynsym - EntSize: 0x0000000000000018 # Add at least one symbol to trigger the .dynsym emission. DynamicSymbols: - Name: bar diff --git a/llvm/test/tools/obj2yaml/no-symbol-reloc.test b/llvm/test/tools/obj2yaml/no-symbol-reloc.test index 97800491a7afa..8940f6a9e8b79 100644 --- a/llvm/test/tools/obj2yaml/no-symbol-reloc.test +++ b/llvm/test/tools/obj2yaml/no-symbol-reloc.test @@ -16,7 +16,6 @@ # CHECK-NEXT: Flags: [ SHF_ALLOC, SHF_EXECINSTR ] # CHECK-NEXT: - Name: .rela.text # CHECK-NEXT: Type: SHT_RELA -# CHECK-NEXT: EntSize: 0x0000000000000018 # CHECK-NEXT: Info: .text # CHECK-NEXT: Relocations: # CHECK-NEXT: - Type: R_X86_64_NONE diff --git a/llvm/test/tools/obj2yaml/rel-rela-section.yaml b/llvm/test/tools/obj2yaml/rel-rela-section.yaml new file mode 100644 index 0000000000000..6bef1d30cad8d --- /dev/null +++ b/llvm/test/tools/obj2yaml/rel-rela-section.yaml @@ -0,0 +1,49 @@ +## This is a generic test for SHT_REL/SHT_RELA sections. + +## Check that we do not print excessive default +## fields for SHT_REL[A] sections. +# RUN: yaml2obj %s -o %t1 +# RUN: obj2yaml %t1 | FileCheck %s --check-prefix=YAML + +## Note: it is important to have at least two sections with sh_info == 0. +## Previously we printed a broken Info field in this case. +# YAML: - Name: .rela.dyn +# YAML-NEXT: Type: SHT_RELA +# YAML-NEXT: - Name: .rel.dyn +# YAML-NEXT: Type: SHT_REL +# YAML-NEXT: - Name + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .rela.dyn + Type: SHT_RELA + - Name: .rel.dyn + Type: SHT_REL +## Trigger the .dynsym emission. +DynamicSymbols: [] + +## Test the behavior when the sh_entsize field is broken. +## Here we use the 0xFE value instead of expected 0x18/0x10. 
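For reference, the expected values come from the fixed ELF64 relocation record sizes: an SHT_RELA entry (offset, info, addend) is 0x18 bytes and an SHT_REL entry (offset, info) is 0x10 bytes, so the 0xFE value used below cannot be valid. A minimal sketch of those layouts (illustrative struct names, not LLVM's ELF types):

    #include <cstdint>

    struct Elf64Rel  { uint64_t r_offset; uint64_t r_info; };                   // 0x10 bytes
    struct Elf64Rela { uint64_t r_offset; uint64_t r_info; int64_t r_addend; }; // 0x18 bytes
    static_assert(sizeof(Elf64Rel) == 0x10 && sizeof(Elf64Rela) == 0x18,
                  "ELF64 relocation record sizes");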
+ +# RUN: yaml2obj -DTYPE=SHT_RELA --docnum=2 %s -o %t2.rela +# RUN: not obj2yaml %t2.rela 2>&1 | FileCheck %s --check-prefix=ERR +# RUN: yaml2obj -DTYPE=SHT_REL --docnum=2 %s -o %t2.rel +# RUN: not obj2yaml %t2.rel 2>&1 | FileCheck %s --check-prefix=ERR + +# ERR: section [index 1] has an invalid sh_entsize: 254 + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .foo + Type: [[TYPE]] + EntSize: 0xFE diff --git a/llvm/test/tools/obj2yaml/relocation-type.yaml b/llvm/test/tools/obj2yaml/relocation-type.yaml index 6ab8e9c462ea1..d069abe68edae 100644 --- a/llvm/test/tools/obj2yaml/relocation-type.yaml +++ b/llvm/test/tools/obj2yaml/relocation-type.yaml @@ -13,7 +13,6 @@ # CHECK-NEXT: Sections: # CHECK-NEXT: - Name: .rela.text # CHECK-NEXT: Type: SHT_RELA -# CHECK-NEXT: EntSize: 0x0000000000000018 # CHECK-NEXT: Relocations: # CHECK-NEXT: - Offset: 0x0000000000000009 # CHECK-NEXT: Type: [[FIRST]] diff --git a/llvm/test/tools/obj2yaml/relr-section.yaml b/llvm/test/tools/obj2yaml/relr-section.yaml index 37ddf2c2e6716..3134fcc22abe7 100644 --- a/llvm/test/tools/obj2yaml/relr-section.yaml +++ b/llvm/test/tools/obj2yaml/relr-section.yaml @@ -1,7 +1,8 @@ ## Test how we dump SHT_RELR sections for 32 and 64-bit targets. -## Test we use the "Entries" property when it is possible do -## dump values correctly. +## Test we use the "Entries" property when it is possible to +## dump values correctly. Also, check we do not dump sh_entsize when +## it has the default value. # RUN: yaml2obj --docnum=1 -D BITS=32 -D ENCODE=LSB %s -o %t.32le # RUN: obj2yaml %t.32le | FileCheck %s --check-prefix=ELF32LE @@ -15,25 +16,21 @@ # ELF64LE: Sections: # ELF64LE-NEXT: - Name: .relr.dyn # ELF64LE-NEXT: Type: SHT_RELR -# ELF64LE-NEXT: EntSize: 0x0000000000000008 # ELF64LE-NEXT: Entries: [ 0x8877665544332211 ] # ELF32LE: Sections: # ELF32LE-NEXT: - Name: .relr.dyn # ELF32LE-NEXT: Type: SHT_RELR -# ELF32LE-NEXT: EntSize: 0x0000000000000004 # ELF32LE-NEXT: Entries: [ 0x0000000044332211, 0x0000000088776655 ] # ELF64BE: Sections: # ELF64BE-NEXT: - Name: .relr.dyn # ELF64BE-NEXT: Type: SHT_RELR -# ELF64BE-NEXT: EntSize: 0x0000000000000008 # ELF64BE-NEXT: Entries: [ 0x1122334455667788 ] # ELF32BE: Sections: # ELF32BE-NEXT: - Name: .relr.dyn # ELF32BE-NEXT: Type: SHT_RELR -# ELF32BE-NEXT: EntSize: 0x0000000000000004 # ELF32BE-NEXT: Entries: [ 0x0000000011223344, 0x0000000055667788 ] --- !ELF @@ -54,7 +51,6 @@ Sections: # CONTENT: - Name: .relr.dyn # CONTENT-NEXT: Type: SHT_RELR -# CONTENT-NEXT: EntSize: 0x0000000000000008 # CONTENT-NEXT: Content: '11223344556677' --- !ELF @@ -67,3 +63,27 @@ Sections: - Name: .relr.dyn Type: SHT_RELR Content: "11223344556677" + +## Test we are able to dump a SHT_RELR section when sh_entsize is invalid. +## Here we use 0xFE as a value instead of expected 0x8. + +# RUN: yaml2obj --docnum=3 %s -o %t.entsize +# RUN: obj2yaml %t.entsize | FileCheck %s --check-prefix=ENTSIZE + +# ENTSIZE: - Name: .relr.dyn +# ENTSIZE-NEXT: Type: SHT_RELR +# ENTSIZE-NEXT: EntSize: 0x00000000000000FE +# ENTSIZE-NEXT: Content: '1122334455667788' +# ENTSIZE-NEXT: ... 
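The default SHT_RELR sh_entsize that obj2yaml now omits is simply the target word size, since each RELR entry is one address-or-bitmap word: 8 bytes for ELFCLASS64 and 4 bytes for ELFCLASS32; only a non-default value such as the broken 0xFE used below is still emitted. A one-line sketch of that assumption (illustrative only):

    #include <cstdint>

    using Elf64RelrEntry = uint64_t; // default sh_entsize 0x8
    using Elf32RelrEntry = uint32_t; // default sh_entsize 0x4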
+ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2MSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .relr.dyn + Type: SHT_RELR + EntSize: 0xFE + Content: "1122334455667788" diff --git a/llvm/tools/bugpoint/CMakeLists.txt b/llvm/tools/bugpoint/CMakeLists.txt index 421889cfedb7f..df10bfe4fcac1 100644 --- a/llvm/tools/bugpoint/CMakeLists.txt +++ b/llvm/tools/bugpoint/CMakeLists.txt @@ -39,4 +39,4 @@ add_llvm_tool(bugpoint intrinsics_gen SUPPORT_PLUGINS ) -export_executable_symbols(bugpoint) +export_executable_symbols_for_plugins(bugpoint) diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp index 51912c78769bf..c1c247d1baa02 100644 --- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp +++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp @@ -297,6 +297,7 @@ bool DwarfLinkerForBinary::link(const DebugMap &Map) { remarks::RemarkLinker RL; if (!Options.RemarksPrependPath.empty()) RL.setExternalFilePrependPath(Options.RemarksPrependPath); + GeneralLinker.setObjectPrefixMap(&Options.ObjectPrefixMap); std::function TranslationLambda = [&](StringRef Input) { assert(Options.Translator); diff --git a/llvm/tools/dsymutil/LinkUtils.h b/llvm/tools/dsymutil/LinkUtils.h index 92de81da8fa08..0339f4485e73b 100644 --- a/llvm/tools/dsymutil/LinkUtils.h +++ b/llvm/tools/dsymutil/LinkUtils.h @@ -57,6 +57,9 @@ struct LinkOptions { /// -oso-prepend-path std::string PrependPath; + /// The -object-prefix-map. + std::map ObjectPrefixMap; + /// The Resources directory in the .dSYM bundle. Optional ResourceDir; diff --git a/llvm/tools/dsymutil/Options.td b/llvm/tools/dsymutil/Options.td index eb86c2f3ae1b3..5360bf09ac751 100644 --- a/llvm/tools/dsymutil/Options.td +++ b/llvm/tools/dsymutil/Options.td @@ -110,6 +110,15 @@ def oso_prepend_path: Separate<["--", "-"], "oso-prepend-path">, Group; def: Joined<["--", "-"], "oso-prepend-path=">, Alias; +def object_prefix_map: Separate<["--", "-"], "object-prefix-map">, + MetaVarName<"">, + HelpText<"Remap object file paths (but no source paths) before processing." 
+ "Use this for Clang objects where the module cache location was" + "remapped using -fdebug-prefix-map; to help dsymutil" + "find the Clang module cache.">, + Group; +def: Joined<["--", "-"], "object-prefix-map=">, Alias; + def symbolmap: Separate<["--", "-"], "symbol-map">, MetaVarName<"">, HelpText<"Updates the existing dSYMs inplace using symbol map specified.">, diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp index 54adeaa11c1a2..8b70673785daf 100644 --- a/llvm/tools/dsymutil/dsymutil.cpp +++ b/llvm/tools/dsymutil/dsymutil.cpp @@ -246,6 +246,12 @@ static Expected getOptions(opt::InputArgList &Args) { if (opt::Arg *OsoPrependPath = Args.getLastArg(OPT_oso_prepend_path)) Options.LinkOpts.PrependPath = OsoPrependPath->getValue(); + for (const auto &Arg : Args.getAllArgValues(OPT_object_prefix_map)) { + auto Split = StringRef(Arg).split('='); + Options.LinkOpts.ObjectPrefixMap.insert( + {std::string(Split.first), std::string(Split.second)}); + } + if (opt::Arg *OutputFile = Args.getLastArg(OPT_output)) Options.OutputFile = OutputFile->getValue(); diff --git a/llvm/tools/llc/CMakeLists.txt b/llvm/tools/llc/CMakeLists.txt index 479bc6b55b27f..2eecfca2e075f 100644 --- a/llvm/tools/llc/CMakeLists.txt +++ b/llvm/tools/llc/CMakeLists.txt @@ -27,4 +27,4 @@ add_llvm_tool(llc SUPPORT_PLUGINS ) -export_executable_symbols(llc) +export_executable_symbols_for_plugins(llc) diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp index a97d87d739a53..76dd071bd2c71 100644 --- a/llvm/tools/llc/llc.cpp +++ b/llvm/tools/llc/llc.cpp @@ -50,6 +50,7 @@ #include "llvm/Support/TargetSelect.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/WithColor.h" +#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/Cloning.h" #include @@ -596,6 +597,8 @@ static int compileModule(char **argv, LLVMContext &Context) { return 1; } + const_cast(LLVMTM.getObjFileLowering()) + ->Initialize(MMIWP->getMMI().getContext(), *Target); if (MIR) { assert(MMIWP && "Forgot to create MMIWP?"); if (MIR->parseMachineFunctions(*M, MMIWP->getMMI())) diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp index f7a0022b68c4f..45ce927433591 100644 --- a/llvm/tools/lli/lli.cpp +++ b/llvm/tools/lli/lli.cpp @@ -30,6 +30,7 @@ #include "llvm/ExecutionEngine/Orc/LLJIT.h" #include "llvm/ExecutionEngine/Orc/MachOPlatform.h" #include "llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h" +#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/OrcMCJITReplacement.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/IR/IRBuilder.h" @@ -891,6 +892,11 @@ int runOrcLazyJIT(const char *ProgName) { auto J = ExitOnErr(Builder.create()); + if (TT->isOSBinFormatELF()) + static_cast(J->getObjLinkingLayer()) + .registerJITEventListener( + *JITEventListener::createGDBRegistrationListener()); + if (PerModuleLazy) J->setPartitionFunction(orc::CompileOnDemandLayer::compileWholeModule); diff --git a/llvm/tools/llvm-ar/llvm-ar.cpp b/llvm/tools/llvm-ar/llvm-ar.cpp index 4e5c0b408f776..652219568db14 100644 --- a/llvm/tools/llvm-ar/llvm-ar.cpp +++ b/llvm/tools/llvm-ar/llvm-ar.cpp @@ -14,11 +14,14 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" +#include "llvm/BinaryFormat/Magic.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Object/Archive.h" #include "llvm/Object/ArchiveWriter.h" +#include "llvm/Object/IRObjectFile.h" #include 
"llvm/Object/MachO.h" #include "llvm/Object/ObjectFile.h" +#include "llvm/Object/SymbolicFile.h" #include "llvm/Support/Chrono.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ConvertUTF.h" @@ -875,8 +878,9 @@ static object::Archive::Kind getDefaultForHost() { } static object::Archive::Kind getKindFromMember(const NewArchiveMember &Member) { + auto MemBufferRef = Member.Buf->getMemBufferRef(); Expected> OptionalObject = - object::ObjectFile::createObjectFile(Member.Buf->getMemBufferRef()); + object::ObjectFile::createObjectFile(MemBufferRef); if (OptionalObject) return isa(**OptionalObject) @@ -885,6 +889,23 @@ static object::Archive::Kind getKindFromMember(const NewArchiveMember &Member) { // squelch the error in case we had a non-object file consumeError(OptionalObject.takeError()); + + // If we're adding a bitcode file to the archive, detect the Archive kind + // based on the target triple. + LLVMContext Context; + if (identify_magic(MemBufferRef.getBuffer()) == file_magic::bitcode) { + if (auto ObjOrErr = object::SymbolicFile::createSymbolicFile( + MemBufferRef, file_magic::bitcode, &Context)) { + auto &IRObject = cast(**ObjOrErr); + return Triple(IRObject.getTargetTriple()).isOSDarwin() + ? object::Archive::K_DARWIN + : object::Archive::K_GNU; + } else { + // Squelch the error in case this was not a SymbolicFile. + consumeError(ObjOrErr.takeError()); + } + } + return getDefaultForHost(); } diff --git a/llvm/tools/llvm-dwp/llvm-dwp.cpp b/llvm/tools/llvm-dwp/llvm-dwp.cpp index df524ad0a2e8f..51b3470afee45 100644 --- a/llvm/tools/llvm-dwp/llvm-dwp.cpp +++ b/llvm/tools/llvm-dwp/llvm-dwp.cpp @@ -479,7 +479,7 @@ static Error buildDuplicateError(const std::pair &PrevE, const CompileUnitIdentifiers &ID, StringRef DWPName) { return make_error( - std::string("Duplicate DWO ID (") + utohexstr(PrevE.first) + ") in " + + std::string("duplicate DWO ID (") + utohexstr(PrevE.first) + ") in " + buildDWODescription(PrevE.second.Name, PrevE.second.DWPName, PrevE.second.DWOName) + " and " + buildDWODescription(ID.Name, DWPName, ID.DWOName)); @@ -596,7 +596,7 @@ static Error write(MCStreamer &Out, ArrayRef Inputs) { DWARFUnitIndex CUIndex(DW_SECT_INFO); DataExtractor CUIndexData(CurCUIndexSection, Obj.isLittleEndian(), 0); if (!CUIndex.parse(CUIndexData)) - return make_error("Failed to parse cu_index"); + return make_error("failed to parse cu_index"); for (const DWARFUnitIndex::Entry &E : CUIndex.getRows()) { auto *I = E.getOffsets(); @@ -631,7 +631,7 @@ static Error write(MCStreamer &Out, ArrayRef Inputs) { DWARFUnitIndex TUIndex(DW_SECT_TYPES); DataExtractor TUIndexData(CurTUIndexSection, Obj.isLittleEndian(), 0); if (!TUIndex.parse(TUIndexData)) - return make_error("Failed to parse tu_index"); + return make_error("failed to parse tu_index"); addAllTypesFromDWP(Out, TypeIndexEntries, TUIndex, TypesSection, CurTypesSection.front(), CurEntry, ContributionOffsets[DW_SECT_TYPES - DW_SECT_INFO]); diff --git a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp index 8e95ebb73331a..69b23b6cf9756 100644 --- a/llvm/tools/llvm-objcopy/llvm-objcopy.cpp +++ b/llvm/tools/llvm-objcopy/llvm-objcopy.cpp @@ -327,11 +327,25 @@ enum class ToolType { Objcopy, Strip, InstallNameTool }; int main(int argc, char **argv) { InitLLVM X(argc, argv); ToolName = argv[0]; - ToolType Tool = StringSwitch(sys::path::stem(ToolName)) - .EndsWith("strip", ToolType::Strip) - .EndsWith("install-name-tool", ToolType::InstallNameTool) - .EndsWith("install_name_tool", ToolType::InstallNameTool) - 
.Default(ToolType::Objcopy); + + StringRef Stem = sys::path::stem(ToolName); + auto Is = [=](StringRef Tool) { + // We need to recognize the following filenames: + // + // llvm-objcopy -> objcopy + // strip-10.exe -> strip + // powerpc64-unknown-freebsd13-objcopy -> objcopy + // llvm-install-name-tool -> install-name-tool + auto I = Stem.rfind_lower(Tool); + return I != StringRef::npos && + (I + Tool.size() == Stem.size() || !isAlnum(Stem[I + Tool.size()])); + }; + ToolType Tool = ToolType::Objcopy; + if (Is("strip")) + Tool = ToolType::Strip; + else if (Is("install-name-tool") || Is("install_name_tool")) + Tool = ToolType::InstallNameTool; + // Expand response files. // TODO: Move these lines, which are copied from lib/Support/CommandLine.cpp, // into a separate function in the CommandLine library and call that function diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 283f1b24020df..defa4a3cc1095 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -283,7 +283,7 @@ template class ELFDumper : public ObjDumper { DynRegionInfo DynRelaRegion; DynRegionInfo DynRelrRegion; DynRegionInfo DynPLTRelRegion; - DynRegionInfo DynSymRegion; + Optional DynSymRegion; DynRegionInfo DynamicTable; StringRef DynamicStringTable; std::string SOName = ""; @@ -322,7 +322,9 @@ template class ELFDumper : public ObjDumper { } Elf_Sym_Range dynamic_symbols() const { - return DynSymRegion.getAsArrayRef(); + if (!DynSymRegion) + return Elf_Sym_Range(); + return DynSymRegion->getAsArrayRef(); } Elf_Rel_Range dyn_rels() const; @@ -667,8 +669,8 @@ void ELFDumper::printSymbolsHelper(bool IsDynamic) const { StrTable = DynamicStringTable; Syms = dynamic_symbols(); SymtabName = DynSymtabName; - if (DynSymRegion.Addr) - Entries = DynSymRegion.Size / DynSymRegion.EntSize; + if (DynSymRegion) + Entries = DynSymRegion->Size / DynSymRegion->EntSize; } else { if (!DotSymtabSec) return; @@ -752,7 +754,7 @@ template class DumpStyle { void printRelocatableStackSizes(const ELFObjectFile *Obj, std::function PrintHeader); void printFunctionStackSize(const ELFObjectFile *Obj, uint64_t SymValue, - SectionRef FunctionSec, + Optional FunctionSec, const StringRef SectionName, DataExtractor Data, uint64_t *Offset); void printStackSize(const ELFObjectFile *Obj, RelocationRef Rel, @@ -993,7 +995,7 @@ std::error_code createELFDumper(const object::ObjectFile *Obj, template Error ELFDumper::LoadVersionMap() const { // If there is no dynamic symtab or version table, there is nothing to do. - if (!DynSymRegion.Addr || !SymbolVersionSection) + if (!DynSymRegion || !SymbolVersionSection) return Error::success(); // Has the VersionMap already been loaded? @@ -1043,10 +1045,11 @@ Expected ELFDumper::getSymbolVersion(const Elf_Sym *Sym, return ""; } + assert(DynSymRegion && "DynSymRegion has not been initialised"); // Determine the position in the symbol table of this entry. size_t EntryIndex = (reinterpret_cast(Sym) - - reinterpret_cast(DynSymRegion.Addr)) / - sizeof(Elf_Sym); + reinterpret_cast(DynSymRegion->Addr)) / + sizeof(Elf_Sym); // Get the corresponding version index entry. 
const Elf_Versym *Versym = unwrapOrError( @@ -1980,8 +1983,12 @@ ELFDumper::ELFDumper(const object::ELFObjectFile *ObjF, ScopedPrinter &Writer) : ObjDumper(Writer), ObjF(ObjF), DynRelRegion(ObjF->getFileName()), DynRelaRegion(ObjF->getFileName()), DynRelrRegion(ObjF->getFileName()), - DynPLTRelRegion(ObjF->getFileName()), DynSymRegion(ObjF->getFileName()), - DynamicTable(ObjF->getFileName()) { + DynPLTRelRegion(ObjF->getFileName()), DynamicTable(ObjF->getFileName()) { + if (opts::Output == opts::GNU) + ELFDumperStyle.reset(new GNUStyle(Writer, this)); + else + ELFDumperStyle.reset(new LLVMStyle(Writer, this)); + const ELFFile *Obj = ObjF->getELFFile(); typename ELFT::ShdrRange Sections = unwrapOrError(ObjF->getFileName(), Obj->sections()); @@ -1992,9 +1999,9 @@ ELFDumper::ELFDumper(const object::ELFObjectFile *ObjF, DotSymtabSec = &Sec; break; case ELF::SHT_DYNSYM: - if (!DynSymRegion.Size) { + if (!DynSymRegion) { DynSymRegion = createDRIFrom(&Sec); - DynSymRegion.Context = + DynSymRegion->Context = ("section with index " + Twine(&Sec - &Sections.front())).str(); // This is only used (if Elf_Shdr present)for naming section in GNU // style @@ -2034,11 +2041,6 @@ ELFDumper::ELFDumper(const object::ELFObjectFile *ObjF, } loadDynamicTable(Obj); - - if (opts::Output == opts::GNU) - ELFDumperStyle.reset(new GNUStyle(Writer, this)); - else - ELFDumperStyle.reset(new LLVMStyle(Writer, this)); } template @@ -2059,6 +2061,7 @@ void ELFDumper::parseDynamicTable(const ELFFile *Obj) { uint64_t SONameOffset = 0; const char *StringTableBegin = nullptr; uint64_t StringTableSize = 0; + Optional DynSymFromTable; for (const Elf_Dyn &Dyn : dynamic_table()) { switch (Dyn.d_tag) { case ELF::DT_HASH: @@ -2077,26 +2080,13 @@ void ELFDumper::parseDynamicTable(const ELFFile *Obj) { StringTableSize = Dyn.getVal(); break; case ELF::DT_SYMTAB: { - // Often we find the information about the dynamic symbol table - // location in the SHT_DYNSYM section header. However, the value in - // DT_SYMTAB has priority, because it is used by dynamic loaders to - // locate .dynsym at runtime. The location we find in the section header - // and the location we find here should match. If we can't map the - // DT_SYMTAB value to an address (e.g. when there are no program headers), we - // ignore its value. + // If we can't map the DT_SYMTAB value to an address (e.g. when there are + // no program headers), we ignore its value. if (const uint8_t *VA = toMappedAddr(Dyn.getTag(), Dyn.getPtr())) { - // EntSize is non-zero if the dynamic symbol table has been found via a - // section header. - if (DynSymRegion.EntSize && VA != DynSymRegion.Addr) - reportWarning( - createError( - "SHT_DYNSYM section header and DT_SYMTAB disagree about " - "the location of the dynamic symbol table"), - ObjF->getFileName()); - - DynSymRegion.Addr = VA; - DynSymRegion.EntSize = sizeof(Elf_Sym); - DynSymRegion.EntSizePrintName = ""; + DynSymFromTable.emplace(ObjF->getFileName()); + DynSymFromTable->Addr = VA; + DynSymFromTable->EntSize = sizeof(Elf_Sym); + DynSymFromTable->EntSizePrintName = ""; } break; } @@ -2176,6 +2166,48 @@ void ELFDumper::parseDynamicTable(const ELFFile *Obj) { if (StringTableBegin) DynamicStringTable = StringRef(StringTableBegin, StringTableSize); SOName = getDynamicString(SONameOffset); + + if (DynSymRegion) { + // Often we find the information about the dynamic symbol table + // location in the SHT_DYNSYM section header. 
However, the value in + // DT_SYMTAB has priority, because it is used by dynamic loaders to + // locate .dynsym at runtime. The location we find in the section header + // and the location we find here should match. + if (DynSymFromTable && DynSymFromTable->Addr != DynSymRegion->Addr) + ELFDumperStyle->reportUniqueWarning( + createError("SHT_DYNSYM section header and DT_SYMTAB disagree about " + "the location of the dynamic symbol table")); + + // According to the ELF gABI: "The number of symbol table entries should + // equal nchain". Check to see if the DT_HASH hash table nchain value + // conflicts with the number of symbols in the dynamic symbol table + // according to the section header. + if (HashTable && + HashTable->nchain != DynSymRegion->Size / DynSymRegion->EntSize) + ELFDumperStyle->reportUniqueWarning(createError( + "hash table nchain (" + Twine(HashTable->nchain) + + ") differs from symbol count derived from SHT_DYNSYM section " + "header (" + + Twine(DynSymRegion->Size / DynSymRegion->EntSize) + ")")); + } + + // Delay the creation of the actual dynamic symbol table until now, so that + // checks can always be made against the section header-based properties, + // without worrying about tag order. + if (DynSymFromTable) { + if (!DynSymRegion) { + DynSymRegion = DynSymFromTable; + } else { + DynSymRegion->Addr = DynSymFromTable->Addr; + DynSymRegion->EntSize = DynSymFromTable->EntSize; + DynSymRegion->EntSizePrintName = DynSymFromTable->EntSizePrintName; + } + } + + // Derive the dynamic symbol table size from the DT_HASH hash table, if + // present. + if (HashTable && DynSymRegion) + DynSymRegion->Size = HashTable->nchain * DynSymRegion->EntSize; } template @@ -2591,7 +2623,7 @@ template void ELFDumper::printGnuHashTable() { ArrayRef Buckets = GnuHashTable->buckets(); W.printList("Buckets", Buckets); - if (!DynSymRegion.Addr) { + if (!DynSymRegion) { reportWarning(createError("unable to dump 'Values' for the SHT_GNU_HASH " "section: no dynamic symbol table found"), ObjF->getFileName()); @@ -3689,10 +3721,10 @@ void GNUStyle::printSymtabMessage(const ELFO *Obj, StringRef Name, size_t Entries, bool NonVisibilityBitsUsed) { if (!Name.empty()) - OS << "\nSymbol table '" << Name << "' contains " << Entries - << " entries:\n"; + OS << "\nSymbol table '" << Name << "'"; else - OS << "\n Symbol table for image:\n"; + OS << "\nSymbol table for image"; + OS << " contains " << Entries << " entries:\n"; if (ELFT::Is64Bits) OS << " Num: Value Size Type Bind Vis"; @@ -5174,9 +5206,12 @@ static std::string getSymbolName(const ELFSymbolRef &Sym) { } template -void DumpStyle::printFunctionStackSize( - const ELFObjectFile *Obj, uint64_t SymValue, SectionRef FunctionSec, - const StringRef SectionName, DataExtractor Data, uint64_t *Offset) { +void DumpStyle::printFunctionStackSize(const ELFObjectFile *Obj, + uint64_t SymValue, + Optional FunctionSec, + const StringRef SectionName, + DataExtractor Data, + uint64_t *Offset) { // This function ignores potentially erroneous input, unless it is directly // related to stack size reporting. SymbolRef FuncSym; @@ -5186,9 +5221,15 @@ void DumpStyle::printFunctionStackSize( consumeError(SymAddrOrErr.takeError()); continue; } + if (Expected SymFlags = Symbol.getFlags()) { + if (*SymFlags & SymbolRef::SF_Undefined) + continue; + } else + consumeError(SymFlags.takeError()); if (Symbol.getELFType() == ELF::STT_FUNC && *SymAddrOrErr == SymValue) { - // Check if the symbol is in the right section. 
- if (FunctionSec.containsSymbol(Symbol)) { + // Check if the symbol is in the right section. FunctionSec == None means + // "any section". + if (!FunctionSec || FunctionSec->containsSymbol(Symbol)) { FuncSym = Symbol; break; } @@ -5299,11 +5340,6 @@ void DumpStyle::printNonRelocatableStackSizes( ArrayRef Contents = unwrapOrError(this->FileName, EF->getSectionContents(ElfSec)); DataExtractor Data(Contents, Obj->isLittleEndian(), sizeof(Elf_Addr)); - // A .stack_sizes section header's sh_link field is supposed to point - // to the section that contains the functions whose stack sizes are - // described in it. - const Elf_Shdr *FunctionELFSec = - unwrapOrError(this->FileName, EF->getSection(ElfSec->sh_link)); uint64_t Offset = 0; while (Offset < Contents.size()) { // The function address is followed by a ULEB representing the stack @@ -5317,8 +5353,8 @@ void DumpStyle::printNonRelocatableStackSizes( FileStr); } uint64_t SymValue = Data.getAddress(&Offset); - printFunctionStackSize(Obj, SymValue, Obj->toSectionRef(FunctionELFSec), - SectionName, Data, &Offset); + printFunctionStackSize(Obj, SymValue, /*FunctionSec=*/None, SectionName, + Data, &Offset); } } } @@ -5881,7 +5917,12 @@ void LLVMStyle::printSymbolSection(const Elf_Sym *Symbol, Expected SectionName = this->dumper()->getSymbolSectionName(Symbol, *SectionIndex); if (!SectionName) { - this->reportUniqueWarning(SectionName.takeError()); + // Don't report an invalid section name if the section headers are missing. + // In such situations, all sections will be "invalid". + if (!this->dumper()->getElfObject()->sections().empty()) + this->reportUniqueWarning(SectionName.takeError()); + else + consumeError(SectionName.takeError()); W.printHex("Section", "", *SectionIndex); } else { W.printHex("Section", *SectionName, *SectionIndex); diff --git a/llvm/tools/llvm-stress/CMakeLists.txt b/llvm/tools/llvm-stress/CMakeLists.txt index 139ab9e0d8f96..e4d1ae65ee76d 100644 --- a/llvm/tools/llvm-stress/CMakeLists.txt +++ b/llvm/tools/llvm-stress/CMakeLists.txt @@ -10,4 +10,3 @@ add_llvm_tool(llvm-stress DEPENDS intrinsics_gen ) -export_executable_symbols(llvm-stress) diff --git a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp index 96b2b72d8ba11..8f4a93fcc25d2 100644 --- a/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp +++ b/llvm/tools/llvm-symbolizer/llvm-symbolizer.cpp @@ -309,6 +309,9 @@ int main(int argc, char **argv) { Opts.FallbackDebugPath = ClFallbackDebugPath; Opts.DWPName = ClDwpName; Opts.DebugFileDirectory = ClDebugFileDirectory; + Opts.PathStyle = DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath; + if (ClBasenames) + Opts.PathStyle = DILineInfoSpecifier::FileLineInfoKind::BaseNameOnly; for (const auto &hint : ClDsymHint) { if (sys::path::extension(hint) == ".dSYM") { @@ -322,7 +325,7 @@ int main(int argc, char **argv) { DIPrinter Printer(outs(), ClPrintFunctions != FunctionNameKind::None, ClPrettyPrint, ClPrintSourceContextLines, ClVerbose, - ClBasenames, ClOutputStyle); + ClOutputStyle); if (ClInputAddresses.empty()) { const int kMaxInputStringLength = 1024; diff --git a/llvm/tools/obj2yaml/elf2yaml.cpp b/llvm/tools/obj2yaml/elf2yaml.cpp index 180457bb6d91e..ac3eefdeaace6 100644 --- a/llvm/tools/obj2yaml/elf2yaml.cpp +++ b/llvm/tools/obj2yaml/elf2yaml.cpp @@ -89,8 +89,6 @@ class ELFDumper { Expected dumpStackSizesSection(const Elf_Shdr *Shdr); - Expected dumpSpecialSection(const Elf_Shdr *Shdr); - public: ELFDumper(const object::ELFFile &O); Expected dump(); @@ -242,19 +240,71 @@ 
template <class ELFT> Expected<std::vector<std::unique_ptr<ELFYAML::Section>>> ELFDumper<ELFT>::dumpSections() { std::vector<std::unique_ptr<ELFYAML::Section>> Ret; + auto Add = [&](Expected<ELFYAML::Section *> SecOrErr) -> Error { + if (!SecOrErr) + return SecOrErr.takeError(); + Ret.emplace_back(*SecOrErr); + return Error::success(); + }; + + auto GetDumper = [this](unsigned Type) + -> std::function<Expected<ELFYAML::Section *>(const Elf_Shdr *)> { + switch (Type) { + case ELF::SHT_DYNAMIC: + return [this](const Elf_Shdr *S) { return dumpDynamicSection(S); }; + case ELF::SHT_SYMTAB_SHNDX: + return [this](const Elf_Shdr *S) { return dumpSymtabShndxSection(S); }; + case ELF::SHT_REL: + case ELF::SHT_RELA: + return [this](const Elf_Shdr *S) { return dumpRelocSection(S); }; + case ELF::SHT_RELR: + return [this](const Elf_Shdr *S) { return dumpRelrSection(S); }; + case ELF::SHT_GROUP: + return [this](const Elf_Shdr *S) { return dumpGroup(S); }; + case ELF::SHT_MIPS_ABIFLAGS: + return [this](const Elf_Shdr *S) { return dumpMipsABIFlags(S); }; + case ELF::SHT_NOBITS: + return [this](const Elf_Shdr *S) { return dumpNoBitsSection(S); }; + case ELF::SHT_NOTE: + return [this](const Elf_Shdr *S) { return dumpNoteSection(S); }; + case ELF::SHT_HASH: + return [this](const Elf_Shdr *S) { return dumpHashSection(S); }; + case ELF::SHT_GNU_HASH: + return [this](const Elf_Shdr *S) { return dumpGnuHashSection(S); }; + case ELF::SHT_GNU_verdef: + return [this](const Elf_Shdr *S) { return dumpVerdefSection(S); }; + case ELF::SHT_GNU_versym: + return [this](const Elf_Shdr *S) { return dumpSymverSection(S); }; + case ELF::SHT_GNU_verneed: + return [this](const Elf_Shdr *S) { return dumpVerneedSection(S); }; + case ELF::SHT_LLVM_ADDRSIG: + return [this](const Elf_Shdr *S) { return dumpAddrsigSection(S); }; + case ELF::SHT_LLVM_LINKER_OPTIONS: + return [this](const Elf_Shdr *S) { return dumpLinkerOptionsSection(S); }; + case ELF::SHT_LLVM_DEPENDENT_LIBRARIES: + return [this](const Elf_Shdr *S) { + return dumpDependentLibrariesSection(S); + }; + case ELF::SHT_LLVM_CALL_GRAPH_PROFILE: + return + [this](const Elf_Shdr *S) { return dumpCallGraphProfileSection(S); }; + default: + return nullptr; + } + }; for (const Elf_Shdr &Sec : Sections) { - switch (Sec.sh_type) { - case ELF::SHT_DYNAMIC: { - Expected<ELFYAML::DynamicSection *> SecOrErr = dumpDynamicSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; + // We have dedicated dumping functions for most of the section types. + // Try to use one of them first. + if (std::function<Expected<ELFYAML::Section *>(const Elf_Shdr *)> DumpFn = + GetDumper(Sec.sh_type)) { + if (Error E = Add(DumpFn(&Sec))) + return std::move(E); + continue; } - case ELF::SHT_STRTAB: - case ELF::SHT_SYMTAB: - case ELF::SHT_DYNSYM: { + + if (Sec.sh_type == ELF::SHT_STRTAB || Sec.sh_type == ELF::SHT_SYMTAB || + Sec.sh_type == ELF::SHT_DYNSYM) { // The contents of these sections are described by other parts of the YAML // file. We still dump them so that their positions in the section header // table are correctly recorded.
We only dump allocatable section because @@ -266,128 +316,13 @@ ELFDumper::dumpSections() { auto S = std::make_unique(); if (Error E = dumpCommonSection(&Sec, *S.get())) return std::move(E); - Ret.emplace_back(std::move(S)); + if (Error E = Add(S.release())) + return std::move(E); } - break; - } - case ELF::SHT_SYMTAB_SHNDX: { - Expected SecOrErr = - dumpSymtabShndxSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; - } - case ELF::SHT_REL: - case ELF::SHT_RELA: { - Expected SecOrErr = dumpRelocSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; - } - case ELF::SHT_RELR: { - Expected SecOrErr = dumpRelrSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; - } - case ELF::SHT_GROUP: { - Expected GroupOrErr = dumpGroup(&Sec); - if (!GroupOrErr) - return GroupOrErr.takeError(); - Ret.emplace_back(*GroupOrErr); - break; - } - case ELF::SHT_MIPS_ABIFLAGS: { - Expected SecOrErr = dumpMipsABIFlags(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; - } - case ELF::SHT_NOBITS: { - Expected SecOrErr = dumpNoBitsSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; - } - case ELF::SHT_NOTE: { - Expected SecOrErr = dumpNoteSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; - } - case ELF::SHT_HASH: { - Expected SecOrErr = dumpHashSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; - } - case ELF::SHT_GNU_HASH: { - Expected SecOrErr = dumpGnuHashSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; - } - case ELF::SHT_GNU_verdef: { - Expected SecOrErr = dumpVerdefSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; - } - case ELF::SHT_GNU_versym: { - Expected SecOrErr = dumpSymverSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; - } - case ELF::SHT_GNU_verneed: { - Expected SecOrErr = dumpVerneedSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; - } - case ELF::SHT_LLVM_ADDRSIG: { - Expected SecOrErr = dumpAddrsigSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; - } - case ELF::SHT_LLVM_LINKER_OPTIONS: { - Expected SecOrErr = - dumpLinkerOptionsSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; - } - case ELF::SHT_LLVM_DEPENDENT_LIBRARIES: { - Expected SecOrErr = - dumpDependentLibrariesSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; - } - case ELF::SHT_LLVM_CALL_GRAPH_PROFILE: { - Expected SecOrErr = - dumpCallGraphProfileSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - break; + continue; } - case ELF::SHT_NULL: { + + if (Sec.sh_type == ELF::SHT_NULL) { // We only dump the SHT_NULL section at index 0 when it // has at least one non-null field, because yaml2obj // normally creates the zero section at index 0 implicitly. 
@@ -395,30 +330,27 @@ ELFDumper::dumpSections() { const uint8_t *Begin = reinterpret_cast(&Sec); const uint8_t *End = Begin + sizeof(Elf_Shdr); if (std::find_if(Begin, End, [](uint8_t V) { return V != 0; }) == End) - break; + continue; } - LLVM_FALLTHROUGH; } - default: { - // Recognize some special SHT_PROGBITS sections by name. - if (Sec.sh_type == ELF::SHT_PROGBITS) { - Expected SpecialSecOrErr = dumpSpecialSection(&Sec); - if (!SpecialSecOrErr) - return SpecialSecOrErr.takeError(); - if (*SpecialSecOrErr) { - Ret.emplace_back(*SpecialSecOrErr); - break; - } - } - Expected SecOrErr = - dumpContentSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Ret.emplace_back(*SecOrErr); - } + // Recognize some special SHT_PROGBITS sections by name. + if (Sec.sh_type == ELF::SHT_PROGBITS) { + auto NameOrErr = getUniquedSectionName(&Sec); + if (!NameOrErr) + return NameOrErr.takeError(); + + if (ELFYAML::StackSizesSection::nameMatches(*NameOrErr)) { + if (Error E = Add(dumpStackSizesSection(&Sec))) + return std::move(E); + continue; + } } + + if (Error E = Add(dumpContentSection(&Sec))) + return std::move(E); } + return std::move(Ret); } @@ -521,6 +453,22 @@ Error ELFDumper::dumpRelocation(const RelT *Rel, const Elf_Shdr *SymTab, return Error::success(); } +template +static unsigned getDefaultShEntSize(ELFYAML::ELF_SHT SecType) { + switch (SecType) { + case ELF::SHT_REL: + return sizeof(typename ELFT::Rel); + case ELF::SHT_RELA: + return sizeof(typename ELFT::Rela); + case ELF::SHT_RELR: + return sizeof(typename ELFT::Relr); + case ELF::SHT_DYNAMIC: + return sizeof(typename ELFT::Dyn); + default: + return 0; + } +} + template Error ELFDumper::dumpCommonSection(const Elf_Shdr *Shdr, ELFYAML::Section &S) { @@ -532,7 +480,8 @@ Error ELFDumper::dumpCommonSection(const Elf_Shdr *Shdr, if (Shdr->sh_addr) S.Address = static_cast(Shdr->sh_addr); S.AddressAlign = Shdr->sh_addralign; - if (Shdr->sh_entsize) + + if (Shdr->sh_entsize != getDefaultShEntSize(S.Type)) S.EntSize = static_cast(Shdr->sh_entsize); auto NameOrErr = getUniquedSectionName(Shdr); @@ -557,24 +506,17 @@ Error ELFDumper::dumpCommonSection(const Elf_Shdr *Shdr, return Error::success(); } -template -Expected -ELFDumper::dumpSpecialSection(const Elf_Shdr *Shdr) { - auto NameOrErr = getUniquedSectionName(Shdr); - if (!NameOrErr) - return NameOrErr.takeError(); - - if (ELFYAML::StackSizesSection::nameMatches(*NameOrErr)) - return dumpStackSizesSection(Shdr); - return nullptr; -} - template Error ELFDumper::dumpCommonRelocationSection( const Elf_Shdr *Shdr, ELFYAML::RelocationSection &S) { if (Error E = dumpCommonSection(Shdr, S)) return E; + // Having a zero sh_info field is normal: .rela.dyn is a dynamic + // relocation section that normally has no value in this field. 
+ if (!Shdr->sh_info) + return Error::success(); + auto InfoSection = Obj.getSection(Shdr->sh_info); if (!InfoSection) return InfoSection.takeError(); diff --git a/llvm/tools/opt/CMakeLists.txt b/llvm/tools/opt/CMakeLists.txt index ad9e20bd0b439..c3368d93174ce 100644 --- a/llvm/tools/opt/CMakeLists.txt +++ b/llvm/tools/opt/CMakeLists.txt @@ -40,7 +40,7 @@ add_llvm_tool(opt intrinsics_gen SUPPORT_PLUGINS ) -export_executable_symbols(opt) +export_executable_symbols_for_plugins(opt) if(LLVM_BUILD_EXAMPLES) target_link_libraries(opt PRIVATE ExampleIRTransforms) diff --git a/llvm/unittests/ADT/CoalescingBitVectorTest.cpp b/llvm/unittests/ADT/CoalescingBitVectorTest.cpp index 02a5bde2bfacd..4f87bf415bebb 100644 --- a/llvm/unittests/ADT/CoalescingBitVectorTest.cpp +++ b/llvm/unittests/ADT/CoalescingBitVectorTest.cpp @@ -77,17 +77,6 @@ TEST(CoalescingBitVector, Copy) { EXPECT_TRUE(elementsMatch(BV2, {0})); } -TEST(CoalescingBitVector, Move) { - UBitVec::Allocator Alloc; - UBitVec BV1(Alloc); - BV1.set(0); - UBitVec BV2 = std::move(BV1); - EXPECT_TRUE(elementsMatch(BV2, {0})); - BV2.set(5); - BV1 = std::move(BV2); - EXPECT_TRUE(elementsMatch(BV1, {0, 5})); -} - TEST(CoalescingBitVectorTest, Iterators) { UBitVec::Allocator Alloc; UBitVec BV(Alloc); @@ -194,14 +183,12 @@ TEST(CoalescingBitVectorTest, Comparison) { // A simple implementation of set union, used to double-check the human // "expected" answer. -UBitVec simpleUnion(UBitVec::Allocator &Alloc, const UBitVec &LHS, +void simpleUnion(UBitVec &Union, const UBitVec &LHS, const UBitVec &RHS) { - UBitVec Union(Alloc); for (unsigned Bit : LHS) Union.test_and_set(Bit); for (unsigned Bit : RHS) Union.test_and_set(Bit); - return Union; } TEST(CoalescingBitVectorTest, Union) { @@ -215,7 +202,8 @@ TEST(CoalescingBitVectorTest, Union) { BV1.set(LHS); UBitVec BV2(Alloc); BV2.set(RHS); - const UBitVec &DoubleCheckedExpected = simpleUnion(Alloc, BV1, BV2); + UBitVec DoubleCheckedExpected(Alloc); + simpleUnion(DoubleCheckedExpected, BV1, BV2); ASSERT_TRUE(elementsMatch(DoubleCheckedExpected, Expected)); BV1 |= BV2; ASSERT_TRUE(elementsMatch(BV1, Expected)); @@ -288,13 +276,11 @@ TEST(CoalescingBitVectorTest, Union) { // A simple implementation of set intersection, used to double-check the // human "expected" answer. -UBitVec simpleIntersection(UBitVec::Allocator &Alloc, const UBitVec &LHS, - const UBitVec &RHS) { - UBitVec Intersection(Alloc); +void simpleIntersection(UBitVec &Intersection, const UBitVec &LHS, + const UBitVec &RHS) { for (unsigned Bit : LHS) if (RHS.test(Bit)) Intersection.set(Bit); - return Intersection; } TEST(CoalescingBitVectorTest, Intersection) { @@ -308,7 +294,8 @@ TEST(CoalescingBitVectorTest, Intersection) { BV1.set(LHS); UBitVec BV2(Alloc); BV2.set(RHS); - const UBitVec &DoubleCheckedExpected = simpleIntersection(Alloc, BV1, BV2); + UBitVec DoubleCheckedExpected(Alloc); + simpleIntersection(DoubleCheckedExpected, BV1, BV2); ASSERT_TRUE(elementsMatch(DoubleCheckedExpected, Expected)); BV1 &= BV2; ASSERT_TRUE(elementsMatch(BV1, Expected)); @@ -367,14 +354,11 @@ TEST(CoalescingBitVectorTest, Intersection) { // A simple implementation of set intersection-with-complement, used to // double-check the human "expected" answer. 
-UBitVec simpleIntersectionWithComplement(UBitVec::Allocator &Alloc, - const UBitVec &LHS, - const UBitVec &RHS) { - UBitVec Intersection(Alloc); +void simpleIntersectionWithComplement(UBitVec &Intersection, const UBitVec &LHS, + const UBitVec &RHS) { for (unsigned Bit : LHS) if (!RHS.test(Bit)) Intersection.set(Bit); - return Intersection; } TEST(CoalescingBitVectorTest, IntersectWithComplement) { @@ -389,8 +373,8 @@ TEST(CoalescingBitVectorTest, IntersectWithComplement) { BV1.set(LHS); UBitVec BV2(Alloc); BV2.set(RHS); - const UBitVec &DoubleCheckedExpected = - simpleIntersectionWithComplement(Alloc, BV1, BV2); + UBitVec DoubleCheckedExpected(Alloc); + simpleIntersectionWithComplement(DoubleCheckedExpected, BV1, BV2); ASSERT_TRUE(elementsMatch(DoubleCheckedExpected, Expected)); BV1.intersectWithComplement(BV2); ASSERT_TRUE(elementsMatch(BV1, Expected)); @@ -464,6 +448,44 @@ TEST(CoalescingBitVectorTest, FindLowerBound) { EXPECT_EQ(*BV.find(3), 3u); } +TEST(CoalescingBitVectorTest, AdvanceToLowerBound) { + U64BitVec::Allocator Alloc; + U64BitVec BV(Alloc); + uint64_t BigNum1 = uint64_t(1) << 32; + uint64_t BigNum2 = (uint64_t(1) << 33) + 1; + + auto advFromBegin = [&](uint64_t To) -> U64BitVec::const_iterator { + auto It = BV.begin(); + It.advanceToLowerBound(To); + return It; + }; + + EXPECT_TRUE(advFromBegin(BigNum1) == BV.end()); + BV.set(BigNum1); + auto Find1 = advFromBegin(BigNum1); + EXPECT_EQ(*Find1, BigNum1); + BV.set(BigNum2); + auto Find2 = advFromBegin(BigNum1); + EXPECT_EQ(*Find2, BigNum1); + auto Find3 = advFromBegin(BigNum2); + EXPECT_EQ(*Find3, BigNum2); + BV.reset(BigNum1); + auto Find4 = advFromBegin(BigNum1); + EXPECT_EQ(*Find4, BigNum2); + + BV.clear(); + BV.set({1, 2, 3}); + EXPECT_EQ(*advFromBegin(2), 2u); + EXPECT_EQ(*advFromBegin(3), 3u); + auto It = BV.begin(); + It.advanceToLowerBound(0); + EXPECT_EQ(*It, 1u); + It.advanceToLowerBound(100); + EXPECT_TRUE(It == BV.end()); + It.advanceToLowerBound(100); + EXPECT_TRUE(It == BV.end()); +} + TEST(CoalescingBitVectorTest, Print) { std::string S; { diff --git a/llvm/unittests/Analysis/VectorUtilsTest.cpp b/llvm/unittests/Analysis/VectorUtilsTest.cpp index df744ac716571..d471e79842ca7 100644 --- a/llvm/unittests/Analysis/VectorUtilsTest.cpp +++ b/llvm/unittests/Analysis/VectorUtilsTest.cpp @@ -98,6 +98,14 @@ TEST_F(BasicTest, isSplat) { EXPECT_FALSE(isSplatValue(SplatWithUndefC)); } +TEST_F(BasicTest, scaleShuffleMask) { + SmallVector ScaledMask; + scaleShuffleMask(1, {3,2,0,-2}, ScaledMask); + EXPECT_EQ(makeArrayRef(ScaledMask), makeArrayRef({3,2,0,-2})); + scaleShuffleMask(4, {3,2,0,-1}, ScaledMask); + EXPECT_EQ(makeArrayRef(ScaledMask), makeArrayRef({12,13,14,15,8,9,10,11,0,1,2,3,-1,-1,-1,-1})); +} + TEST_F(BasicTest, getSplatIndex) { EXPECT_EQ(getSplatIndex({0,0,0}), 0); EXPECT_EQ(getSplatIndex({1,0,0}), -1); // no splat diff --git a/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp b/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp index 999220fd250d8..1c86d5ff9943f 100644 --- a/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/CSETest.cpp @@ -11,7 +11,7 @@ namespace { -TEST_F(GISelMITest, TestCSE) { +TEST_F(AArch64GISelMITest, TestCSE) { setUp(); if (!TM) return; @@ -79,7 +79,7 @@ TEST_F(GISelMITest, TestCSE) { EXPECT_EQ(&*Undef0, &*Undef1); } -TEST_F(GISelMITest, TestCSEConstantConfig) { +TEST_F(AArch64GISelMITest, TestCSEConstantConfig) { setUp(); if (!TM) return; diff --git a/llvm/unittests/CodeGen/GlobalISel/ConstantFoldingTest.cpp 
b/llvm/unittests/CodeGen/GlobalISel/ConstantFoldingTest.cpp index 2f99005105fd1..127aaffc28785 100644 --- a/llvm/unittests/CodeGen/GlobalISel/ConstantFoldingTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/ConstantFoldingTest.cpp @@ -17,7 +17,7 @@ using namespace llvm; namespace { -TEST_F(GISelMITest, FoldWithBuilder) { +TEST_F(AArch64GISelMITest, FoldWithBuilder) { setUp(); if (!TM) return; @@ -68,7 +68,7 @@ TEST_F(GISelMITest, FoldWithBuilder) { EXPECT_EQ(-0x80, Cst); } -TEST_F(GISelMITest, FoldBinOp) { +TEST_F(AArch64GISelMITest, FoldBinOp) { setUp(); if (!TM) return; diff --git a/llvm/unittests/CodeGen/GlobalISel/GISelMITest.cpp b/llvm/unittests/CodeGen/GlobalISel/GISelMITest.cpp index 0558c4121fdd5..ef50a0f281f00 100644 --- a/llvm/unittests/CodeGen/GlobalISel/GISelMITest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/GISelMITest.cpp @@ -28,3 +28,79 @@ operator<<(std::ostream &OS, const MachineFunction &MF) { } } + +std::unique_ptr +AArch64GISelMITest::createTargetMachine() const { + Triple TargetTriple("aarch64--"); + std::string Error; + const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error); + if (!T) + return nullptr; + + TargetOptions Options; + return std::unique_ptr( + static_cast(T->createTargetMachine( + "AArch64", "", "", Options, None, None, CodeGenOpt::Aggressive))); +} + +void AArch64GISelMITest::getTargetTestModuleString(SmallString<512> &S, + StringRef MIRFunc) const { + (Twine(R"MIR( +--- +... +name: func +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.1: + liveins: $x0, $x1, $x2, $x4 + + %0(s64) = COPY $x0 + %1(s64) = COPY $x1 + %2(s64) = COPY $x2 +)MIR") + + Twine(MIRFunc) + Twine("...\n")) + .toNullTerminatedStringRef(S); +} + +std::unique_ptr +AMDGPUGISelMITest::createTargetMachine() const { + Triple TargetTriple("amdgcn-amd-amdhsa"); + std::string Error; + const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error); + if (!T) + return nullptr; + + TargetOptions Options; + return std::unique_ptr( + static_cast(T->createTargetMachine( + "amdgcn-amd-amdhsa", "gfx900", "", Options, None, None, + CodeGenOpt::Aggressive))); +} + +void AMDGPUGISelMITest::getTargetTestModuleString( + SmallString<512> &S, StringRef MIRFunc) const { + (Twine(R"MIR( +--- +... +name: func +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +body: | + bb.1: + liveins: $vgpr0, $vgpr1, $vgpr2 + + %0(s32) = COPY $vgpr0 + %1(s32) = COPY $vgpr1 + %2(s32) = COPY $vgpr2 +)MIR") + Twine(MIRFunc) + Twine("...\n")) + .toNullTerminatedStringRef(S); +} diff --git a/llvm/unittests/CodeGen/GlobalISel/GISelMITest.h b/llvm/unittests/CodeGen/GlobalISel/GISelMITest.h index 4254f4f759e4a..db3f2b9531c9f 100644 --- a/llvm/unittests/CodeGen/GlobalISel/GISelMITest.h +++ b/llvm/unittests/CodeGen/GlobalISel/GISelMITest.h @@ -53,21 +53,6 @@ std::ostream & operator<<(std::ostream &OS, const MachineFunction &MF); } -/// Create a TargetMachine. As we lack a dedicated always available target for -/// unittests, we go for "AArch64". 
-static std::unique_ptr<LLVMTargetMachine> createTargetMachine() { - Triple TargetTriple("aarch64--"); - std::string Error; - const Target *T = TargetRegistry::lookupTarget("", TargetTriple, Error); - if (!T) - return nullptr; - - TargetOptions Options; - return std::unique_ptr<LLVMTargetMachine>( - static_cast<LLVMTargetMachine *>(T->createTargetMachine( - "AArch64", "", "", Options, None, None, CodeGenOpt::Aggressive))); -} - static std::unique_ptr<Module> parseMIR(LLVMContext &Context, std::unique_ptr<MIRParser> &MIR, const TargetMachine &TM, @@ -90,34 +75,13 @@ static std::unique_ptr<Module> parseMIR(LLVMContext &Context, return M; } - static std::pair<std::unique_ptr<Module>, std::unique_ptr<MachineModuleInfo>> createDummyModule(LLVMContext &Context, const LLVMTargetMachine &TM, - StringRef MIRFunc) { - SmallString<512> S; - StringRef MIRString = (Twine(R"MIR( ---- -... -name: func -tracksRegLiveness: true -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } - - { id: 3, class: _ } -body: | - bb.1: - liveins: $x0, $x1, $x2, $x4 - - %0(s64) = COPY $x0 - %1(s64) = COPY $x1 - %2(s64) = COPY $x2 -)MIR") + Twine(MIRFunc) + Twine("...\n")) - .toNullTerminatedStringRef(S); + StringRef MIRString, const char *FuncName) { std::unique_ptr<MIRParser> MIR; auto MMI = std::make_unique<MachineModuleInfo>(&TM); std::unique_ptr<Module> M = - parseMIR(Context, MIR, TM, MIRString, "func", *MMI); + parseMIR(Context, MIR, TM, MIRString, FuncName, *MMI); return make_pair(std::move(M), std::move(MMI)); } @@ -140,11 +104,23 @@ static void collectCopies(SmallVectorImpl<Register> &Copies, class GISelMITest : public ::testing::Test { protected: GISelMITest() : ::testing::Test() {} + + /// Prepare a target specific LLVMTargetMachine. + virtual std::unique_ptr<LLVMTargetMachine> createTargetMachine() const = 0; + + /// Get the stub sample MIR test function. + virtual void getTargetTestModuleString(SmallString<512> &S, + StringRef MIRFunc) const = 0; + void setUp(StringRef ExtraAssembly = "") { TM = createTargetMachine(); if (!TM) return; - ModuleMMIPair = createDummyModule(Context, *TM, ExtraAssembly); + + SmallString<512> MIRString; + getTargetTestModuleString(MIRString, ExtraAssembly); + + ModuleMMIPair = createDummyModule(Context, *TM, MIRString, "func"); MF = getMFFromMMI(ModuleMMIPair.first.get(), ModuleMMIPair.second.get()); collectCopies(Copies, MF); EntryMBB = &*MF->begin(); @@ -152,6 +128,7 @@ class GISelMITest : public ::testing::Test { MRI = &MF->getRegInfo(); B.setInsertPt(*EntryMBB, EntryMBB->end()); } + LLVMContext Context; std::unique_ptr<LLVMTargetMachine> TM; MachineFunction *MF; @@ -163,6 +140,18 @@ class GISelMITest : public ::testing::Test { MachineRegisterInfo *MRI; }; +class AArch64GISelMITest : public GISelMITest { + std::unique_ptr<LLVMTargetMachine> createTargetMachine() const override; + void getTargetTestModuleString(SmallString<512> &S, + StringRef MIRFunc) const override; +}; + +class AMDGPUGISelMITest : public GISelMITest { + std::unique_ptr<LLVMTargetMachine> createTargetMachine() const override; + void getTargetTestModuleString(SmallString<512> &S, + StringRef MIRFunc) const override; +}; + #define DefineLegalizerInfo(Name, SettingUpActionsBlock) \ class Name##Info : public LegalizerInfo { \ public: \ diff --git a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp index a5372511b051e..fe0c14270ecf2 100644 --- a/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/KnownBitsTest.cpp @@ -10,7 +10,7 @@ #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -TEST_F(GISelMITest, TestKnownBitsCst) { +TEST_F(AArch64GISelMITest, TestKnownBitsCst) { StringRef
MIRString = " %3:_(s8) = G_CONSTANT i8 1\n" " %4:_(s8) = COPY %3\n"; setUp(MIRString); @@ -30,7 +30,7 @@ TEST_F(GISelMITest, TestKnownBitsCst) { EXPECT_EQ(Res.Zero.getZExtValue(), Res2.Zero.getZExtValue()); } -TEST_F(GISelMITest, TestKnownBitsCstWithClass) { +TEST_F(AArch64GISelMITest, TestKnownBitsCstWithClass) { StringRef MIRString = " %10:gpr32 = MOVi32imm 1\n" " %4:_(s32) = COPY %10\n"; setUp(MIRString); @@ -58,7 +58,7 @@ TEST_F(GISelMITest, TestKnownBitsCstWithClass) { // Check that we are able to track bits through PHIs // and get the intersections of everything we know on each operand. -TEST_F(GISelMITest, TestKnownBitsCstPHI) { +TEST_F(AArch64GISelMITest, TestKnownBitsCstPHI) { StringRef MIRString = " bb.10:\n" " %10:_(s8) = G_CONSTANT i8 3\n" " %11:_(s1) = G_IMPLICIT_DEF\n" @@ -92,7 +92,7 @@ TEST_F(GISelMITest, TestKnownBitsCstPHI) { // Check that we report we know nothing when we hit a // non-generic register. // Note: this could be improved though! -TEST_F(GISelMITest, TestKnownBitsCstPHIToNonGenericReg) { +TEST_F(AArch64GISelMITest, TestKnownBitsCstPHIToNonGenericReg) { StringRef MIRString = " bb.10:\n" " %10:gpr32 = MOVi32imm 3\n" " %11:_(s1) = G_IMPLICIT_DEF\n" @@ -129,7 +129,7 @@ TEST_F(GISelMITest, TestKnownBitsCstPHIToNonGenericReg) { // here to cover the code that stops the analysis of PHIs // earlier. In that case, we would not even look at the // second incoming value. -TEST_F(GISelMITest, TestKnownBitsUnknownPHI) { +TEST_F(AArch64GISelMITest, TestKnownBitsUnknownPHI) { StringRef MIRString = " bb.10:\n" " %10:_(s64) = COPY %0\n" @@ -165,7 +165,7 @@ TEST_F(GISelMITest, TestKnownBitsUnknownPHI) { // For now, the analysis just stops and assumes it knows nothing, // eventually we could teach it how to properly track phis that // loop back. -TEST_F(GISelMITest, TestKnownBitsCstPHIWithLoop) { +TEST_F(AArch64GISelMITest, TestKnownBitsCstPHIWithLoop) { StringRef MIRString = " bb.10:\n" " %10:_(s8) = G_CONSTANT i8 3\n" @@ -210,7 +210,7 @@ TEST_F(GISelMITest, TestKnownBitsCstPHIWithLoop) { // on PHIs, but eventually we could teach it how to properly track // phis that loop back without relying on the luck effect of max // depth. 
-TEST_F(GISelMITest, TestKnownBitsDecreasingCstPHIWithLoop) { +TEST_F(AArch64GISelMITest, TestKnownBitsDecreasingCstPHIWithLoop) { StringRef MIRString = " bb.10:\n" " %10:_(s8) = G_CONSTANT i8 5\n" " %11:_(s8) = G_CONSTANT i8 1\n" @@ -243,7 +243,7 @@ TEST_F(GISelMITest, TestKnownBitsDecreasingCstPHIWithLoop) { EXPECT_EQ(Res.Zero.getZExtValue(), Res2.Zero.getZExtValue()); } -TEST_F(GISelMITest, TestKnownBitsPtrToIntViceVersa) { +TEST_F(AArch64GISelMITest, TestKnownBitsPtrToIntViceVersa) { StringRef MIRString = " %3:_(s16) = G_CONSTANT i16 256\n" " %4:_(p0) = G_INTTOPTR %3\n" " %5:_(s32) = G_PTRTOINT %4\n" @@ -259,7 +259,7 @@ TEST_F(GISelMITest, TestKnownBitsPtrToIntViceVersa) { EXPECT_EQ(256u, Res.One.getZExtValue()); EXPECT_EQ(0xfffffeffu, Res.Zero.getZExtValue()); } -TEST_F(GISelMITest, TestKnownBitsXOR) { +TEST_F(AArch64GISelMITest, TestKnownBitsXOR) { StringRef MIRString = " %3:_(s8) = G_CONSTANT i8 4\n" " %4:_(s8) = G_CONSTANT i8 7\n" " %5:_(s8) = G_XOR %3, %4\n" @@ -276,7 +276,7 @@ TEST_F(GISelMITest, TestKnownBitsXOR) { EXPECT_EQ(252u, Res.Zero.getZExtValue()); } -TEST_F(GISelMITest, TestKnownBits) { +TEST_F(AArch64GISelMITest, TestKnownBits) { StringRef MIR = " %3:_(s32) = G_TRUNC %0\n" " %4:_(s32) = G_TRUNC %1\n" @@ -306,7 +306,7 @@ TEST_F(GISelMITest, TestKnownBits) { EXPECT_EQ(Known.Zero, Zeroes); } -TEST_F(GISelMITest, TestSignBitIsZero) { +TEST_F(AArch64GISelMITest, TestSignBitIsZero) { setUp(); if (!TM) return; @@ -321,7 +321,7 @@ TEST_F(GISelMITest, TestSignBitIsZero) { EXPECT_FALSE(KnownBits.signBitIsZero(SignBit.getReg(0))); } -TEST_F(GISelMITest, TestNumSignBitsConstant) { +TEST_F(AArch64GISelMITest, TestNumSignBitsConstant) { StringRef MIRString = " %3:_(s8) = G_CONSTANT i8 1\n" " %4:_(s8) = COPY %3\n" @@ -353,7 +353,7 @@ TEST_F(GISelMITest, TestNumSignBitsConstant) { EXPECT_EQ(3u, Info.computeNumSignBits(CopyRegNeg32)); } -TEST_F(GISelMITest, TestNumSignBitsSext) { +TEST_F(AArch64GISelMITest, TestNumSignBitsSext) { StringRef MIRString = " %3:_(p0) = G_IMPLICIT_DEF\n" " %4:_(s8) = G_LOAD %3 :: (load 1)\n" " %5:_(s32) = G_SEXT %4\n" @@ -373,7 +373,7 @@ TEST_F(GISelMITest, TestNumSignBitsSext) { EXPECT_EQ(32u, Info.computeNumSignBits(CopySextNeg1)); } -TEST_F(GISelMITest, TestNumSignBitsTrunc) { +TEST_F(AArch64GISelMITest, TestNumSignBitsTrunc) { StringRef MIRString = " %3:_(p0) = G_IMPLICIT_DEF\n" " %4:_(s32) = G_LOAD %3 :: (load 4)\n" " %5:_(s8) = G_TRUNC %4\n" @@ -398,3 +398,36 @@ TEST_F(GISelMITest, TestNumSignBitsTrunc) { EXPECT_EQ(8u, Info.computeNumSignBits(CopyTruncNeg1)); EXPECT_EQ(5u, Info.computeNumSignBits(CopyTrunc7)); } + +TEST_F(AMDGPUGISelMITest, TestNumSignBitsTrunc) { + StringRef MIRString = + " %3:_(<4 x s32>) = G_IMPLICIT_DEF\n" + " %4:_(s32) = G_IMPLICIT_DEF\n" + " %5:_(s32) = G_AMDGPU_BUFFER_LOAD_UBYTE %3, %4, %4, %4, 0, 0, 0 :: (load 1)\n" + " %6:_(s32) = COPY %5\n" + + " %7:_(s32) = G_AMDGPU_BUFFER_LOAD_SBYTE %3, %4, %4, %4, 0, 0, 0 :: (load 1)\n" + " %8:_(s32) = COPY %7\n" + + " %9:_(s32) = G_AMDGPU_BUFFER_LOAD_USHORT %3, %4, %4, %4, 0, 0, 0 :: (load 2)\n" + " %10:_(s32) = COPY %9\n" + + " %11:_(s32) = G_AMDGPU_BUFFER_LOAD_SSHORT %3, %4, %4, %4, 0, 0, 0 :: (load 2)\n" + " %12:_(s32) = COPY %11\n"; + + setUp(MIRString); + if (!TM) + return; + + Register CopyLoadUByte = Copies[Copies.size() - 4]; + Register CopyLoadSByte = Copies[Copies.size() - 3]; + Register CopyLoadUShort = Copies[Copies.size() - 2]; + Register CopyLoadSShort = Copies[Copies.size() - 1]; + + GISelKnownBits Info(*MF); + + EXPECT_EQ(24u, Info.computeNumSignBits(CopyLoadUByte)); + 
EXPECT_EQ(25u, Info.computeNumSignBits(CopyLoadSByte)); + EXPECT_EQ(16u, Info.computeNumSignBits(CopyLoadUShort)); + EXPECT_EQ(17u, Info.computeNumSignBits(CopyLoadSShort)); +} diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp index 9407a9437bcc9..244cb75c3e297 100644 --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp @@ -25,7 +25,7 @@ class DummyGISelObserver : public GISelChangeObserver { // Test CTTZ expansion when CTTZ_ZERO_UNDEF is legal or custom, // in which case it becomes CTTZ_ZERO_UNDEF with select. -TEST_F(GISelMITest, LowerBitCountingCTTZ0) { +TEST_F(AArch64GISelMITest, LowerBitCountingCTTZ0) { setUp(); if (!TM) return; @@ -57,7 +57,7 @@ TEST_F(GISelMITest, LowerBitCountingCTTZ0) { } // CTTZ expansion in terms of CTLZ -TEST_F(GISelMITest, LowerBitCountingCTTZ1) { +TEST_F(AArch64GISelMITest, LowerBitCountingCTTZ1) { setUp(); if (!TM) return; @@ -91,7 +91,7 @@ TEST_F(GISelMITest, LowerBitCountingCTTZ1) { } // CTLZ scalar narrowing -TEST_F(GISelMITest, NarrowScalarCTLZ) { +TEST_F(AArch64GISelMITest, NarrowScalarCTLZ) { setUp(); if (!TM) return; @@ -126,7 +126,7 @@ TEST_F(GISelMITest, NarrowScalarCTLZ) { } // CTTZ scalar narrowing -TEST_F(GISelMITest, NarrowScalarCTTZ) { +TEST_F(AArch64GISelMITest, NarrowScalarCTTZ) { setUp(); if (!TM) return; @@ -161,7 +161,7 @@ TEST_F(GISelMITest, NarrowScalarCTTZ) { } // CTTZ expansion in terms of CTPOP -TEST_F(GISelMITest, LowerBitCountingCTTZ2) { +TEST_F(AArch64GISelMITest, LowerBitCountingCTTZ2) { setUp(); if (!TM) return; @@ -192,7 +192,7 @@ TEST_F(GISelMITest, LowerBitCountingCTTZ2) { } // CTPOP widening. -TEST_F(GISelMITest, WidenBitCountingCTPOP1) { +TEST_F(AArch64GISelMITest, WidenBitCountingCTPOP1) { if (!TM) return; @@ -224,7 +224,7 @@ TEST_F(GISelMITest, WidenBitCountingCTPOP1) { } // Test a strange case where the result is wider than the source -TEST_F(GISelMITest, WidenBitCountingCTPOP2) { +TEST_F(AArch64GISelMITest, WidenBitCountingCTPOP2) { if (!TM) return; @@ -257,7 +257,7 @@ TEST_F(GISelMITest, WidenBitCountingCTPOP2) { } // CTTZ_ZERO_UNDEF expansion in terms of CTTZ -TEST_F(GISelMITest, LowerBitCountingCTTZ3) { +TEST_F(AArch64GISelMITest, LowerBitCountingCTTZ3) { setUp(); if (!TM) return; @@ -284,7 +284,7 @@ TEST_F(GISelMITest, LowerBitCountingCTTZ3) { } // CTLZ expansion in terms of CTLZ_ZERO_UNDEF -TEST_F(GISelMITest, LowerBitCountingCTLZ0) { +TEST_F(AArch64GISelMITest, LowerBitCountingCTLZ0) { setUp(); if (!TM) return; @@ -315,7 +315,7 @@ TEST_F(GISelMITest, LowerBitCountingCTLZ0) { } // CTLZ expansion in terms of CTLZ_ZERO_UNDEF if the latter is a libcall -TEST_F(GISelMITest, LowerBitCountingCTLZLibcall) { +TEST_F(AArch64GISelMITest, LowerBitCountingCTLZLibcall) { setUp(); if (!TM) return; @@ -346,7 +346,7 @@ TEST_F(GISelMITest, LowerBitCountingCTLZLibcall) { } // CTLZ expansion -TEST_F(GISelMITest, LowerBitCountingCTLZ1) { +TEST_F(AArch64GISelMITest, LowerBitCountingCTLZ1) { setUp(); if (!TM) return; @@ -387,7 +387,7 @@ TEST_F(GISelMITest, LowerBitCountingCTLZ1) { } // CTLZ widening. -TEST_F(GISelMITest, WidenBitCountingCTLZ) { +TEST_F(AArch64GISelMITest, WidenBitCountingCTLZ) { setUp(); if (!TM) return; @@ -422,7 +422,7 @@ TEST_F(GISelMITest, WidenBitCountingCTLZ) { } // CTLZ_ZERO_UNDEF widening. 
-TEST_F(GISelMITest, WidenBitCountingCTLZZeroUndef) { +TEST_F(AArch64GISelMITest, WidenBitCountingCTLZZeroUndef) { setUp(); if (!TM) return; @@ -458,7 +458,7 @@ TEST_F(GISelMITest, WidenBitCountingCTLZZeroUndef) { } // CTPOP widening. -TEST_F(GISelMITest, WidenBitCountingCTPOP) { +TEST_F(AArch64GISelMITest, WidenBitCountingCTPOP) { setUp(); if (!TM) return; @@ -491,7 +491,7 @@ TEST_F(GISelMITest, WidenBitCountingCTPOP) { } // CTTZ_ZERO_UNDEF widening. -TEST_F(GISelMITest, WidenBitCountingCTTZ_ZERO_UNDEF) { +TEST_F(AArch64GISelMITest, WidenBitCountingCTTZ_ZERO_UNDEF) { setUp(); if (!TM) return; @@ -525,7 +525,7 @@ TEST_F(GISelMITest, WidenBitCountingCTTZ_ZERO_UNDEF) { } // CTTZ widening. -TEST_F(GISelMITest, WidenBitCountingCTTZ) { +TEST_F(AArch64GISelMITest, WidenBitCountingCTTZ) { setUp(); if (!TM) return; @@ -559,7 +559,7 @@ TEST_F(GISelMITest, WidenBitCountingCTTZ) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } // UADDO widening. -TEST_F(GISelMITest, WidenUADDO) { +TEST_F(AArch64GISelMITest, WidenUADDO) { setUp(); if (!TM) return; @@ -598,7 +598,7 @@ TEST_F(GISelMITest, WidenUADDO) { } // USUBO widening. -TEST_F(GISelMITest, WidenUSUBO) { +TEST_F(AArch64GISelMITest, WidenUSUBO) { setUp(); if (!TM) return; @@ -636,7 +636,7 @@ TEST_F(GISelMITest, WidenUSUBO) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, FewerElementsAnd) { +TEST_F(AArch64GISelMITest, FewerElementsAnd) { if (!TM) return; @@ -683,7 +683,7 @@ TEST_F(GISelMITest, FewerElementsAnd) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, MoreElementsAnd) { +TEST_F(AArch64GISelMITest, MoreElementsAnd) { if (!TM) return; @@ -724,7 +724,7 @@ TEST_F(GISelMITest, MoreElementsAnd) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, FewerElementsPhi) { +TEST_F(AArch64GISelMITest, FewerElementsPhi) { if (!TM) return; @@ -819,7 +819,7 @@ TEST_F(GISelMITest, FewerElementsPhi) { } // FNEG expansion in terms of FSUB -TEST_F(GISelMITest, LowerFNEG) { +TEST_F(AArch64GISelMITest, LowerFNEG) { if (!TM) return; @@ -864,7 +864,7 @@ TEST_F(GISelMITest, LowerFNEG) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LowerMinMax) { +TEST_F(AArch64GISelMITest, LowerMinMax) { if (!TM) return; @@ -942,7 +942,7 @@ TEST_F(GISelMITest, LowerMinMax) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, WidenScalarBuildVector) { +TEST_F(AArch64GISelMITest, WidenScalarBuildVector) { if (!TM) return; @@ -988,7 +988,7 @@ TEST_F(GISelMITest, WidenScalarBuildVector) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LowerMergeValues) { +TEST_F(AArch64GISelMITest, LowerMergeValues) { if (!TM) return; @@ -1089,7 +1089,7 @@ TEST_F(GISelMITest, LowerMergeValues) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, WidenScalarMergeValuesPointer) { +TEST_F(AArch64GISelMITest, WidenScalarMergeValuesPointer) { if (!TM) return; @@ -1126,7 +1126,7 @@ TEST_F(GISelMITest, WidenScalarMergeValuesPointer) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, WidenSEXTINREG) { +TEST_F(AArch64GISelMITest, WidenSEXTINREG) { if (!TM) return; @@ -1157,7 +1157,7 @@ TEST_F(GISelMITest, WidenSEXTINREG) { ASSERT_TRUE(CheckMachineFunction(*MF, CheckStr)); } -TEST_F(GISelMITest, NarrowSEXTINREG) { +TEST_F(AArch64GISelMITest, NarrowSEXTINREG) { if (!TM) return; @@ -1188,7 +1188,7 @@ TEST_F(GISelMITest, NarrowSEXTINREG) { 
ASSERT_TRUE(CheckMachineFunction(*MF, CheckStr)); } -TEST_F(GISelMITest, NarrowSEXTINREG2) { +TEST_F(AArch64GISelMITest, NarrowSEXTINREG2) { if (!TM) return; @@ -1220,7 +1220,7 @@ TEST_F(GISelMITest, NarrowSEXTINREG2) { ASSERT_TRUE(CheckMachineFunction(*MF, CheckStr)); } -TEST_F(GISelMITest, LowerSEXTINREG) { +TEST_F(AArch64GISelMITest, LowerSEXTINREG) { if (!TM) return; @@ -1250,7 +1250,7 @@ TEST_F(GISelMITest, LowerSEXTINREG) { ASSERT_TRUE(CheckMachineFunction(*MF, CheckStr)); } -TEST_F(GISelMITest, LibcallFPExt) { +TEST_F(AArch64GISelMITest, LibcallFPExt) { setUp(); if (!TM) return; @@ -1289,7 +1289,7 @@ TEST_F(GISelMITest, LibcallFPExt) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFPTrunc) { +TEST_F(AArch64GISelMITest, LibcallFPTrunc) { setUp(); if (!TM) return; @@ -1331,7 +1331,7 @@ TEST_F(GISelMITest, LibcallFPTrunc) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallSimple) { +TEST_F(AArch64GISelMITest, LibcallSimple) { setUp(); if (!TM) return; @@ -1354,7 +1354,7 @@ TEST_F(GISelMITest, LibcallSimple) { Helper.libcall(*MIBFADD)); } -TEST_F(GISelMITest, LibcallSRem) { +TEST_F(AArch64GISelMITest, LibcallSRem) { setUp(); if (!TM) return; @@ -1411,7 +1411,7 @@ TEST_F(GISelMITest, LibcallSRem) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallURem) { +TEST_F(AArch64GISelMITest, LibcallURem) { setUp(); if (!TM) return; @@ -1468,7 +1468,7 @@ TEST_F(GISelMITest, LibcallURem) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallCtlzZeroUndef) { +TEST_F(AArch64GISelMITest, LibcallCtlzZeroUndef) { setUp(); if (!TM) return; @@ -1521,7 +1521,7 @@ TEST_F(GISelMITest, LibcallCtlzZeroUndef) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFAdd) { +TEST_F(AArch64GISelMITest, LibcallFAdd) { setUp(); if (!TM) return; @@ -1573,7 +1573,7 @@ TEST_F(GISelMITest, LibcallFAdd) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFSub) { +TEST_F(AArch64GISelMITest, LibcallFSub) { setUp(); if (!TM) return; @@ -1625,7 +1625,7 @@ TEST_F(GISelMITest, LibcallFSub) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFMul) { +TEST_F(AArch64GISelMITest, LibcallFMul) { setUp(); if (!TM) return; @@ -1677,7 +1677,7 @@ TEST_F(GISelMITest, LibcallFMul) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFDiv) { +TEST_F(AArch64GISelMITest, LibcallFDiv) { setUp(); if (!TM) return; @@ -1729,7 +1729,7 @@ TEST_F(GISelMITest, LibcallFDiv) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFExp) { +TEST_F(AArch64GISelMITest, LibcallFExp) { setUp(); if (!TM) return; @@ -1776,7 +1776,7 @@ TEST_F(GISelMITest, LibcallFExp) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFExp2) { +TEST_F(AArch64GISelMITest, LibcallFExp2) { setUp(); if (!TM) return; @@ -1823,7 +1823,7 @@ TEST_F(GISelMITest, LibcallFExp2) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFRem) { +TEST_F(AArch64GISelMITest, LibcallFRem) { setUp(); if (!TM) return; @@ -1870,7 +1870,7 @@ TEST_F(GISelMITest, LibcallFRem) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFPow) { +TEST_F(AArch64GISelMITest, LibcallFPow) { setUp(); if (!TM) return; @@ -1917,7 +1917,7 @@ TEST_F(GISelMITest, LibcallFPow) { 
EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFMa) { +TEST_F(AArch64GISelMITest, LibcallFMa) { setUp(); if (!TM) return; @@ -1965,7 +1965,7 @@ TEST_F(GISelMITest, LibcallFMa) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFCeil) { +TEST_F(AArch64GISelMITest, LibcallFCeil) { setUp(); if (!TM) return; @@ -2012,7 +2012,7 @@ TEST_F(GISelMITest, LibcallFCeil) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFFloor) { +TEST_F(AArch64GISelMITest, LibcallFFloor) { setUp(); if (!TM) return; @@ -2059,7 +2059,7 @@ TEST_F(GISelMITest, LibcallFFloor) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFMinNum) { +TEST_F(AArch64GISelMITest, LibcallFMinNum) { setUp(); if (!TM) return; @@ -2109,7 +2109,7 @@ TEST_F(GISelMITest, LibcallFMinNum) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFMaxNum) { +TEST_F(AArch64GISelMITest, LibcallFMaxNum) { setUp(); if (!TM) return; @@ -2159,7 +2159,7 @@ TEST_F(GISelMITest, LibcallFMaxNum) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFSqrt) { +TEST_F(AArch64GISelMITest, LibcallFSqrt) { setUp(); if (!TM) return; @@ -2206,7 +2206,7 @@ TEST_F(GISelMITest, LibcallFSqrt) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFRint) { +TEST_F(AArch64GISelMITest, LibcallFRint) { setUp(); if (!TM) return; @@ -2253,7 +2253,7 @@ TEST_F(GISelMITest, LibcallFRint) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LibcallFNearbyInt) { +TEST_F(AArch64GISelMITest, LibcallFNearbyInt) { setUp(); if (!TM) return; @@ -2303,7 +2303,7 @@ TEST_F(GISelMITest, LibcallFNearbyInt) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, NarrowScalarExtract) { +TEST_F(AArch64GISelMITest, NarrowScalarExtract) { setUp(); if (!TM) return; @@ -2342,7 +2342,7 @@ TEST_F(GISelMITest, NarrowScalarExtract) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, LowerInsert) { +TEST_F(AArch64GISelMITest, LowerInsert) { setUp(); if (!TM) return; @@ -2443,7 +2443,7 @@ TEST_F(GISelMITest, LowerInsert) { } // Test lowering of G_FFLOOR -TEST_F(GISelMITest, LowerFFloor) { +TEST_F(AArch64GISelMITest, LowerFFloor) { setUp(); if (!TM) return; @@ -2475,7 +2475,7 @@ TEST_F(GISelMITest, LowerFFloor) { } // Test lowering of G_BSWAP -TEST_F(GISelMITest, LowerBSWAP) { +TEST_F(AArch64GISelMITest, LowerBSWAP) { setUp(); if (!TM) return; @@ -2516,4 +2516,210 @@ TEST_F(GISelMITest, LowerBSWAP) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } +// Test widening of G_UNMERGE_VALUES +TEST_F(AArch64GISelMITest, WidenUnmerge) { + setUp(); + if (!TM) + return; + + DefineLegalizerInfo(A, {}); + + // Check that widening G_UNMERGE_VALUES to a larger type than the source type + // works as expected + LLT P0{LLT::pointer(0, 64)}; + LLT S32{LLT::scalar(32)}; + LLT S96{LLT::scalar(96)}; + + auto IntToPtr = B.buildIntToPtr(P0, Copies[0]); + auto UnmergePtr = B.buildUnmerge(S32, IntToPtr); + auto UnmergeScalar = B.buildUnmerge(S32, Copies[0]); + + AInfo Info(MF->getSubtarget()); + DummyGISelObserver Observer; + LegalizerHelper Helper(*MF, Info, Observer, B); + + // Perform Legalization + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.widenScalar(*UnmergePtr, 0, S96)); + + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + 
Helper.widenScalar(*UnmergeScalar, 0, S96)); + + const auto *CheckStr = R"( + CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY + CHECK: [[PTR:%[0-9]+]]:_(p0) = G_INTTOPTR [[COPY]] + CHECK: [[INT:%[0-9]+]]:_(s64) = G_PTRTOINT [[PTR]] + CHECK: [[ANYEXT:%[0-9]+]]:_(s96) = G_ANYEXT [[INT]] + CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ANYEXT]] + CHECK: [[C:%[0-9]+]]:_(s96) = G_CONSTANT i96 32 + CHECK: [[LSHR:%[0-9]+]]:_(s96) = G_LSHR [[ANYEXT]]:_, [[C]] + CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]] + CHECK: [[ANYEXT:%[0-9]+]]:_(s96) = G_ANYEXT [[COPY]] + CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ANYEXT]] + CHECK: [[C:%[0-9]+]]:_(s96) = G_CONSTANT i96 32 + CHECK: [[LSHR:%[0-9]+]]:_(s96) = G_LSHR [[ANYEXT]]:_, [[C]] + CHECK: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[LSHR]] + )"; + + // Check + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; +} + +TEST_F(AArch64GISelMITest, BitcastLoad) { + setUp(); + if (!TM) + return; + + LLT P0 = LLT::pointer(0, 64); + LLT S32 = LLT::scalar(32); + LLT V4S8 = LLT::vector(4, 8); + auto Ptr = B.buildUndef(P0); + + DefineLegalizerInfo(A, {}); + + MachineMemOperand *MMO = B.getMF().getMachineMemOperand( + MachinePointerInfo(), MachineMemOperand::MOLoad, 4, 4); + auto Load = B.buildLoad(V4S8, Ptr, *MMO); + + AInfo Info(MF->getSubtarget()); + DummyGISelObserver Observer; + LegalizerHelper Helper(*MF, Info, Observer, B); + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.bitcast(*Load, 0, S32)); + + auto CheckStr = R"( + CHECK: [[PTR:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF + CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD + CHECK: [[CAST:%[0-9]+]]:_(<4 x s8>) = G_BITCAST [[LOAD]] + + )"; + + // Check + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; +} + +TEST_F(AArch64GISelMITest, BitcastStore) { + setUp(); + if (!TM) + return; + + LLT P0 = LLT::pointer(0, 64); + LLT S32 = LLT::scalar(32); + LLT V4S8 = LLT::vector(4, 8); + auto Ptr = B.buildUndef(P0); + + DefineLegalizerInfo(A, {}); + + MachineMemOperand *MMO = B.getMF().getMachineMemOperand( + MachinePointerInfo(), MachineMemOperand::MOStore, 4, 4); + auto Val = B.buildUndef(V4S8); + auto Store = B.buildStore(Val, Ptr, *MMO); + + AInfo Info(MF->getSubtarget()); + DummyGISelObserver Observer; + LegalizerHelper Helper(*MF, Info, Observer, B); + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.bitcast(*Store, 0, S32)); + + auto CheckStr = R"( + CHECK: [[VAL:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF + CHECK: [[CAST:%[0-9]+]]:_(s32) = G_BITCAST [[VAL]] + CHECK: G_STORE [[CAST]] + )"; + + // Check + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; +} + +TEST_F(AArch64GISelMITest, BitcastSelect) { + setUp(); + if (!TM) + return; + + LLT S1 = LLT::scalar(1); + LLT S32 = LLT::scalar(32); + LLT V4S8 = LLT::vector(4, 8); + + DefineLegalizerInfo(A, {}); + + auto Cond = B.buildUndef(S1); + auto Val0 = B.buildConstant(V4S8, 123); + auto Val1 = B.buildConstant(V4S8, 99); + + auto Select = B.buildSelect(V4S8, Cond, Val0, Val1); + + AInfo Info(MF->getSubtarget()); + DummyGISelObserver Observer; + LegalizerHelper Helper(*MF, Info, Observer, B); + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.bitcast(*Select, 0, S32)); + + auto CheckStr = R"( + CHECK: [[VAL0:%[0-9]+]]:_(<4 x s8>) = G_BUILD_VECTOR + CHECK: [[VAL1:%[0-9]+]]:_(<4 x s8>) = G_BUILD_VECTOR + CHECK: [[CAST0:%[0-9]+]]:_(s32) = G_BITCAST [[VAL0]] + CHECK: [[CAST1:%[0-9]+]]:_(s32) = G_BITCAST [[VAL1]] + CHECK: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT %{{[0-9]+}}:_(s1), [[CAST0]]:_, [[CAST1]]:_ + CHECK: [[CAST2:%[0-9]+]]:_(<4 x s8>) = 
G_BITCAST [[SELECT]] + )"; + + // Check + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; + + // Doesn't make sense + auto VCond = B.buildUndef(LLT::vector(4, 1)); + auto VSelect = B.buildSelect(V4S8, VCond, Val0, Val1); + EXPECT_EQ(LegalizerHelper::LegalizeResult::UnableToLegalize, + Helper.bitcast(*VSelect, 0, S32)); + EXPECT_EQ(LegalizerHelper::LegalizeResult::UnableToLegalize, + Helper.bitcast(*VSelect, 1, LLT::scalar(4))); +} + +TEST_F(AArch64GISelMITest, BitcastBitOps) { + setUp(); + if (!TM) + return; + + LLT S32 = LLT::scalar(32); + LLT V4S8 = LLT::vector(4, 8); + + DefineLegalizerInfo(A, {}); + + auto Val0 = B.buildConstant(V4S8, 123); + auto Val1 = B.buildConstant(V4S8, 99); + auto And = B.buildAnd(V4S8, Val0, Val1); + auto Or = B.buildOr(V4S8, Val0, Val1); + auto Xor = B.buildXor(V4S8, Val0, Val1); + + AInfo Info(MF->getSubtarget()); + DummyGISelObserver Observer; + LegalizerHelper Helper(*MF, Info, Observer, B); + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.bitcast(*And, 0, S32)); + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.bitcast(*Or, 0, S32)); + EXPECT_EQ(LegalizerHelper::LegalizeResult::Legalized, + Helper.bitcast(*Xor, 0, S32)); + + auto CheckStr = R"( + CHECK: [[VAL0:%[0-9]+]]:_(<4 x s8>) = G_BUILD_VECTOR + CHECK: [[VAL1:%[0-9]+]]:_(<4 x s8>) = G_BUILD_VECTOR + CHECK: [[CAST0:%[0-9]+]]:_(s32) = G_BITCAST [[VAL0]] + CHECK: [[CAST1:%[0-9]+]]:_(s32) = G_BITCAST [[VAL1]] + CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[CAST0]]:_, [[CAST1]]:_ + CHECK: [[CAST_AND:%[0-9]+]]:_(<4 x s8>) = G_BITCAST [[AND]] + CHECK: [[CAST2:%[0-9]+]]:_(s32) = G_BITCAST [[VAL0]] + CHECK: [[CAST3:%[0-9]+]]:_(s32) = G_BITCAST [[VAL1]] + CHECK: [[OR:%[0-9]+]]:_(s32) = G_OR [[CAST2]]:_, [[CAST3]]:_ + CHECK: [[CAST_OR:%[0-9]+]]:_(<4 x s8>) = G_BITCAST [[OR]] + CHECK: [[CAST4:%[0-9]+]]:_(s32) = G_BITCAST [[VAL0]] + CHECK: [[CAST5:%[0-9]+]]:_(s32) = G_BITCAST [[VAL1]] + CHECK: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[CAST4]]:_, [[CAST5]]:_ + CHECK: [[CAST_XOR:%[0-9]+]]:_(<4 x s8>) = G_BITCAST [[XOR]] + )"; + + // Check + EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; +} + } // namespace diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp index b342143e13942..7fd2ea453a2ac 100644 --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerInfoTest.cpp @@ -27,6 +27,7 @@ operator<<(std::ostream &OS, const LegalizeAction Act) { case MoreElements: OS << "MoreElements"; break; case Libcall: OS << "Libcall"; break; case Custom: OS << "Custom"; break; + case Bitcast: OS << "Bitcast"; break; case Unsupported: OS << "Unsupported"; break; case NotFound: OS << "NotFound"; break; case UseLegacyRules: OS << "UseLegacyRules"; break; diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerTest.cpp index afb4614f07e48..edae7f9457121 100644 --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerTest.cpp @@ -49,7 +49,7 @@ DefineLegalizerInfo(ALegalizer, { getActionDefinitionsBuilder(G_SHL).legalFor({{s32, s32}}); }) -TEST_F(GISelMITest, BasicLegalizerTest) { +TEST_F(AArch64GISelMITest, BasicLegalizerTest) { StringRef MIRString = R"( %vptr:_(p0) = COPY $x4 %v:_(<2 x s8>) = G_LOAD %vptr:_(p0) :: (load 2, align 1) @@ -85,7 +85,7 @@ TEST_F(GISelMITest, BasicLegalizerTest) { // Making sure the legalization finishes successfully w/o failure to combine // away 
all the legalization artifacts regardless of the order of their // creation. -TEST_F(GISelMITest, UnorderedArtifactCombiningTest) { +TEST_F(AArch64GISelMITest, UnorderedArtifactCombiningTest) { StringRef MIRString = R"( %vptr:_(p0) = COPY $x4 %v:_(<2 x s8>) = G_LOAD %vptr:_(p0) :: (load 2, align 1) @@ -169,7 +169,7 @@ TEST_F(GISelMITest, UnorderedArtifactCombiningTest) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckString)) << *MF; } -TEST_F(GISelMITest, UnorderedArtifactCombiningManyCopiesTest) { +TEST_F(AArch64GISelMITest, UnorderedArtifactCombiningManyCopiesTest) { StringRef MIRString = R"( %vptr:_(p0) = COPY $x4 %v:_(<2 x s8>) = G_LOAD %vptr:_(p0) :: (load 2, align 1) diff --git a/llvm/unittests/CodeGen/GlobalISel/MachineIRBuilderTest.cpp b/llvm/unittests/CodeGen/GlobalISel/MachineIRBuilderTest.cpp index f6fed8e75d2c7..4d766cd42bee7 100644 --- a/llvm/unittests/CodeGen/GlobalISel/MachineIRBuilderTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/MachineIRBuilderTest.cpp @@ -9,7 +9,7 @@ #include "GISelMITest.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" -TEST_F(GISelMITest, TestBuildConstantFConstant) { +TEST_F(AArch64GISelMITest, TestBuildConstantFConstant) { setUp(); if (!TM) return; @@ -37,11 +37,10 @@ TEST_F(GISelMITest, TestBuildConstantFConstant) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } - #ifdef GTEST_HAS_DEATH_TEST #ifndef NDEBUG -TEST_F(GISelMITest, TestBuildConstantFConstantDeath) { +TEST_F(AArch64GISelMITest, TestBuildConstantFConstantDeath) { setUp(); if (!TM) return; @@ -73,7 +72,7 @@ TEST_F(GISelMITest, TestBuildConstantFConstantDeath) { #endif #endif -TEST_F(GISelMITest, DstOpSrcOp) { +TEST_F(AArch64GISelMITest, DstOpSrcOp) { setUp(); if (!TM) return; @@ -99,7 +98,7 @@ TEST_F(GISelMITest, DstOpSrcOp) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, BuildUnmerge) { +TEST_F(AArch64GISelMITest, BuildUnmerge) { setUp(); if (!TM) return; @@ -120,7 +119,7 @@ TEST_F(GISelMITest, BuildUnmerge) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, TestBuildFPInsts) { +TEST_F(AArch64GISelMITest, TestBuildFPInsts) { setUp(); if (!TM) return; @@ -156,7 +155,7 @@ TEST_F(GISelMITest, TestBuildFPInsts) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, BuildIntrinsic) { +TEST_F(AArch64GISelMITest, BuildIntrinsic) { setUp(); if (!TM) return; @@ -185,7 +184,7 @@ TEST_F(GISelMITest, BuildIntrinsic) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, BuildXor) { +TEST_F(AArch64GISelMITest, BuildXor) { setUp(); if (!TM) return; @@ -214,7 +213,7 @@ TEST_F(GISelMITest, BuildXor) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, BuildBitCounts) { +TEST_F(AArch64GISelMITest, BuildBitCounts) { setUp(); if (!TM) return; @@ -242,7 +241,7 @@ TEST_F(GISelMITest, BuildBitCounts) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, BuildCasts) { +TEST_F(AArch64GISelMITest, BuildCasts) { setUp(); if (!TM) return; @@ -267,7 +266,7 @@ TEST_F(GISelMITest, BuildCasts) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, BuildMinMax) { +TEST_F(AArch64GISelMITest, BuildMinMax) { setUp(); if (!TM) return; @@ -293,7 +292,7 @@ TEST_F(GISelMITest, BuildMinMax) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, BuildAtomicRMW) { +TEST_F(AArch64GISelMITest, BuildAtomicRMW) { setUp(); if (!TM) return; @@ -324,7 +323,7 @@ TEST_F(GISelMITest, 
BuildAtomicRMW) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, BuildMerge) { +TEST_F(AArch64GISelMITest, BuildMerge) { setUp(); if (!TM) return; @@ -363,7 +362,7 @@ TEST_F(GISelMITest, BuildMerge) { EXPECT_TRUE(CheckMachineFunction(*MF, CheckStr)) << *MF; } -TEST_F(GISelMITest, BuildAddoSubo) { +TEST_F(AArch64GISelMITest, BuildAddoSubo) { setUp(); if (!TM) return; diff --git a/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp b/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp index 66c1d6793a7a0..172eca46b4a9a 100644 --- a/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/PatternMatchTest.cpp @@ -30,7 +30,7 @@ using namespace MIPatternMatch; namespace { -TEST_F(GISelMITest, MatchIntConstant) { +TEST_F(AArch64GISelMITest, MatchIntConstant) { setUp(); if (!TM) return; @@ -41,7 +41,7 @@ TEST_F(GISelMITest, MatchIntConstant) { EXPECT_EQ(Cst, 42); } -TEST_F(GISelMITest, MatchBinaryOp) { +TEST_F(AArch64GISelMITest, MatchBinaryOp) { setUp(); if (!TM) return; @@ -139,7 +139,7 @@ TEST_F(GISelMITest, MatchBinaryOp) { EXPECT_EQ(Src1, TruncCopy1.getReg(0)); } -TEST_F(GISelMITest, MatchICmp) { +TEST_F(AArch64GISelMITest, MatchICmp) { setUp(); if (!TM) return; @@ -164,7 +164,7 @@ TEST_F(GISelMITest, MatchICmp) { EXPECT_EQ(Copies[1], Reg1); } -TEST_F(GISelMITest, MatchFCmp) { +TEST_F(AArch64GISelMITest, MatchFCmp) { setUp(); if (!TM) return; @@ -189,7 +189,7 @@ TEST_F(GISelMITest, MatchFCmp) { EXPECT_EQ(Copies[1], Reg1); } -TEST_F(GISelMITest, MatchFPUnaryOp) { +TEST_F(AArch64GISelMITest, MatchFPUnaryOp) { setUp(); if (!TM) return; @@ -251,7 +251,7 @@ TEST_F(GISelMITest, MatchFPUnaryOp) { EXPECT_NE(TmpFP16, TmpFP); } -TEST_F(GISelMITest, MatchExtendsTrunc) { +TEST_F(AArch64GISelMITest, MatchExtendsTrunc) { setUp(); if (!TM) return; @@ -298,7 +298,7 @@ TEST_F(GISelMITest, MatchExtendsTrunc) { EXPECT_EQ(Src0, Copies[0]); } -TEST_F(GISelMITest, MatchSpecificType) { +TEST_F(AArch64GISelMITest, MatchSpecificType) { setUp(); if (!TM) return; @@ -335,7 +335,7 @@ TEST_F(GISelMITest, MatchSpecificType) { EXPECT_EQ(Src0, Copies[0]); } -TEST_F(GISelMITest, MatchCombinators) { +TEST_F(AArch64GISelMITest, MatchCombinators) { setUp(); if (!TM) return; @@ -369,7 +369,7 @@ TEST_F(GISelMITest, MatchCombinators) { EXPECT_FALSE(match); } -TEST_F(GISelMITest, MatchMiscellaneous) { +TEST_F(AArch64GISelMITest, MatchMiscellaneous) { setUp(); if (!TM) return; diff --git a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp index 2ffad829055c1..cf2fb5375c5cb 100644 --- a/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp +++ b/llvm/unittests/DebugInfo/DWARF/DWARFDebugLineTest.cpp @@ -1324,7 +1324,12 @@ TEST_F(DebugLineBasicFixture, PrintPathsProperly) { EXPECT_TRUE((*ExpectedLineTable) ->Prologue.getFileNameByIndex( 1, CompDir, - DILineInfoSpecifier::FileLineInfoKind::Default, Result)); + DILineInfoSpecifier::FileLineInfoKind::RawValue, Result)); + EXPECT_TRUE((*ExpectedLineTable) + ->Prologue.getFileNameByIndex( + 1, CompDir, + DILineInfoSpecifier::FileLineInfoKind::BaseNameOnly, + Result)); EXPECT_STREQ(Result.c_str(), "b file"); EXPECT_TRUE((*ExpectedLineTable) ->Prologue.getFileNameByIndex( diff --git a/llvm/unittests/IR/CMakeLists.txt b/llvm/unittests/IR/CMakeLists.txt index 9dba01db1d11d..05df08d46b29b 100644 --- a/llvm/unittests/IR/CMakeLists.txt +++ b/llvm/unittests/IR/CMakeLists.txt @@ -41,6 +41,7 @@ add_llvm_unittest(IRTests ValueTest.cpp VectorTypesTest.cpp 
VerifierTest.cpp + VPIntrinsicTest.cpp WaymarkTest.cpp ) diff --git a/llvm/unittests/IR/ConstantRangeTest.cpp b/llvm/unittests/IR/ConstantRangeTest.cpp index 2e5ab82f256e7..4d19dd59cae64 100644 --- a/llvm/unittests/IR/ConstantRangeTest.cpp +++ b/llvm/unittests/IR/ConstantRangeTest.cpp @@ -2317,4 +2317,20 @@ TEST_F(ConstantRangeTest, castOps) { EXPECT_EQ(64u, IntToPtr.getBitWidth()); EXPECT_TRUE(IntToPtr.isFullSet()); } + +TEST_F(ConstantRangeTest, binaryXor) { + // Single element ranges. + ConstantRange R16(APInt(8, 16)); + ConstantRange R20(APInt(8, 20)); + EXPECT_EQ(*R16.binaryXor(R16).getSingleElement(), APInt(8, 0)); + EXPECT_EQ(*R16.binaryXor(R20).getSingleElement(), APInt(8, 16 ^ 20)); + + // Ranges with more than a single element. Handled conservatively for now. + ConstantRange R16_35(APInt(8, 16), APInt(8, 35)); + ConstantRange R0_99(APInt(8, 0), APInt(8, 99)); + EXPECT_TRUE(R16_35.binaryXor(R16_35).isFullSet()); + EXPECT_TRUE(R16_35.binaryXor(R0_99).isFullSet()); + EXPECT_TRUE(R0_99.binaryXor(R16_35).isFullSet()); +} + } // anonymous namespace diff --git a/llvm/unittests/IR/KnowledgeRetentionTest.cpp b/llvm/unittests/IR/KnowledgeRetentionTest.cpp index 62caeebba895d..aca801d2b296e 100644 --- a/llvm/unittests/IR/KnowledgeRetentionTest.cpp +++ b/llvm/unittests/IR/KnowledgeRetentionTest.cpp @@ -22,6 +22,10 @@ using namespace llvm; extern cl::opt ShouldPreserveAllAttributes; extern cl::opt EnableKnowledgeRetention; +static IntrinsicInst *buildAssumeFromInst(Instruction *I) { + return cast_or_null(BuildAssumeFromInst(I)); +} + static void RunTest( StringRef Head, StringRef Tail, std::vector>> @@ -40,7 +44,7 @@ static void RunTest( } } -bool hasMatchesExactlyAttributes(CallInst *Assume, Value *WasOn, +bool hasMatchesExactlyAttributes(IntrinsicInst *Assume, Value *WasOn, StringRef AttrToMatch) { Regex Reg(AttrToMatch); SmallVector Matches; @@ -56,7 +60,7 @@ bool hasMatchesExactlyAttributes(CallInst *Assume, Value *WasOn, return true; } -bool hasTheRightValue(CallInst *Assume, Value *WasOn, +bool hasTheRightValue(IntrinsicInst *Assume, Value *WasOn, Attribute::AttrKind Kind, unsigned Value, bool Both, AssumeQuery AQ = AssumeQuery::Highest) { if (!Both) { @@ -97,7 +101,7 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) { "call void @func(i32* nonnull align 4 dereferenceable(16) %P, i32* align " "8 noalias %P1)\n", [](Instruction *I) { - CallInst *Assume = BuildAssumeFromInst(I); + IntrinsicInst *Assume = buildAssumeFromInst(I); Assume->insertBefore(I); ASSERT_TRUE(hasMatchesExactlyAttributes(Assume, I->getOperand(0), "(nonnull|align|dereferenceable)")); @@ -117,7 +121,7 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) { "dereferenceable(4) " "%P, i32* nonnull align 16 dereferenceable(12) %P)\n", [](Instruction *I) { - CallInst *Assume = BuildAssumeFromInst(I); + IntrinsicInst *Assume = buildAssumeFromInst(I); Assume->insertBefore(I); ASSERT_TRUE(hasMatchesExactlyAttributes(Assume, I->getOperand(0), "(nonnull|align|dereferenceable)")); @@ -149,7 +153,7 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) { Tests.push_back(std::make_pair( "call void @func_many(i32* align 8 %P1) cold\n", [](Instruction *I) { ShouldPreserveAllAttributes.setValue(true); - CallInst *Assume = BuildAssumeFromInst(I); + IntrinsicInst *Assume = buildAssumeFromInst(I); Assume->insertBefore(I); ASSERT_TRUE(hasMatchesExactlyAttributes( Assume, nullptr, @@ -159,7 +163,7 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) { })); Tests.push_back( std::make_pair("call void @llvm.assume(i1 true)\n", [](Instruction *I) { - CallInst *Assume = 
cast(I); + IntrinsicInst *Assume = cast(I); ASSERT_TRUE(hasMatchesExactlyAttributes(Assume, nullptr, "")); })); Tests.push_back(std::make_pair( @@ -169,7 +173,7 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) { "dereferenceable(4) " "%P2, i32* nonnull align 16 dereferenceable(12) %P3)\n", [](Instruction *I) { - CallInst *Assume = BuildAssumeFromInst(I); + IntrinsicInst *Assume = buildAssumeFromInst(I); Assume->insertBefore(I); ASSERT_TRUE(hasMatchesExactlyAttributes( Assume, I->getOperand(0), @@ -205,7 +209,7 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) { "dereferenceable(4) " "%P2, i32* nonnull align 16 dereferenceable(12) %P3)\n", [](Instruction *I) { - CallInst *Assume = BuildAssumeFromInst(I); + IntrinsicInst *Assume = buildAssumeFromInst(I); Assume->insertBefore(I); I->getOperand(1)->dropDroppableUses(); I->getOperand(2)->dropDroppableUses(); @@ -228,7 +232,7 @@ TEST(AssumeQueryAPI, hasAttributeInAssume) { "call void @func(i32* nonnull align 4 dereferenceable(16) %P, i32* align " "8 noalias %P1)\n", [](Instruction *I) { - CallInst *Assume = BuildAssumeFromInst(I); + IntrinsicInst *Assume = buildAssumeFromInst(I); Assume->insertBefore(I); Value *New = I->getFunction()->getArg(3); Value *Old = I->getOperand(0); @@ -260,11 +264,11 @@ static bool FindExactlyAttributes(RetainedKnowledgeMap &Map, Value *WasOn, return true; } -static bool MapHasRightValue(RetainedKnowledgeMap &Map, - RetainedKnowledgeKey Key, MinMax MM) { +static bool MapHasRightValue(RetainedKnowledgeMap &Map, IntrinsicInst *II, + RetainedKnowledgeKey Key, MinMax MM) { auto LookupIt = Map.find(Key); - return (LookupIt != Map.end()) && (LookupIt->second.Min == MM.Min) && - (LookupIt->second.Max == MM.Max); + return (LookupIt != Map.end()) && (LookupIt->second[II].Min == MM.Min) && + (LookupIt->second[II].Max == MM.Max); } TEST(AssumeQueryAPI, fillMapFromAssume) { @@ -284,7 +288,7 @@ TEST(AssumeQueryAPI, fillMapFromAssume) { "call void @func(i32* nonnull align 4 dereferenceable(16) %P, i32* align " "8 noalias %P1)\n", [](Instruction *I) { - CallInst *Assume = BuildAssumeFromInst(I); + IntrinsicInst *Assume = buildAssumeFromInst(I); Assume->insertBefore(I); RetainedKnowledgeMap Map; @@ -294,10 +298,10 @@ TEST(AssumeQueryAPI, fillMapFromAssume) { ASSERT_TRUE(FindExactlyAttributes(Map, I->getOperand(1), "(align)")); ASSERT_TRUE(MapHasRightValue( - Map, {I->getOperand(0), Attribute::Dereferenceable}, {16, 16})); - ASSERT_TRUE(MapHasRightValue(Map, {I->getOperand(0), Attribute::Alignment}, + Map, Assume, {I->getOperand(0), Attribute::Dereferenceable}, {16, 16})); + ASSERT_TRUE(MapHasRightValue(Map, Assume, {I->getOperand(0), Attribute::Alignment}, {4, 4})); - ASSERT_TRUE(MapHasRightValue(Map, {I->getOperand(0), Attribute::Alignment}, + ASSERT_TRUE(MapHasRightValue(Map, Assume, {I->getOperand(0), Attribute::Alignment}, {4, 4})); })); Tests.push_back(std::make_pair( @@ -307,7 +311,7 @@ TEST(AssumeQueryAPI, fillMapFromAssume) { "dereferenceable(4) " "%P, i32* nonnull align 16 dereferenceable(12) %P)\n", [](Instruction *I) { - CallInst *Assume = BuildAssumeFromInst(I); + IntrinsicInst *Assume = buildAssumeFromInst(I); Assume->insertBefore(I); RetainedKnowledgeMap Map; @@ -322,14 +326,14 @@ TEST(AssumeQueryAPI, fillMapFromAssume) { ASSERT_TRUE(FindExactlyAttributes(Map, I->getOperand(3), "(nonnull|align|dereferenceable)")); ASSERT_TRUE(MapHasRightValue( - Map, {I->getOperand(0), Attribute::Dereferenceable}, {4, 48})); - ASSERT_TRUE(MapHasRightValue(Map, {I->getOperand(0), Attribute::Alignment}, + Map, Assume, {I->getOperand(0), 
Attribute::Dereferenceable}, {4, 48})); + ASSERT_TRUE(MapHasRightValue(Map, Assume, {I->getOperand(0), Attribute::Alignment}, {8, 64})); })); Tests.push_back(std::make_pair( "call void @func_many(i32* align 8 %P1) cold\n", [](Instruction *I) { ShouldPreserveAllAttributes.setValue(true); - CallInst *Assume = BuildAssumeFromInst(I); + IntrinsicInst *Assume = buildAssumeFromInst(I); Assume->insertBefore(I); RetainedKnowledgeMap Map; @@ -342,7 +346,7 @@ TEST(AssumeQueryAPI, fillMapFromAssume) { Tests.push_back( std::make_pair("call void @llvm.assume(i1 true)\n", [](Instruction *I) { RetainedKnowledgeMap Map; - fillMapFromAssume(*cast(I), Map); + fillMapFromAssume(*cast(I), Map); ASSERT_TRUE(FindExactlyAttributes(Map, nullptr, "")); ASSERT_TRUE(Map.empty()); @@ -354,7 +358,7 @@ TEST(AssumeQueryAPI, fillMapFromAssume) { "dereferenceable(4) " "%P2, i32* nonnull align 16 dereferenceable(12) %P3)\n", [](Instruction *I) { - CallInst *Assume = BuildAssumeFromInst(I); + IntrinsicInst *Assume = buildAssumeFromInst(I); Assume->insertBefore(I); RetainedKnowledgeMap Map; @@ -368,22 +372,22 @@ TEST(AssumeQueryAPI, fillMapFromAssume) { "(align|dereferenceable)")); ASSERT_TRUE(FindExactlyAttributes(Map, I->getOperand(3), "(nonnull|align|dereferenceable)")); - ASSERT_TRUE(MapHasRightValue(Map, {I->getOperand(0), Attribute::Alignment}, + ASSERT_TRUE(MapHasRightValue(Map, Assume, {I->getOperand(0), Attribute::Alignment}, {32, 32})); ASSERT_TRUE(MapHasRightValue( - Map, {I->getOperand(0), Attribute::Dereferenceable}, {48, 48})); + Map, Assume, {I->getOperand(0), Attribute::Dereferenceable}, {48, 48})); ASSERT_TRUE(MapHasRightValue( - Map, {I->getOperand(1), Attribute::Dereferenceable}, {28, 28})); - ASSERT_TRUE(MapHasRightValue(Map, {I->getOperand(1), Attribute::Alignment}, + Map, Assume, {I->getOperand(1), Attribute::Dereferenceable}, {28, 28})); + ASSERT_TRUE(MapHasRightValue(Map, Assume, {I->getOperand(1), Attribute::Alignment}, {8, 8})); - ASSERT_TRUE(MapHasRightValue(Map, {I->getOperand(2), Attribute::Alignment}, + ASSERT_TRUE(MapHasRightValue(Map, Assume, {I->getOperand(2), Attribute::Alignment}, {64, 64})); ASSERT_TRUE(MapHasRightValue( - Map, {I->getOperand(2), Attribute::Dereferenceable}, {4, 4})); - ASSERT_TRUE(MapHasRightValue(Map, {I->getOperand(3), Attribute::Alignment}, + Map, Assume, {I->getOperand(2), Attribute::Dereferenceable}, {4, 4})); + ASSERT_TRUE(MapHasRightValue(Map, Assume, {I->getOperand(3), Attribute::Alignment}, {16, 16})); ASSERT_TRUE(MapHasRightValue( - Map, {I->getOperand(3), Attribute::Dereferenceable}, {12, 12})); + Map, Assume, {I->getOperand(3), Attribute::Dereferenceable}, {12, 12})); })); /// Keep this test last as it modifies the function. 
@@ -391,7 +395,7 @@ TEST(AssumeQueryAPI, fillMapFromAssume) { "call void @func(i32* nonnull align 4 dereferenceable(16) %P, i32* align " "8 noalias %P1)\n", [](Instruction *I) { - CallInst *Assume = BuildAssumeFromInst(I); + IntrinsicInst *Assume = buildAssumeFromInst(I); Assume->insertBefore(I); RetainedKnowledgeMap Map; @@ -475,12 +479,12 @@ static void RunRandTest(uint64_t Seed, int Size, int MinCount, int MaxCount, OpBundle.push_back(OperandBundleDef{ss.str().c_str(), std::move(Args)}); } - Instruction *Assume = - CallInst::Create(FnAssume, ArrayRef({ConstantInt::getTrue(C)}), - std::move(OpBundle)); + auto *Assume = cast(IntrinsicInst::Create( + FnAssume, ArrayRef({ConstantInt::getTrue(C)}), + std::move(OpBundle))); Assume->insertBefore(&F->begin()->front()); RetainedKnowledgeMap Map; - fillMapFromAssume(*cast(Assume), Map); + fillMapFromAssume(*Assume, Map); for (int i = 0; i < (Size * 2); i++) { if (!HasArg[i]) continue; @@ -488,7 +492,7 @@ static void RunRandTest(uint64_t Seed, int Size, int MinCount, int MaxCount, getKnowledgeFromUseInAssume(&*ShuffledArgs[i]->use_begin()); auto LookupIt = Map.find(RetainedKnowledgeKey{K.WasOn, K.AttrKind}); ASSERT_TRUE(LookupIt != Map.end()); - MinMax MM = LookupIt->second; + MinMax MM = LookupIt->second[Assume]; ASSERT_TRUE(MM.Min == MM.Max); ASSERT_TRUE(MM.Min == K.ArgValue); } diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp new file mode 100644 index 0000000000000..919bac4ef266d --- /dev/null +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -0,0 +1,151 @@ +//===- VPIntrinsicTest.cpp - VPIntrinsic unit tests ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallVector.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/SourceMgr.h" +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { + +class VPIntrinsicTest : public testing::Test { +protected: + LLVMContext Context; + + VPIntrinsicTest() : Context() {} + + LLVMContext C; + SMDiagnostic Err; + + std::unique_ptr CreateVPDeclarationModule() { + return parseAssemblyString( +" declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) " +" declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) ", + Err, C); + } +}; + +/// Check that VPIntrinsic:canIgnoreVectorLengthParam() returns true +/// if the vector length parameter does not mask off any lanes. 
+TEST_F(VPIntrinsicTest, CanIgnoreVectorLength) { + LLVMContext C; + SMDiagnostic Err; + + std::unique_ptr M = + parseAssemblyString( +"declare <256 x i64> @llvm.vp.mul.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)" +"declare @llvm.vp.mul.nxv2i64(, , , i32)" +"declare i32 @llvm.vscale.i32()" +"define void @test_static_vlen( " +" <256 x i64> %i0, %si0," +" <256 x i64> %i1, %si1," +" <256 x i1> %m, %sm, i32 %vl) { " +" %r0 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %vl)" +" %r1 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 256)" +" %r2 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 0)" +" %r3 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 7)" +" %r4 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 123)" +" %vs = call i32 @llvm.vscale.i32()" +" %vs.i64 = mul i32 %vs, 2" +" %r5 = call @llvm.vp.mul.nxv2i64( %si0, %si1, %sm, i32 %vs.i64)" +" %r6 = call @llvm.vp.mul.nxv2i64( %si0, %si1, %sm, i32 99999)" +" ret void " +"}", + Err, C); + + auto *F = M->getFunction("test_static_vlen"); + assert(F); + + const int NumExpected = 7; + const bool Expected[] = {false, true, false, false, false, true, false}; + int i = 0; + for (auto &I : F->getEntryBlock()) { + VPIntrinsic *VPI = dyn_cast(&I); + if (!VPI) + continue; + + ASSERT_LT(i, NumExpected); + ASSERT_EQ(Expected[i], VPI->canIgnoreVectorLengthParam()); + ++i; + } +} + +/// Check that the argument returned by +/// VPIntrinsic::GetParamPos(Intrinsic::ID) has the expected type. +TEST_F(VPIntrinsicTest, GetParamPos) { + std::unique_ptr M = CreateVPDeclarationModule(); + assert(M); + + for (Function &F : *M) { + ASSERT_TRUE(F.isIntrinsic()); + Optional MaskParamPos = + VPIntrinsic::GetMaskParamPos(F.getIntrinsicID()); + if (MaskParamPos.hasValue()) { + Type *MaskParamType = F.getArg(MaskParamPos.getValue())->getType(); + ASSERT_TRUE(MaskParamType->isVectorTy()); + ASSERT_TRUE(MaskParamType->getVectorElementType()->isIntegerTy(1)); + } + + Optional VecLenParamPos = + VPIntrinsic::GetVectorLengthParamPos(F.getIntrinsicID()); + if (VecLenParamPos.hasValue()) { + Type *VecLenParamType = F.getArg(VecLenParamPos.getValue())->getType(); + ASSERT_TRUE(VecLenParamType->isIntegerTy(32)); + } + } +} + +/// Check that going from Opcode to VP intrinsic and back results in the same +/// Opcode. 
+TEST_F(VPIntrinsicTest, OpcodeRoundTrip) { + std::vector Opcodes; + Opcodes.reserve(100); + + { +#define HANDLE_INST(OCNum, OCName, Class) Opcodes.push_back(OCNum); +#include "llvm/IR/Instruction.def" + } + + unsigned FullTripCounts = 0; + for (unsigned OC : Opcodes) { + Intrinsic::ID VPID = VPIntrinsic::GetForOpcode(OC); + // no equivalent VP intrinsic available + if (VPID == Intrinsic::not_intrinsic) + continue; + + unsigned RoundTripOC = VPIntrinsic::GetFunctionalOpcodeForVP(VPID); + // no equivalent Opcode available + if (RoundTripOC == Instruction::Call) + continue; + + ASSERT_EQ(RoundTripOC, OC); + ++FullTripCounts; + } + ASSERT_NE(FullTripCounts, 0u); +} + +} // end anonymous namespace diff --git a/llvm/unittests/MC/AMDGPU/CMakeLists.txt b/llvm/unittests/MC/AMDGPU/CMakeLists.txt new file mode 100644 index 0000000000000..9a77415590d7d --- /dev/null +++ b/llvm/unittests/MC/AMDGPU/CMakeLists.txt @@ -0,0 +1,11 @@ +set(LLVM_LINK_COMPONENTS + AMDGPUCodeGen + AMDGPUDesc + AMDGPUInfo + MC + Support + ) + +add_llvm_unittest(AMDGPUDwarfTests + DwarfRegMappings.cpp + ) diff --git a/llvm/unittests/MC/AMDGPU/DwarfRegMappings.cpp b/llvm/unittests/MC/AMDGPU/DwarfRegMappings.cpp new file mode 100644 index 0000000000000..e416ece9aaa85 --- /dev/null +++ b/llvm/unittests/MC/AMDGPU/DwarfRegMappings.cpp @@ -0,0 +1,77 @@ +//===- llvm/unittests/MC/AMDGPU/DwarfRegMappings.cpp ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "gtest/gtest.h" +#include + +using namespace llvm; + +std::once_flag flag; + +void InitializeAMDGPUTarget() { + std::call_once(flag, []() { + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); + }); +} + +std::unique_ptr +createTargetMachine(std::string TStr, StringRef CPU, StringRef FS) { + InitializeAMDGPUTarget(); + + std::string Error; + const Target *T = TargetRegistry::lookupTarget(TStr, Error); + if (!T) + return nullptr; + + TargetOptions Options; + return std::unique_ptr(static_cast( + T->createTargetMachine(TStr, CPU, FS, Options, None, None))); +} + +TEST(AMDGPUDwarfRegMappingTests, TestWave64DwarfRegMapping) { + for (auto Triple : + {"amdgcn-amd-", "amdgcn-amd-amdhsa", "amdgcn-amd-amdpal"}) { + auto TM = createTargetMachine(Triple, "gfx1010", "+wavefrontsize64"); + if (TM && TM->getMCRegisterInfo()) { + auto MRI = TM->getMCRegisterInfo(); + // Wave64 Dwarf register mapping test numbers + // PC_64 => 16, EXEC_MASK_64 => 17, S0 => 32, S63 => 95, + // S64 => 1088, S105 => 1129, V0 => 2560, V255 => 2815, + // A0 => 3072, A255 => 3327 + for (int llvmReg : {16, 17, 32, 95, 1088, 1129, 2560, 2815, 3072, 3327}) { + MCRegister PCReg(*MRI->getLLVMRegNum(llvmReg, false)); + EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, false)); + } + } + } +} + +TEST(AMDGPUDwarfRegMappingTests, TestWave32DwarfRegMapping) { + for (auto Triple : + {"amdgcn-amd-", "amdgcn-amd-amdhsa", "amdgcn-amd-amdpal"}) { + auto TM = createTargetMachine(Triple, "gfx1010", "+wavefrontsize32"); + if (TM && TM->getMCRegisterInfo()) { + auto MRI = TM->getMCRegisterInfo(); + // Wave32 Dwarf register 
mapping test numbers + // PC_32 => 0, EXEC_MASK_32 => 1, S0 => 32, S63 => 95, + // S64 => 1088, S105 => 1129, V0 => 1536, V255 => 1791, + // A0 => 2048, A255 => 2303 + for (int llvmReg : {0, 1, 32, 95, 1088, 1129, 1536, 1791, 2048, 2303}) { + MCRegister PCReg(*MRI->getLLVMRegNum(llvmReg, false)); + EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, false)); + } + } + } +} diff --git a/llvm/unittests/MC/CMakeLists.txt b/llvm/unittests/MC/CMakeLists.txt index 5dcbef2296d0d..48c06183ab99d 100644 --- a/llvm/unittests/MC/CMakeLists.txt +++ b/llvm/unittests/MC/CMakeLists.txt @@ -1,3 +1,9 @@ +foreach(t ${LLVM_TARGETS_TO_BUILD}) + if(IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${t}) + add_subdirectory(${t}) + endif() +endforeach() + set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} MC @@ -12,3 +18,4 @@ add_llvm_unittest(MCTests StringTableBuilderTest.cpp TargetRegistry.cpp ) + diff --git a/llvm/unittests/Passes/CMakeLists.txt b/llvm/unittests/Passes/CMakeLists.txt index c04aa9f84458e..823bc56851fa3 100644 --- a/llvm/unittests/Passes/CMakeLists.txt +++ b/llvm/unittests/Passes/CMakeLists.txt @@ -16,7 +16,7 @@ if (NOT WIN32) add_llvm_unittest(PluginsTests PluginsTest.cpp ) - export_executable_symbols(PluginsTests) + export_executable_symbols_for_plugins(PluginsTests) target_link_libraries(PluginsTests PRIVATE LLVMTestingSupport) set(LLVM_LINK_COMPONENTS) diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt index 7362bc63ab2cf..6ffa8c1d6f64b 100644 --- a/llvm/unittests/Support/CMakeLists.txt +++ b/llvm/unittests/Support/CMakeLists.txt @@ -51,6 +51,7 @@ add_llvm_unittest(SupportTests MemoryBufferTest.cpp MemoryTest.cpp NativeFormatTests.cpp + OptimalLayoutTest.cpp ParallelTest.cpp Path.cpp ProcessTest.cpp diff --git a/llvm/unittests/Support/OptimalLayoutTest.cpp b/llvm/unittests/Support/OptimalLayoutTest.cpp new file mode 100644 index 0000000000000..a31fbaf3f2e68 --- /dev/null +++ b/llvm/unittests/Support/OptimalLayoutTest.cpp @@ -0,0 +1,132 @@ +//=== - llvm/unittest/Support/OptimalLayoutTest.cpp - Layout tests --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/OptimalLayout.h" +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { + +class LayoutTest { + struct Field { + uint64_t Size; + Align Alignment; + uint64_t ForcedOffset; + uint64_t ExpectedOffset; + }; + + SmallVector Fields; + bool Verified = false; + +public: + LayoutTest() {} + LayoutTest(const LayoutTest &) = delete; + LayoutTest &operator=(const LayoutTest &) = delete; + ~LayoutTest() { assert(Verified); } + + LayoutTest &flexible(uint64_t Size, uint64_t Alignment, + uint64_t ExpectedOffset) { + Fields.push_back({Size, Align(Alignment), + OptimalLayoutField::FlexibleOffset, ExpectedOffset}); + return *this; + } + + LayoutTest &fixed(uint64_t Size, uint64_t Alignment, uint64_t Offset) { + Fields.push_back({Size, Align(Alignment), Offset, Offset}); + return *this; + } + + void verify(uint64_t ExpectedSize, uint64_t ExpectedAlignment) { + SmallVector LayoutFields; + LayoutFields.reserve(Fields.size()); + for (auto &F : Fields) + LayoutFields.emplace_back(&F, F.Size, F.Alignment, F.ForcedOffset); + + auto SizeAndAlign = performOptimalLayout(LayoutFields); + + EXPECT_EQ(SizeAndAlign.first, ExpectedSize); + EXPECT_EQ(SizeAndAlign.second, Align(ExpectedAlignment)); + + for (auto &LF : LayoutFields) { + auto &F = *static_cast(LF.Id); + EXPECT_EQ(LF.Offset, F.ExpectedOffset); + } + + Verified = true; + } +}; + +} + +TEST(OptimalLayoutTest, Basic) { + LayoutTest() + .flexible(12, 4, 8) + .flexible(8, 8, 0) + .flexible(4, 4, 20) + .verify(24, 8); +} + +TEST(OptimalLayoutTest, OddSize) { + LayoutTest() + .flexible(8, 8, 16) + .flexible(4, 4, 12) + .flexible(1, 1, 10) + .flexible(10, 8, 0) + .verify(24, 8); +} + +TEST(OptimalLayoutTest, Gaps) { + LayoutTest() + .fixed(4, 4, 8) + .fixed(4, 4, 16) + .flexible(4, 4, 0) + .flexible(4, 4, 4) + .flexible(4, 4, 12) + .flexible(4, 4, 20) + .verify(24, 4); +} + +TEST(OptimalLayoutTest, Greed) { + // The greedy algorithm doesn't find the optimal layout here, which + // would be to put the 5-byte field at the end. + LayoutTest() + .fixed(4, 4, 8) + .flexible(5, 4, 0) + .flexible(4, 4, 12) + .flexible(4, 4, 16) + .flexible(4, 4, 20) + .verify(24, 4); +} + +TEST(OptimalLayoutTest, Jagged) { + LayoutTest() + .flexible(1, 2, 18) + .flexible(13, 8, 0) + .flexible(3, 2, 14) + .verify(19, 8); +} + +TEST(OptimalLayoutTest, GardenPath) { + // The 4-byte-aligned field is our highest priority, but the less-aligned + // fields keep leaving the end offset mis-aligned. 
+ LayoutTest() + .fixed(7, 4, 0) + .flexible(4, 4, 44) + .flexible(6, 1, 7) + .flexible(5, 1, 13) + .flexible(7, 2, 18) + .flexible(4, 1, 25) + .flexible(4, 1, 29) + .flexible(1, 1, 33) + .flexible(4, 2, 34) + .flexible(4, 2, 38) + .flexible(2, 2, 42) + .flexible(2, 2, 48) + .verify(50, 4); +} \ No newline at end of file diff --git a/llvm/unittests/Support/Path.cpp b/llvm/unittests/Support/Path.cpp index 601223b11ab4b..671966b52dd0c 100644 --- a/llvm/unittests/Support/Path.cpp +++ b/llvm/unittests/Support/Path.cpp @@ -28,6 +28,7 @@ #ifdef _WIN32 #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/Chrono.h" +#include "llvm/Support/Windows/WindowsSupport.h" #include #include #endif @@ -1875,4 +1876,74 @@ TEST_F(FileSystemTest, permissions) { #endif } +#ifdef _WIN32 +TEST_F(FileSystemTest, widenPath) { + const std::wstring LongPathPrefix(L"\\\\?\\"); + + // Test that the length limit is checked against the UTF-16 length and not the + // UTF-8 length. + std::string Input("C:\\foldername\\"); + const std::string Pi("\xcf\x80"); // UTF-8 lower case pi. + // Add Pi up to the MAX_PATH limit. + const size_t NumChars = MAX_PATH - Input.size() - 1; + for (size_t i = 0; i < NumChars; ++i) + Input += Pi; + // Check that UTF-8 length already exceeds MAX_PATH. + EXPECT_TRUE(Input.size() > MAX_PATH); + SmallVector Result; + ASSERT_NO_ERROR(windows::widenPath(Input, Result)); + // Result should not start with the long path prefix. + EXPECT_TRUE(std::wmemcmp(Result.data(), LongPathPrefix.c_str(), + LongPathPrefix.size()) != 0); + EXPECT_EQ(Result.size(), (size_t)MAX_PATH - 1); + + // Add another Pi to exceed the MAX_PATH limit. + Input += Pi; + // Construct the expected result. + SmallVector Expected; + ASSERT_NO_ERROR(windows::UTF8ToUTF16(Input, Expected)); + Expected.insert(Expected.begin(), LongPathPrefix.begin(), + LongPathPrefix.end()); + + ASSERT_NO_ERROR(windows::widenPath(Input, Result)); + EXPECT_EQ(Result, Expected); + + // Test that UNC paths are handled correctly. + const std::string ShareName("\\\\sharename\\"); + const std::string FileName("\\filename"); + // Initialize directory name so that the input is within the MAX_PATH limit. + const char DirChar = 'x'; + std::string DirName(MAX_PATH - ShareName.size() - FileName.size() - 1, + DirChar); + + Input = ShareName + DirName + FileName; + ASSERT_NO_ERROR(windows::widenPath(Input, Result)); + // Result should not start with the long path prefix. + EXPECT_TRUE(std::wmemcmp(Result.data(), LongPathPrefix.c_str(), + LongPathPrefix.size()) != 0); + EXPECT_EQ(Result.size(), (size_t)MAX_PATH - 1); + + // Extend the directory name so the input exceeds the MAX_PATH limit. + DirName += DirChar; + Input = ShareName + DirName + FileName; + // Construct the expected result. + ASSERT_NO_ERROR(windows::UTF8ToUTF16(StringRef(Input).substr(2), Expected)); + const std::wstring UNCPrefix(LongPathPrefix + L"UNC\\"); + Expected.insert(Expected.begin(), UNCPrefix.begin(), UNCPrefix.end()); + + ASSERT_NO_ERROR(windows::widenPath(Input, Result)); + EXPECT_EQ(Result, Expected); + + // Check that Unix separators are handled correctly. + std::replace(Input.begin(), Input.end(), '\\', '/'); + ASSERT_NO_ERROR(windows::widenPath(Input, Result)); + EXPECT_EQ(Result, Expected); + + // Check the removal of "dots". + Input = ShareName + DirName + "\\.\\foo\\.\\.." 
+ FileName; + ASSERT_NO_ERROR(windows::widenPath(Input, Result)); + EXPECT_EQ(Result, Expected); +} +#endif + } // anonymous namespace diff --git a/llvm/unittests/Support/VirtualFileSystemTest.cpp b/llvm/unittests/Support/VirtualFileSystemTest.cpp index acd526409f2f4..8ac454b511e8f 100644 --- a/llvm/unittests/Support/VirtualFileSystemTest.cpp +++ b/llvm/unittests/Support/VirtualFileSystemTest.cpp @@ -2188,3 +2188,59 @@ TEST_F(VFSFromYAMLTest, WorkingDirectoryFallthroughInvalid) { Status = FS->status("foo/a"); ASSERT_TRUE(Status.getError()); } + +TEST_F(VFSFromYAMLTest, YAMLVFSWriterTest) { + ScopedDir TestDirectory("virtual-file-system-test", /*Unique*/ true); + ScopedDir _a(TestDirectory + "/a"); + ScopedFile _ab(TestDirectory + "/a/b", ""); + ScopedDir _c(TestDirectory + "/c"); + ScopedFile _cd(TestDirectory + "/c/d", ""); + ScopedDir _e(TestDirectory + "/e"); + ScopedDir _ef(TestDirectory + "/e/f"); + ScopedDir _g(TestDirectory + "/g"); + ScopedFile _h(TestDirectory + "/h", ""); + + // This test exposes a bug/shortcoming in the YAMLVFSWriter. Below we call + // addFileMapping for _a and _e, which causes _ab and _ef not to exists in + // the deserialized file system, because _a and _e got emitted as regular + // files. The counter example is _c, if we only call addFileMapping for _cd, + // things work as expected. + + vfs::YAMLVFSWriter VFSWriter; + VFSWriter.addFileMapping(_a.Path, "//root/a"); + VFSWriter.addFileMapping(_ab.Path, "//root/a/b"); + VFSWriter.addFileMapping(_cd.Path, "//root/c/d"); + VFSWriter.addFileMapping(_e.Path, "//root/e"); + VFSWriter.addFileMapping(_ef.Path, "//root/e/f"); + VFSWriter.addFileMapping(_g.Path, "//root/g"); + VFSWriter.addFileMapping(_h.Path, "//root/h"); + + std::string Buffer; + raw_string_ostream OS(Buffer); + VFSWriter.write(OS); + OS.flush(); + + IntrusiveRefCntPtr Lower(new ErrorDummyFileSystem()); + Lower->addDirectory("//root/"); + Lower->addDirectory("//root/a"); + Lower->addRegularFile("//root/a/b"); + Lower->addDirectory("//root/b"); + Lower->addDirectory("//root/c"); + Lower->addRegularFile("//root/c/d"); + Lower->addDirectory("//root/e"); + Lower->addDirectory("//root/e/f"); + Lower->addDirectory("//root/g"); + Lower->addRegularFile("//root/h"); + + IntrusiveRefCntPtr FS = getFromYAMLRawString(Buffer, Lower); + ASSERT_TRUE(FS.get() != nullptr); + + EXPECT_TRUE(FS->exists(_a.Path)); + EXPECT_FALSE(FS->exists(_ab.Path)); // FIXME: See explanation above. + EXPECT_TRUE(FS->exists(_c.Path)); + EXPECT_TRUE(FS->exists(_cd.Path)); + EXPECT_TRUE(FS->exists(_e.Path)); + EXPECT_FALSE(FS->exists(_ef.Path)); // FIXME: See explanation above. 
+ EXPECT_TRUE(FS->exists(_g.Path)); + EXPECT_TRUE(FS->exists(_h.Path)); +} diff --git a/llvm/unittests/Target/AMDGPU/CMakeLists.txt b/llvm/unittests/Target/AMDGPU/CMakeLists.txt new file mode 100644 index 0000000000000..ba6b92f378a6c --- /dev/null +++ b/llvm/unittests/Target/AMDGPU/CMakeLists.txt @@ -0,0 +1,16 @@ +include_directories( + ${CMAKE_SOURCE_DIR}/lib/Target/AMDGPU + ${CMAKE_BINARY_DIR}/lib/Target/AMDGPU + ) + +set(LLVM_LINK_COMPONENTS + AMDGPUCodeGen + AMDGPUDesc + AMDGPUInfo + MC + Support + ) + +add_llvm_target_unittest(AMDGPUTests + DwarfRegMappings.cpp + ) diff --git a/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp b/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp new file mode 100644 index 0000000000000..87facdee061dc --- /dev/null +++ b/llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp @@ -0,0 +1,88 @@ +//===- llvm/unittests/Target/AMDGPU/DwarfRegMappings.cpp ------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUSubtarget.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/MC/MCTargetOptions.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Target/TargetMachine.h" +#include "gtest/gtest.h" +#include + +using namespace llvm; + +std::once_flag flag; + +void InitializeAMDGPUTarget() { + std::call_once(flag, []() { + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); + }); +} + +std::unique_ptr +createTargetMachine(std::string TStr, StringRef CPU, StringRef FS) { + InitializeAMDGPUTarget(); + + std::string Error; + const Target *T = TargetRegistry::lookupTarget(TStr, Error); + if (!T) + return nullptr; + + TargetOptions Options; + return std::unique_ptr(static_cast( + T->createTargetMachine(TStr, CPU, FS, Options, None, None))); +} + +TEST(AMDGPUDwarfRegMappingTests, TestWave64DwarfRegMapping) { + for (auto Triple : + {"amdgcn-amd-", "amdgcn-amd-amdhsa", "amdgcn-amd-amdpal"}) { + auto TM = createTargetMachine(Triple, "gfx1010", "+wavefrontsize64"); + if (TM) { + GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), + std::string(TM->getTargetFeatureString()), *TM); + auto MRI = ST.getRegisterInfo(); + if (MRI) { + // Wave64 Dwarf register mapping test numbers + // PC_64 => 16, EXEC_MASK_64 => 17, S0 => 32, S63 => 95, + // S64 => 1088, S105 => 1129, V0 => 2560, V255 => 2815, + // A0 => 3072, A255 => 3327 + for (int llvmReg : + {16, 17, 32, 95, 1088, 1129, 2560, 2815, 3072, 3327}) { + MCRegister PCReg(*MRI->getLLVMRegNum(llvmReg, false)); + EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, false)); + } + } + } + } +} + +TEST(AMDGPUDwarfRegMappingTests, TestWave32DwarfRegMapping) { + for (auto Triple : + {"amdgcn-amd-", "amdgcn-amd-amdhsa", "amdgcn-amd-amdpal"}) { + auto TM = createTargetMachine(Triple, "gfx1010", "+wavefrontsize32"); + if (TM) { + GCNSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), + std::string(TM->getTargetFeatureString()), *TM); + auto MRI = ST.getRegisterInfo(); + if (MRI) { + // Wave32 Dwarf register mapping test numbers + // PC_32 => 0, EXEC_MASK_32 => 1, S0 => 32, S63 => 95, + // S64 => 1088, S105 => 1129, V0 => 1536, V255 => 1791, + // A0 => 2048, A255 => 2303 + for (int llvmReg : {0, 1, 32, 
95, 1088, 1129, 1536, 1791, 2048, 2303}) { + MCRegister PCReg(*MRI->getLLVMRegNum(llvmReg, false)); + EXPECT_EQ(llvmReg, MRI->getDwarfRegNum(PCReg, false)); + } + } + } + } +} diff --git a/llvm/unittests/Target/ARM/MachineInstrTest.cpp b/llvm/unittests/Target/ARM/MachineInstrTest.cpp index 94939603845db..90f84d4e5aa1f 100644 --- a/llvm/unittests/Target/ARM/MachineInstrTest.cpp +++ b/llvm/unittests/Target/ARM/MachineInstrTest.cpp @@ -10,11 +10,267 @@ using namespace llvm; +TEST(MachineInstructionHorizontalReduction, IsCorrect) { + using namespace ARM; + + auto HorizontalReduction = [](unsigned Opcode) { + switch (Opcode) { + default: + break; + case MVE_VABAVs16: + case MVE_VABAVs32: + case MVE_VABAVs8: + case MVE_VABAVu16: + case MVE_VABAVu32: + case MVE_VABAVu8: + case MVE_VADDLVs32acc: + case MVE_VADDLVs32no_acc: + case MVE_VADDLVu32acc: + case MVE_VADDLVu32no_acc: + case MVE_VADDVs16acc: + case MVE_VADDVs16no_acc: + case MVE_VADDVs32acc: + case MVE_VADDVs32no_acc: + case MVE_VADDVs8acc: + case MVE_VADDVs8no_acc: + case MVE_VADDVu16acc: + case MVE_VADDVu16no_acc: + case MVE_VADDVu32acc: + case MVE_VADDVu32no_acc: + case MVE_VADDVu8acc: + case MVE_VADDVu8no_acc: + case MVE_VMAXAVs16: + case MVE_VMAXAVs32: + case MVE_VMAXAVs8: + case MVE_VMAXNMAVf16: + case MVE_VMAXNMAVf32: + case MVE_VMAXNMVf16: + case MVE_VMAXNMVf32: + case MVE_VMAXVs16: + case MVE_VMAXVs32: + case MVE_VMAXVs8: + case MVE_VMAXVu16: + case MVE_VMAXVu32: + case MVE_VMAXVu8: + case MVE_VMINAVs16: + case MVE_VMINAVs32: + case MVE_VMINAVs8: + case MVE_VMINNMAVf16: + case MVE_VMINNMAVf32: + case MVE_VMINNMVf16: + case MVE_VMINNMVf32: + case MVE_VMINVs16: + case MVE_VMINVs32: + case MVE_VMINVs8: + case MVE_VMINVu16: + case MVE_VMINVu32: + case MVE_VMINVu8: + case MVE_VMLADAVas16: + case MVE_VMLADAVas32: + case MVE_VMLADAVas8: + case MVE_VMLADAVau16: + case MVE_VMLADAVau32: + case MVE_VMLADAVau8: + case MVE_VMLADAVaxs16: + case MVE_VMLADAVaxs32: + case MVE_VMLADAVaxs8: + case MVE_VMLADAVs16: + case MVE_VMLADAVs32: + case MVE_VMLADAVs8: + case MVE_VMLADAVu16: + case MVE_VMLADAVu32: + case MVE_VMLADAVu8: + case MVE_VMLADAVxs16: + case MVE_VMLADAVxs32: + case MVE_VMLADAVxs8: + case MVE_VMLALDAVas16: + case MVE_VMLALDAVas32: + case MVE_VMLALDAVau16: + case MVE_VMLALDAVau32: + case MVE_VMLALDAVaxs16: + case MVE_VMLALDAVaxs32: + case MVE_VMLALDAVs16: + case MVE_VMLALDAVs32: + case MVE_VMLALDAVu16: + case MVE_VMLALDAVu32: + case MVE_VMLALDAVxs16: + case MVE_VMLALDAVxs32: + case MVE_VMLSDAVas16: + case MVE_VMLSDAVas32: + case MVE_VMLSDAVas8: + case MVE_VMLSDAVaxs16: + case MVE_VMLSDAVaxs32: + case MVE_VMLSDAVaxs8: + case MVE_VMLSDAVs16: + case MVE_VMLSDAVs32: + case MVE_VMLSDAVs8: + case MVE_VMLSDAVxs16: + case MVE_VMLSDAVxs32: + case MVE_VMLSDAVxs8: + case MVE_VMLSLDAVas16: + case MVE_VMLSLDAVas32: + case MVE_VMLSLDAVaxs16: + case MVE_VMLSLDAVaxs32: + case MVE_VMLSLDAVs16: + case MVE_VMLSLDAVs32: + case MVE_VMLSLDAVxs16: + case MVE_VMLSLDAVxs32: + case MVE_VRMLALDAVHas32: + case MVE_VRMLALDAVHau32: + case MVE_VRMLALDAVHaxs32: + case MVE_VRMLALDAVHs32: + case MVE_VRMLALDAVHu32: + case MVE_VRMLALDAVHxs32: + case MVE_VRMLSLDAVHas32: + case MVE_VRMLSLDAVHaxs32: + case MVE_VRMLSLDAVHs32: + case MVE_VRMLSLDAVHxs32: + return true; + } + return false; + }; + + LLVMInitializeARMTargetInfo(); + LLVMInitializeARMTarget(); + LLVMInitializeARMTargetMC(); + + auto TT(Triple::normalize("thumbv8.1m.main-arm-none-eabi")); + std::string Error; + const Target *T = TargetRegistry::lookupTarget(TT, Error); + if (!T) { + dbgs() << Error; + return; + } + + 
TargetOptions Options; + auto TM = std::unique_ptr( + static_cast( + T->createTargetMachine(TT, "generic", "", Options, None, None, + CodeGenOpt::Default))); + ARMSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), + std::string(TM->getTargetFeatureString()), + *static_cast(TM.get()), false); + const ARMBaseInstrInfo *TII = ST.getInstrInfo(); + auto MII = TM->getMCInstrInfo(); + + for (unsigned i = 0; i < ARM::INSTRUCTION_LIST_END; ++i) { + const MCInstrDesc &Desc = TII->get(i); + + uint64_t Flags = Desc.TSFlags; + if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE) + continue; + + bool Valid = (Flags & ARMII::HorizontalReduction) != 0; + ASSERT_EQ(HorizontalReduction(i), Valid) + << MII->getName(i) + << ": mismatched expectation for tail-predicated safety\n"; + } +} + +TEST(MachineInstructionRetainsPreviousHalfElement, IsCorrect) { + using namespace ARM; + + auto RetainsPreviousHalfElement = [](unsigned Opcode) { + switch (Opcode) { + default: + break; + case MVE_VMOVNi16bh: + case MVE_VMOVNi16th: + case MVE_VMOVNi32bh: + case MVE_VMOVNi32th: + case MVE_VQMOVNs16bh: + case MVE_VQMOVNs16th: + case MVE_VQMOVNs32bh: + case MVE_VQMOVNs32th: + case MVE_VQMOVNu16bh: + case MVE_VQMOVNu16th: + case MVE_VQMOVNu32bh: + case MVE_VQMOVNu32th: + case MVE_VQMOVUNs16bh: + case MVE_VQMOVUNs16th: + case MVE_VQMOVUNs32bh: + case MVE_VQMOVUNs32th: + case MVE_VQRSHRNbhs16: + case MVE_VQRSHRNbhs32: + case MVE_VQRSHRNbhu16: + case MVE_VQRSHRNbhu32: + case MVE_VQRSHRNths16: + case MVE_VQRSHRNths32: + case MVE_VQRSHRNthu16: + case MVE_VQRSHRNthu32: + case MVE_VQRSHRUNs16bh: + case MVE_VQRSHRUNs16th: + case MVE_VQRSHRUNs32bh: + case MVE_VQRSHRUNs32th: + case MVE_VQSHRNbhs16: + case MVE_VQSHRNbhs32: + case MVE_VQSHRNbhu16: + case MVE_VQSHRNbhu32: + case MVE_VQSHRNths16: + case MVE_VQSHRNths32: + case MVE_VQSHRNthu16: + case MVE_VQSHRNthu32: + case MVE_VQSHRUNs16bh: + case MVE_VQSHRUNs16th: + case MVE_VQSHRUNs32bh: + case MVE_VQSHRUNs32th: + case MVE_VRSHRNi16bh: + case MVE_VRSHRNi16th: + case MVE_VRSHRNi32bh: + case MVE_VRSHRNi32th: + case MVE_VSHRNi16bh: + case MVE_VSHRNi16th: + case MVE_VSHRNi32bh: + case MVE_VSHRNi32th: + case MVE_VCVTf16f32bh: + case MVE_VCVTf16f32th: + case MVE_VCVTf32f16bh: + case MVE_VCVTf32f16th: + return true; + } + return false; + }; + + LLVMInitializeARMTargetInfo(); + LLVMInitializeARMTarget(); + LLVMInitializeARMTargetMC(); + + auto TT(Triple::normalize("thumbv8.1m.main-arm-none-eabi")); + std::string Error; + const Target *T = TargetRegistry::lookupTarget(TT, Error); + if (!T) { + dbgs() << Error; + return; + } + + TargetOptions Options; + auto TM = std::unique_ptr( + static_cast( + T->createTargetMachine(TT, "generic", "", Options, None, None, + CodeGenOpt::Default))); + ARMSubtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), + std::string(TM->getTargetFeatureString()), + *static_cast(TM.get()), false); + const ARMBaseInstrInfo *TII = ST.getInstrInfo(); + auto MII = TM->getMCInstrInfo(); + + for (unsigned i = 0; i < ARM::INSTRUCTION_LIST_END; ++i) { + const MCInstrDesc &Desc = TII->get(i); + + uint64_t Flags = Desc.TSFlags; + if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE) + continue; + + bool Valid = (Flags & ARMII::RetainsPreviousHalfElement) != 0; + ASSERT_EQ(RetainsPreviousHalfElement(i), Valid) + << MII->getName(i) + << ": mismatched expectation for tail-predicated safety\n"; + } +} // Test for instructions that aren't immediately obviously valid within a // tail-predicated loop. This should be marked up in their tablegen // descriptions. 
Currently we, conservatively, disallow: // - cross beat carries. -// - narrowing of results. // - complex operations. // - horizontal operations. // - byte swapping. diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp index cd32669ca2f61..4fa6a09f2022f 100644 --- a/llvm/unittests/Transforms/Utils/LocalTest.cpp +++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp @@ -1001,3 +1001,63 @@ TEST(Local, SimplifyCFGWithNullAC) { // %test.bb is expected to be simplified by FoldCondBranchOnPHI. EXPECT_TRUE(simplifyCFG(TestBB, TTI, Options)); } + +TEST(Local, CanReplaceOperandWithVariable) { + LLVMContext Ctx; + Module M("test_module", Ctx); + IRBuilder<> B(Ctx); + + FunctionType *FnType = + FunctionType::get(Type::getVoidTy(Ctx), {}, false); + + FunctionType *VarArgFnType = + FunctionType::get(Type::getVoidTy(Ctx), {B.getInt32Ty()}, true); + + Function *TestBody = Function::Create(FnType, GlobalValue::ExternalLinkage, + 0, "", &M); + + BasicBlock *BB0 = BasicBlock::Create(Ctx, "", TestBody); + B.SetInsertPoint(BB0); + + Value *Intrin = M.getOrInsertFunction("llvm.foo", FnType).getCallee(); + Value *Func = M.getOrInsertFunction("foo", FnType).getCallee(); + Value *VarArgFunc + = M.getOrInsertFunction("foo.vararg", VarArgFnType).getCallee(); + Value *VarArgIntrin + = M.getOrInsertFunction("llvm.foo.vararg", VarArgFnType).getCallee(); + + auto *CallToIntrin = B.CreateCall(Intrin); + auto *CallToFunc = B.CreateCall(Func); + + // Test if it's valid to replace the callee operand. + EXPECT_FALSE(canReplaceOperandWithVariable(CallToIntrin, 0)); + EXPECT_TRUE(canReplaceOperandWithVariable(CallToFunc, 0)); + + // That it's invalid to replace an argument in the variadic argument list for + // an intrinsic, but OK for a normal function. + auto *CallToVarArgFunc = B.CreateCall( + VarArgFunc, {B.getInt32(0), B.getInt32(1), B.getInt32(2)}); + EXPECT_TRUE(canReplaceOperandWithVariable(CallToVarArgFunc, 0)); + EXPECT_TRUE(canReplaceOperandWithVariable(CallToVarArgFunc, 1)); + EXPECT_TRUE(canReplaceOperandWithVariable(CallToVarArgFunc, 2)); + EXPECT_TRUE(canReplaceOperandWithVariable(CallToVarArgFunc, 3)); + + auto *CallToVarArgIntrin = B.CreateCall( + VarArgIntrin, {B.getInt32(0), B.getInt32(1), B.getInt32(2)}); + EXPECT_TRUE(canReplaceOperandWithVariable(CallToVarArgIntrin, 0)); + EXPECT_FALSE(canReplaceOperandWithVariable(CallToVarArgIntrin, 1)); + EXPECT_FALSE(canReplaceOperandWithVariable(CallToVarArgIntrin, 2)); + EXPECT_FALSE(canReplaceOperandWithVariable(CallToVarArgIntrin, 3)); + + // Test that it's invalid to replace gcroot operands, even though it can't use + // immarg. 
+ Type *PtrPtr = B.getInt8Ty()->getPointerTo(0); + Value *Alloca = B.CreateAlloca(PtrPtr, (unsigned)0); + CallInst *GCRoot = B.CreateIntrinsic(Intrinsic::gcroot, {}, + {Alloca, Constant::getNullValue(PtrPtr)}); + EXPECT_TRUE(canReplaceOperandWithVariable(GCRoot, 0)); // Alloca + EXPECT_FALSE(canReplaceOperandWithVariable(GCRoot, 1)); + EXPECT_FALSE(canReplaceOperandWithVariable(GCRoot, 2)); + + BB0->dropAllReferences(); +} diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index debf7b9561013..5269a9a17543f 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -89,6 +89,9 @@ TEST_F(VPlanHCFGTest, testBuildHCFGInnerLoop) { EXPECT_EQ(IndvarAdd, ICmp->getOperand(0)); EXPECT_EQ(VecBB->getCondBit(), ICmp); + // Add an external value to check we do not print the list of external values, + // as this is not required with the new printing. + Plan->addVPValue(&*F->arg_begin()); std::string FullDump; raw_string_ostream(FullDump) << *Plan; EXPECT_EQ(R"(digraph VPlan { diff --git a/llvm/utils/TableGen/CodeGenIntrinsics.h b/llvm/utils/TableGen/CodeGenIntrinsics.h index 723bbe0cc23d8..824bb944753bf 100644 --- a/llvm/utils/TableGen/CodeGenIntrinsics.h +++ b/llvm/utils/TableGen/CodeGenIntrinsics.h @@ -123,6 +123,9 @@ struct CodeGenIntrinsic { /// True if the intrinsic is no-return. bool isNoReturn; + /// True if the intrinsic is no-sync. + bool isNoSync; + /// True if the intrinsic is will-return. bool isWillReturn; diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp index de41692c6f45b..921d20e7af765 100644 --- a/llvm/utils/TableGen/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/CodeGenTarget.cpp @@ -607,6 +607,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) { isCommutative = false; canThrow = false; isNoReturn = false; + isNoSync = false; isWillReturn = false; isCold = false; isNoDuplicate = false; @@ -726,8 +727,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) { // variants with iAny types; otherwise, if the intrinsic is not // overloaded, all the types can be specified directly. 
assert(((!TyEl->isSubClassOf("LLVMExtendedType") && - !TyEl->isSubClassOf("LLVMTruncatedType") && - !TyEl->isSubClassOf("LLVMScalarOrSameVectorWidth")) || + !TyEl->isSubClassOf("LLVMTruncatedType")) || VT == MVT::iAny || VT == MVT::vAny) && "Expected iAny or vAny type"); } else @@ -772,6 +772,8 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) { isConvergent = true; else if (Property->getName() == "IntrNoReturn") isNoReturn = true; + else if (Property->getName() == "IntrNoSync") + isNoSync = true; else if (Property->getName() == "IntrWillReturn") isWillReturn = true; else if (Property->getName() == "IntrCold") diff --git a/llvm/utils/TableGen/IntrinsicEmitter.cpp b/llvm/utils/TableGen/IntrinsicEmitter.cpp index 3ac9cc857f02a..b0ac385c19390 100644 --- a/llvm/utils/TableGen/IntrinsicEmitter.cpp +++ b/llvm/utils/TableGen/IntrinsicEmitter.cpp @@ -579,6 +579,9 @@ struct AttributeComparator { if (L->isNoReturn != R->isNoReturn) return R->isNoReturn; + if (L->isNoSync != R->isNoSync) + return R->isNoSync; + if (L->isWillReturn != R->isWillReturn) return R->isWillReturn; @@ -720,8 +723,8 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints, if (!intrinsic.canThrow || (intrinsic.ModRef != CodeGenIntrinsic::ReadWriteMem && !intrinsic.hasSideEffects) || - intrinsic.isNoReturn || intrinsic.isWillReturn || intrinsic.isCold || - intrinsic.isNoDuplicate || intrinsic.isConvergent || + intrinsic.isNoReturn || intrinsic.isNoSync || intrinsic.isWillReturn || + intrinsic.isCold || intrinsic.isNoDuplicate || intrinsic.isConvergent || intrinsic.isSpeculatable) { OS << " const Attribute::AttrKind Atts[] = {"; bool addComma = false; @@ -735,6 +738,12 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints, OS << "Attribute::NoReturn"; addComma = true; } + if (intrinsic.isNoSync) { + if (addComma) + OS << ","; + OS << "Attribute::NoSync"; + addComma = true; + } if (intrinsic.isWillReturn) { if (addComma) OS << ","; diff --git a/llvm/utils/TableGen/RegisterBankEmitter.cpp b/llvm/utils/TableGen/RegisterBankEmitter.cpp index 5d0751d144516..586f857b1fb0d 100644 --- a/llvm/utils/TableGen/RegisterBankEmitter.cpp +++ b/llvm/utils/TableGen/RegisterBankEmitter.cpp @@ -37,12 +37,12 @@ class RegisterBank { /// The register classes that are covered by the register bank. RegisterClassesTy RCs; - /// The register classes with the largest register size for each HwMode. - std::vector RCsWithLargestRegSize; + /// The register class with the largest register size. + const CodeGenRegisterClass *RCWithLargestRegsSize; public: - RegisterBank(const Record &TheDef, unsigned NumModeIds) - : TheDef(TheDef), RCs(), RCsWithLargestRegSize(NumModeIds) {} + RegisterBank(const Record &TheDef) + : TheDef(TheDef), RCs(), RCWithLargestRegsSize(nullptr) {} /// Get the human-readable name for the bank. StringRef getName() const { return TheDef.getValueAsString("Name"); } @@ -54,10 +54,6 @@ class RegisterBank { return (TheDef.getName() + "CoverageData").str(); } - std::string getSizesArrayName() const { - return (TheDef.getName() + "Sizes").str(); - } - /// Get the name of the global instance variable. StringRef getInstanceVarName() const { return TheDef.getName(); } @@ -87,20 +83,18 @@ class RegisterBank { // register size anywhere (we could sum the sizes of the subregisters // but there may be additional bits too) and we can't derive it from // the VT's reliably due to Untyped. 
- unsigned NumModeIds = RCsWithLargestRegSize.size(); - for (unsigned M = 0; M < NumModeIds; ++M) { - if (RCsWithLargestRegSize[M] == nullptr) - RCsWithLargestRegSize[M] = RC; - else if (RCsWithLargestRegSize[M]->RSI.get(M).SpillSize < - RC->RSI.get(M).SpillSize) - RCsWithLargestRegSize[M] = RC; - assert(RCsWithLargestRegSize[M] && "RC was nullptr?"); - } + if (RCWithLargestRegsSize == nullptr) + RCWithLargestRegsSize = RC; + else if (RCWithLargestRegsSize->RSI.get(DefaultMode).SpillSize < + RC->RSI.get(DefaultMode).SpillSize) + RCWithLargestRegsSize = RC; + assert(RCWithLargestRegsSize && "RC was nullptr?"); + RCs.emplace_back(RC); } - const CodeGenRegisterClass *getRCWithLargestRegsSize(unsigned HwMode) const { - return RCsWithLargestRegSize[HwMode]; + const CodeGenRegisterClass *getRCWithLargestRegsSize() const { + return RCWithLargestRegsSize; } iterator_range @@ -153,7 +147,7 @@ void RegisterBankEmitter::emitBaseClassDefinition( OS << "private:\n" << " static RegisterBank *RegBanks[];\n\n" << "protected:\n" - << " " << TargetName << "GenRegisterBankInfo(unsigned HwMode = 0);\n" + << " " << TargetName << "GenRegisterBankInfo();\n" << "\n"; } @@ -219,7 +213,6 @@ void RegisterBankEmitter::emitBaseClassImplementation( raw_ostream &OS, StringRef TargetName, std::vector &Banks) { const CodeGenRegBank &RegisterClassHierarchy = Target.getRegBank(); - const CodeGenHwModes &CGH = Target.getHwModes(); OS << "namespace llvm {\n" << "namespace " << TargetName << " {\n"; @@ -247,30 +240,14 @@ void RegisterBankEmitter::emitBaseClassImplementation( } OS << "\n"; - unsigned NumModeIds = CGH.getNumModeIds(); - for (const auto &Bank : Banks) { - OS << "const unsigned " << Bank.getSizesArrayName() << "[] = {\n"; - for (unsigned M = 0; M < NumModeIds; ++M) { - const CodeGenRegisterClass &RC = *Bank.getRCWithLargestRegsSize(M); - unsigned Size = RC.RSI.get(M).SpillSize; - OS << " // Mode = " << M << " ("; - if (M == 0) - OS << "Default"; - else - OS << CGH.getMode(M).Name; - OS << ")\n"; - OS << " " << Size << ",\n"; - } - OS << "};\n"; - } - OS << "\n"; - for (const auto &Bank : Banks) { std::string QualifiedBankID = (TargetName + "::" + Bank.getEnumeratorName()).str(); + const CodeGenRegisterClass &RC = *Bank.getRCWithLargestRegsSize(); + unsigned Size = RC.RSI.get(DefaultMode).SpillSize; OS << "RegisterBank " << Bank.getInstanceVarName() << "(/* ID */ " << QualifiedBankID << ", /* Name */ \"" << Bank.getName() - << "\", /* Sizes */ " << Bank.getInstanceVarName() << "Sizes, " + << "\", /* Size */ " << Size << ", " << "/* CoveredRegClasses */ " << Bank.getCoverageArrayName() << ", /* NumRegClasses */ " << RegisterClassHierarchy.getRegClasses().size() << ");\n"; @@ -285,9 +262,9 @@ void RegisterBankEmitter::emitBaseClassImplementation( OS << "};\n\n"; OS << TargetName << "GenRegisterBankInfo::" << TargetName - << "GenRegisterBankInfo(unsigned HwMode)\n" + << "GenRegisterBankInfo()\n" << " : RegisterBankInfo(RegBanks, " << TargetName - << "::NumRegisterBanks, HwMode) {\n" + << "::NumRegisterBanks) {\n" << " // Assert that RegBank indices match their ID's\n" << "#ifndef NDEBUG\n" << " unsigned Index = 0;\n" @@ -301,12 +278,11 @@ void RegisterBankEmitter::emitBaseClassImplementation( void RegisterBankEmitter::run(raw_ostream &OS) { StringRef TargetName = Target.getName(); const CodeGenRegBank &RegisterClassHierarchy = Target.getRegBank(); - const CodeGenHwModes &CGH = Target.getHwModes(); std::vector Banks; for (const auto &V : Records.getAllDerivedDefinitions("RegisterBank")) { SmallPtrSet VisitedRCs; - 
RegisterBank Bank(*V, CGH.getNumModeIds()); + RegisterBank Bank(*V); for (const CodeGenRegisterClass *RC : Bank.getExplicitlySpecifiedRegisterClasses(RegisterClassHierarchy)) { diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py index 9dc42d46dc9f7..3186628b50a26 100644 --- a/llvm/utils/UpdateTestChecks/common.py +++ b/llvm/utils/UpdateTestChecks/common.py @@ -67,16 +67,16 @@ def invoke_tool(exe, cmd_args, ir): UTC_ARGS_CMD = re.compile(r'.*' + UTC_ARGS_KEY + '\s*(?P.*)\s*$') OPT_FUNCTION_RE = re.compile( - r'^\s*define\s+(?:internal\s+)?[^@]*@(?P[\w-]+?)\s*' - r'(?P\((\)|(.*?[\w\.\-]+?)\))[^{]*)\{\n(?P.*?)^\}$', + r'^\s*define\s+(?:internal\s+)?[^@]*@(?P[\w.-]+?)\s*' + r'(?P\((\)|(.*?[\w.-]+?)\))[^{]*)\{\n(?P.*?)^\}$', flags=(re.M | re.S)) ANALYZE_FUNCTION_RE = re.compile( - r'^\s*\'(?P[\w\s-]+?)\'\s+for\s+function\s+\'(?P[\w-]+?)\':' + r'^\s*\'(?P[\w\s-]+?)\'\s+for\s+function\s+\'(?P[\w.-]+?)\':' r'\s*\n(?P.*)$', flags=(re.X | re.S)) -IR_FUNCTION_RE = re.compile(r'^\s*define\s+(?:internal\s+)?[^@]*@([\w.]+)\s*\(') +IR_FUNCTION_RE = re.compile(r'^\s*define\s+(?:internal\s+)?[^@]*@([\w.-]+)\s*\(') TRIPLE_IR_RE = re.compile(r'^\s*target\s+triple\s*=\s*"([^"]+)"$') TRIPLE_ARG_RE = re.compile(r'-mtriple[= ]([^ ]+)') MARCH_ARG_RE = re.compile(r'-march[= ]([^ ]+)') @@ -215,7 +215,7 @@ def build_function_body_dictionary(function_re, scrubber, scrubber_args, raw_too # Match things that look at identifiers, but only if they are followed by # spaces, commas, paren, or end of the string -IR_VALUE_RE = re.compile(r'(\s+)%([\w\.\-]+?)([,\s\(\)]|\Z)') +IR_VALUE_RE = re.compile(r'(\s+)%([\w.-]+?)([,\s\(\)]|\Z)') # Create a FileCheck variable name based on an IR name. def get_value_name(var): diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn index 06f2a8f8efe49..a34f9dcf9c0a0 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn @@ -46,6 +46,7 @@ static_library("bugprone") { "SignedCharMisuseCheck.cpp", "SizeofContainerCheck.cpp", "SizeofExpressionCheck.cpp", + "SpuriouslyWakeUpFunctionsCheck.cpp", "StringConstructorCheck.cpp", "StringIntegerAssignmentCheck.cpp", "StringLiteralWithEmbeddedNulCheck.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Core/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Core/BUILD.gn index 228d13446cabf..f37cc42c481e8 100644 --- a/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Core/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/StaticAnalyzer/Core/BUILD.gn @@ -57,6 +57,7 @@ static_library("Core") { "Store.cpp", "SubEngine.cpp", "SymbolManager.cpp", + "TextDiagnostics.cpp", "WorkList.cpp", ] } diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/profile/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/profile/BUILD.gn index 777b2da40be8c..f5fc87c102682 100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/profile/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/profile/BUILD.gn @@ -37,6 +37,7 @@ static_library("profile") { "InstrProfilingBiasVar.c", "InstrProfilingBuffer.c", "InstrProfilingFile.c", + "InstrProfilingInternal.c", "InstrProfilingInternal.h", "InstrProfilingMerge.c", "InstrProfilingMergeFile.c", diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/scudo/standalone/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/scudo/standalone/BUILD.gn index 88b820ad03ed0..1185a3165c78a 
100644 --- a/llvm/utils/gn/secondary/compiler-rt/lib/scudo/standalone/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/lib/scudo/standalone/BUILD.gn @@ -25,7 +25,7 @@ source_set("sources") { "flags_parser.h", "fuchsia.cpp", "fuchsia.h", - "interface.h", + "include/scudo/interface.h", "internal_defs.h", "linux.cpp", "linux.h", @@ -88,7 +88,11 @@ source_set("cxx_wrapper_sources") { } config("scudo_config") { - include_dirs = [ "//compiler-rt/lib/scudo/standalone" ] + include_dirs = [ + "include", + "//compiler-rt/lib/scudo/standalone", + "//compiler-rt/lib/scudo/standalone/include", + ] if (current_os == "android") { cflags = [ "-fno-emulated-tls" ] } diff --git a/llvm/utils/gn/secondary/lld/COFF/BUILD.gn b/llvm/utils/gn/secondary/lld/COFF/BUILD.gn index 4e0a2074a3a41..ceb1b22d81fd9 100644 --- a/llvm/utils/gn/secondary/lld/COFF/BUILD.gn +++ b/llvm/utils/gn/secondary/lld/COFF/BUILD.gn @@ -34,6 +34,7 @@ static_library("COFF") { "DriverUtils.cpp", "ICF.cpp", "InputFiles.cpp", + "LLDMapFile.cpp", "LTO.cpp", "MapFile.cpp", "MarkLive.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn index b5d8e62b5d2be..b1200b6c60d29 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Support/BUILD.gn @@ -93,6 +93,7 @@ static_library("Support") { "MathExtras.cpp", "MemoryBuffer.cpp", "NativeFormatting.cpp", + "OptimalLayout.cpp", "Optional.cpp", "Parallel.cpp", "PluginLoader.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/targets.gni b/llvm/utils/gn/secondary/llvm/lib/Target/targets.gni index 8306b5daabcce..2590432529b6a 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/targets.gni +++ b/llvm/utils/gn/secondary/llvm/lib/Target/targets.gni @@ -44,6 +44,7 @@ if (llvm_targets_to_build == "host") { # and remember which targets are built where needed (for conditionally-built # unittest targets). llvm_build_AArch64 = false +llvm_build_AMDGPU = false llvm_build_ARM = false llvm_build_BPF = false llvm_build_Mips = false @@ -53,6 +54,8 @@ llvm_build_X86 = false foreach(target, llvm_targets_to_build) { if (target == "AArch64") { llvm_build_AArch64 = true + } else if (target == "AMDGPU") { + llvm_build_AMDGPU = true } else if (target == "ARM") { llvm_build_ARM = true } else if (target == "BPF") { @@ -65,7 +68,7 @@ foreach(target, llvm_targets_to_build) { llvm_build_WebAssembly = true } else if (target == "X86") { llvm_build_X86 = true - } else if (target == "AMDGPU" || target == "AVR" || target == "Hexagon" || + } else if (target == "AVR" || target == "Hexagon" || target == "Lanai" || target == "NVPTX" || target == "RISCV" || target == "Sparc" || target == "SystemZ") { # Nothing to do. 
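(Illustrative sketch, not part of the patch: the targets.gni hunk above adds a per-target llvm_build_AMDGPU flag using the same pattern as the existing flags. Assuming a hypothetical target named Foo, the flag is declared and consumed roughly as below; the real AMDGPU case appears in the llvm/unittests/BUILD.gn hunk that follows.)

    # In targets.gni: default the flag off, then set it while iterating
    # over llvm_targets_to_build.
    llvm_build_Foo = false
    foreach(target, llvm_targets_to_build) {
      if (target == "Foo") {
        llvm_build_Foo = true
      }
    }

    # In a dependent BUILD.gn: gate target-specific deps on the flag.
    if (llvm_build_Foo) {
      deps += [ "Target/Foo:FooTests" ]
    }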
diff --git a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn index fb951086571de..3d960d501e425 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/BUILD.gn @@ -56,6 +56,12 @@ group("unittests") { "tools/llvm-exegesis/AArch64:LLVMExegesisAArch64Tests", ] } + if (llvm_build_AMDGPU) { + deps += [ + "MC/AMDGPU:AMDGPUDwarfTests", + "Target/AMDGPU:AMDGPUTests", + ] + } if (llvm_build_ARM) { deps += [ "Target/ARM:ARMTests", diff --git a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn index 4e3e776790c20..92b3f4e37810e 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/IR/BUILD.gn @@ -39,6 +39,7 @@ unittest("IRTests") { "TypesTest.cpp", "UseTest.cpp", "UserTest.cpp", + "VPIntrinsicTest.cpp", "ValueHandleTest.cpp", "ValueMapTest.cpp", "ValueTest.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/MC/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/MC/AMDGPU/BUILD.gn new file mode 100644 index 0000000000000..8d1763e65f0ce --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/unittests/MC/AMDGPU/BUILD.gn @@ -0,0 +1,14 @@ +import("//llvm/utils/unittest/unittest.gni") + +unittest("AMDGPUDwarfTests") { + deps = [ + "//llvm/lib/MC", + "//llvm/lib/Support", + "//llvm/lib/Target/AMDGPU:LLVMAMDGPUCodeGen", + "//llvm/lib/Target/AMDGPU/MCTargetDesc", + "//llvm/lib/Target/AMDGPU/TargetInfo", + ] + sources = [ + "DwarfRegMappings.cpp", + ] +} diff --git a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn index 3204dd3602671..0ba402cf9e815 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Support/BUILD.gn @@ -54,6 +54,7 @@ unittest("SupportTests") { "MemoryBufferTest.cpp", "MemoryTest.cpp", "NativeFormatTests.cpp", + "OptimalLayoutTest.cpp", "ParallelTest.cpp", "Path.cpp", "ProcessTest.cpp", diff --git a/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn new file mode 100644 index 0000000000000..238ff86b771c3 --- /dev/null +++ b/llvm/utils/gn/secondary/llvm/unittests/Target/AMDGPU/BUILD.gn @@ -0,0 +1,17 @@ +import("//llvm/utils/unittest/unittest.gni") + +unittest("AMDGPUTests") { + deps = [ + "//llvm/lib/Support", + "//llvm/lib/Target", + "//llvm/lib/Target/AMDGPU:LLVMAMDGPUCodeGen", + "//llvm/lib/Target/AMDGPU/MCTargetDesc", + "//llvm/lib/Target/AMDGPU/TargetInfo", + "//llvm/lib/Target/AMDGPU/Utils", + ] + include_dirs = [ "//llvm/lib/Target/AMDGPU" ] + sources = [ + # Make `gn format` not collapse this, for sync_source_lists_from_cmake.py. + "DwarfRegMappings.cpp", + ] +} diff --git a/llvm/utils/lit/lit/Test.py b/llvm/utils/lit/lit/Test.py index 62f1bbf1f03a5..000bcf8fc38fe 100644 --- a/llvm/utils/lit/lit/Test.py +++ b/llvm/utils/lit/lit/Test.py @@ -220,6 +220,10 @@ def __init__(self, suite, path_in_suite, config, file_path = None): # triple parts. All of them must be False for the test to run. self.unsupported = [] + # An optional number of retries allowed before the test finally succeeds. + # The test is run at most once plus the number of retries specified here. + self.allowed_retries = getattr(config, 'test_retry_attempts', 0) + # The test result, once complete. 
self.result = None diff --git a/llvm/utils/lit/lit/TestRunner.py b/llvm/utils/lit/lit/TestRunner.py index 96411c98ee334..4ee3b673c28f9 100644 --- a/llvm/utils/lit/lit/TestRunner.py +++ b/llvm/utils/lit/lit/TestRunner.py @@ -1182,13 +1182,15 @@ class ParserKind(object): LIST: A keyword taking a comma-separated list of values. BOOLEAN_EXPR: A keyword taking a comma-separated list of boolean expressions. Ex 'XFAIL:' + INTEGER: A keyword taking a single integer. Ex 'ALLOW_RETRIES:' CUSTOM: A keyword with custom parsing semantics. """ TAG = 0 COMMAND = 1 LIST = 2 BOOLEAN_EXPR = 3 - CUSTOM = 4 + INTEGER = 4 + CUSTOM = 5 @staticmethod def allowedKeywordSuffixes(value): @@ -1196,6 +1198,7 @@ def allowedKeywordSuffixes(value): ParserKind.COMMAND: [':'], ParserKind.LIST: [':'], ParserKind.BOOLEAN_EXPR: [':'], + ParserKind.INTEGER: [':'], ParserKind.CUSTOM: [':', '.'] } [value] @@ -1205,6 +1208,7 @@ def str(value): ParserKind.COMMAND: 'COMMAND', ParserKind.LIST: 'LIST', ParserKind.BOOLEAN_EXPR: 'BOOLEAN_EXPR', + ParserKind.INTEGER: 'INTEGER', ParserKind.CUSTOM: 'CUSTOM' } [value] @@ -1247,6 +1251,8 @@ def __init__(self, keyword, kind, parser=None, initial_value=None): self.parser = self._handleList elif kind == ParserKind.BOOLEAN_EXPR: self.parser = self._handleBooleanExpr + elif kind == ParserKind.INTEGER: + self.parser = self._handleSingleInteger elif kind == ParserKind.TAG: self.parser = self._handleTag elif kind == ParserKind.CUSTOM: @@ -1311,6 +1317,18 @@ def _handleList(line_number, line, output): output.extend([s.strip() for s in line.split(',')]) return output + @staticmethod + def _handleSingleInteger(line_number, line, output): + """A parser for INTEGER type keywords""" + if output is None: + output = [] + try: + n = int(line) + except ValueError: + raise ValueError("INTEGER parser requires the input to be an integer (got {})".format(line)) + output.append(n) + return output + @staticmethod def _handleBooleanExpr(line_number, line, output): """A parser for BOOLEAN_EXPR type keywords""" @@ -1331,8 +1349,8 @@ def _handleBooleanExpr(line_number, line, output): def parseIntegratedTestScript(test, additional_parsers=[], require_script=True): """parseIntegratedTestScript - Scan an LLVM/Clang style integrated test - script and extract the lines to 'RUN' as well as 'XFAIL' and 'REQUIRES' - and 'UNSUPPORTED' information. + script and extract the lines to 'RUN' as well as 'XFAIL', 'REQUIRES', + 'UNSUPPORTED' and 'ALLOW_RETRIES' information. If additional parsers are specified then the test is also scanned for the keywords they specify and all matches are passed to the custom parser. @@ -1353,6 +1371,7 @@ def parseIntegratedTestScript(test, additional_parsers=[], initial_value=test.requires), IntegratedTestKeywordParser('UNSUPPORTED:', ParserKind.BOOLEAN_EXPR, initial_value=test.unsupported), + IntegratedTestKeywordParser('ALLOW_RETRIES:', ParserKind.INTEGER), IntegratedTestKeywordParser('END.', ParserKind.TAG) ] keyword_parsers = {p.keyword: p for p in builtin_parsers} @@ -1412,6 +1431,14 @@ def parseIntegratedTestScript(test, additional_parsers=[], "Test does not support the following features " "and/or targets: %s" % msg) + # Handle ALLOW_RETRIES: + allowed_retries = keyword_parsers['ALLOW_RETRIES:'].getValue() + if allowed_retries: + if len(allowed_retries) > 1: + return lit.Test.Result(Test.UNRESOLVED, + "Test has more than one ALLOW_RETRIES lines") + test.allowed_retries = allowed_retries[0] + # Enforce limit_to_features. 
if not test.isWithinFeatureLimits(): msg = ', '.join(test.config.limit_to_features) @@ -1461,13 +1488,17 @@ def _runShTest(test, litConfig, useExternalSh, script, tmpBase): def executeShTest(test, litConfig, useExternalSh, - extra_substitutions=[]): + extra_substitutions=[], + preamble_commands=[]): if test.config.unsupported: return lit.Test.Result(Test.UNSUPPORTED, 'Test is unsupported') - script = parseIntegratedTestScript(test) - if isinstance(script, lit.Test.Result): - return script + script = list(preamble_commands) + parsed = parseIntegratedTestScript(test, require_script=not script) + if isinstance(parsed, lit.Test.Result): + return parsed + script += parsed + if litConfig.noExecute: return lit.Test.Result(Test.PASS) @@ -1477,10 +1508,8 @@ def executeShTest(test, litConfig, useExternalSh, normalize_slashes=useExternalSh) script = applySubstitutions(script, substitutions) - # Re-run failed tests up to test_retry_attempts times. - attempts = 1 - if hasattr(test.config, 'test_retry_attempts'): - attempts += test.config.test_retry_attempts + # Re-run failed tests up to test.allowed_retries times. + attempts = test.allowed_retries + 1 for i in range(attempts): res = _runShTest(test, litConfig, useExternalSh, script, tmpBase) if res.code != Test.FAIL: diff --git a/llvm/utils/lit/tests/Inputs/allow-retries/does-not-succeed-within-limit.py b/llvm/utils/lit/tests/Inputs/allow-retries/does-not-succeed-within-limit.py new file mode 100644 index 0000000000000..05e3f35b6f81e --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/allow-retries/does-not-succeed-within-limit.py @@ -0,0 +1,3 @@ +# ALLOW_RETRIES: 3 + +# RUN: false diff --git a/llvm/utils/lit/tests/Inputs/allow-retries/lit.cfg b/llvm/utils/lit/tests/Inputs/allow-retries/lit.cfg new file mode 100644 index 0000000000000..eed69f389ed07 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/allow-retries/lit.cfg @@ -0,0 +1,9 @@ +import lit.formats +config.name = 'allow-retries' +config.suffixes = ['.py'] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None + +config.substitutions.append(('%python', lit_config.params.get('python', ''))) +config.substitutions.append(('%counter', lit_config.params.get('counter', ''))) diff --git a/llvm/utils/lit/tests/Inputs/allow-retries/more-than-one-allow-retries-lines.py b/llvm/utils/lit/tests/Inputs/allow-retries/more-than-one-allow-retries-lines.py new file mode 100644 index 0000000000000..14fb6b26661a5 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/allow-retries/more-than-one-allow-retries-lines.py @@ -0,0 +1,4 @@ +# ALLOW_RETRIES: 3 +# ALLOW_RETRIES: 5 + +# RUN: true diff --git a/llvm/utils/lit/tests/Inputs/allow-retries/not-a-valid-integer.py b/llvm/utils/lit/tests/Inputs/allow-retries/not-a-valid-integer.py new file mode 100644 index 0000000000000..d624de900b7f0 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/allow-retries/not-a-valid-integer.py @@ -0,0 +1,3 @@ +# ALLOW_RETRIES: not-an-integer + +# RUN: true diff --git a/llvm/utils/lit/tests/Inputs/allow-retries/succeeds-within-limit.py b/llvm/utils/lit/tests/Inputs/allow-retries/succeeds-within-limit.py new file mode 100644 index 0000000000000..45ac9433fc7ef --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/allow-retries/succeeds-within-limit.py @@ -0,0 +1,24 @@ +# ALLOW_RETRIES: 5 + +# RUN: "%python" "%s" "%counter" + +import sys +import os + +counter_file = sys.argv[1] + +# The first time the test is run, initialize the counter to 1. 
+if not os.path.exists(counter_file): + with open(counter_file, 'w') as counter: + counter.write("1") + +# Succeed if this is the fourth time we're being run. +with open(counter_file, 'r') as counter: + num = int(counter.read()) + if num == 4: + sys.exit(0) + +# Otherwise, increment the counter and fail +with open(counter_file, 'w') as counter: + counter.write(str(num + 1)) + sys.exit(1) diff --git a/llvm/utils/lit/tests/Inputs/shtest-inject/lit.cfg b/llvm/utils/lit/tests/Inputs/shtest-inject/lit.cfg new file mode 100644 index 0000000000000..65a02e0081a25 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-inject/lit.cfg @@ -0,0 +1,17 @@ +import lit + +class CustomFormat(lit.formats.TestFormat): + def execute(self, test, litConfig): + commands = [ + 'echo "THIS WAS"', + 'echo "INJECTED"' + ] + return lit.TestRunner.executeShTest(test, litConfig, + useExternalSh=False, + preamble_commands=commands) + +config.name = 'shtest-inject' +config.suffixes = ['.txt'] +config.test_format = CustomFormat() +config.test_source_root = None +config.test_exec_root = None diff --git a/llvm/utils/lit/tests/Inputs/shtest-inject/test-empty.txt b/llvm/utils/lit/tests/Inputs/shtest-inject/test-empty.txt new file mode 100644 index 0000000000000..293607453a741 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-inject/test-empty.txt @@ -0,0 +1,3 @@ + +# This test voluntarily has no RUN lines or anything else. The RUN lines are +# injected by the test format. diff --git a/llvm/utils/lit/tests/Inputs/shtest-inject/test-many.txt b/llvm/utils/lit/tests/Inputs/shtest-inject/test-many.txt new file mode 100644 index 0000000000000..bc990580edfda --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-inject/test-many.txt @@ -0,0 +1,7 @@ + +# This test has several RUN lines, but more run lines are prepended to it by +# the test format in use. + +# RUN: echo "IN THE FILE" +# RUN: echo "IF IT WORKS" +# RUN: echo "AS EXPECTED" diff --git a/llvm/utils/lit/tests/Inputs/shtest-inject/test-one.txt b/llvm/utils/lit/tests/Inputs/shtest-inject/test-one.txt new file mode 100644 index 0000000000000..ab66fc9ef7496 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-inject/test-one.txt @@ -0,0 +1,5 @@ + +# This test has one RUN line, but more run lines are prepended to it by the +# test format in use. + +# RUN: echo "IN THE FILE" diff --git a/llvm/utils/lit/tests/Inputs/test_retry_attempts/lit.cfg b/llvm/utils/lit/tests/Inputs/test_retry_attempts/lit.cfg new file mode 100644 index 0000000000000..a3b660fbaef32 --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/test_retry_attempts/lit.cfg @@ -0,0 +1,10 @@ +import lit.formats +config.name = 'test_retry_attempts' +config.suffixes = ['.py'] +config.test_format = lit.formats.ShTest() +config.test_source_root = None +config.test_exec_root = None + +config.test_retry_attempts = 5 +config.substitutions.append(('%python', lit_config.params.get('python', ''))) +config.substitutions.append(('%counter', lit_config.params.get('counter', ''))) diff --git a/llvm/utils/lit/tests/Inputs/test_retry_attempts/test.py b/llvm/utils/lit/tests/Inputs/test_retry_attempts/test.py new file mode 100644 index 0000000000000..ee8a92cc5d8ff --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/test_retry_attempts/test.py @@ -0,0 +1,22 @@ +# RUN: "%python" "%s" "%counter" + +import sys +import os + +counter_file = sys.argv[1] + +# The first time the test is run, initialize the counter to 1. 
+if not os.path.exists(counter_file): + with open(counter_file, 'w') as counter: + counter.write("1") + +# Succeed if this is the fourth time we're being run. +with open(counter_file, 'r') as counter: + num = int(counter.read()) + if num == 4: + sys.exit(0) + +# Otherwise, increment the counter and fail +with open(counter_file, 'w') as counter: + counter.write(str(num + 1)) + sys.exit(1) diff --git a/llvm/utils/lit/tests/Inputs/testrunner-custom-parsers/test.txt b/llvm/utils/lit/tests/Inputs/testrunner-custom-parsers/test.txt index e28060320a2e0..5809af5477ceb 100644 --- a/llvm/utils/lit/tests/Inputs/testrunner-custom-parsers/test.txt +++ b/llvm/utils/lit/tests/Inputs/testrunner-custom-parsers/test.txt @@ -13,6 +13,9 @@ // MY_BOOL: b) // MY_BOOL: d // +// MY_INT: 4 +// MY_INT: 6 +// // MY_BOOL_UNTERMINATED: a \ // // END. diff --git a/llvm/utils/lit/tests/allow-retries.py b/llvm/utils/lit/tests/allow-retries.py new file mode 100644 index 0000000000000..3f6cf8f1faa56 --- /dev/null +++ b/llvm/utils/lit/tests/allow-retries.py @@ -0,0 +1,41 @@ +# Check the behavior of the ALLOW_RETRIES keyword. + +# This test uses a file that's stable across retries of the test to fail and +# only succeed the fourth time it is retried. +# +# RUN: rm -f %t.counter +# RUN: %{lit} -j 1 %{inputs}/allow-retries/succeeds-within-limit.py -Dcounter=%t.counter -Dpython=%{python} | FileCheck --check-prefix=CHECK-TEST1 %s +# CHECK-TEST1: Passes With Retry : 1 + +# Test that a per-file ALLOW_RETRIES overwrites the config-wide test_retry_attempts property, if any. +# +# RUN: rm -f %t.counter +# RUN: %{lit} -j 1 %{inputs}/allow-retries/succeeds-within-limit.py -Dtest_retry_attempts=2 -Dcounter=%t.counter -Dpython=%{python} | FileCheck --check-prefix=CHECK-TEST2 %s +# CHECK-TEST2: Passes With Retry : 1 + +# This test does not succeed within the allowed retry limit +# +# RUN: not %{lit} -j 1 %{inputs}/allow-retries/does-not-succeed-within-limit.py | FileCheck --check-prefix=CHECK-TEST3 %s +# CHECK-TEST3: Failing Tests (1): +# CHECK-TEST3: allow-retries :: does-not-succeed-within-limit.py + +# This test should be UNRESOLVED since it has more than one ALLOW_RETRIES +# lines, and that is not allowed. +# +# RUN: not %{lit} -j 1 %{inputs}/allow-retries/more-than-one-allow-retries-lines.py | FileCheck --check-prefix=CHECK-TEST4 %s +# CHECK-TEST4: Unresolved Tests (1): +# CHECK-TEST4: allow-retries :: more-than-one-allow-retries-lines.py + +# This test does not provide a valid integer to the ALLOW_RETRIES keyword. +# It should be unresolved. +# +# RUN: not %{lit} -j 1 %{inputs}/allow-retries/not-a-valid-integer.py | FileCheck --check-prefix=CHECK-TEST5 %s +# CHECK-TEST5: Unresolved Tests (1): +# CHECK-TEST5: allow-retries :: not-a-valid-integer.py + +# This test checks that the config-wide test_retry_attempts property is used +# when no ALLOW_RETRIES keyword is present. +# +# RUN: rm -f %t.counter +# RUN: %{lit} -j 1 %{inputs}/test_retry_attempts/test.py -Dcounter=%t.counter -Dpython=%{python} | FileCheck --check-prefix=CHECK-TEST6 %s +# CHECK-TEST6: Passes With Retry : 1 diff --git a/llvm/utils/lit/tests/shtest-inject.py b/llvm/utils/lit/tests/shtest-inject.py new file mode 100644 index 0000000000000..f51f083f3990c --- /dev/null +++ b/llvm/utils/lit/tests/shtest-inject.py @@ -0,0 +1,49 @@ +# Check that we can inject commands at the beginning of a ShTest using a custom +# test format. 
+ +# RUN: %{lit} -j 1 %{inputs}/shtest-inject/test-empty.txt --show-all | FileCheck --check-prefix=CHECK-TEST1 %s +# +# CHECK-TEST1: Script: +# CHECK-TEST1: -- +# CHECK-TEST1: echo "THIS WAS" +# CHECK-TEST1: echo "INJECTED" +# CHECK-TEST1: -- +# +# CHECK-TEST1: THIS WAS +# CHECK-TEST1: INJECTED +# +# CHECK-TEST1: Expected Passes : 1 + +# RUN: %{lit} -j 1 %{inputs}/shtest-inject/test-one.txt --show-all | FileCheck --check-prefix=CHECK-TEST2 %s +# +# CHECK-TEST2: Script: +# CHECK-TEST2: -- +# CHECK-TEST2: echo "THIS WAS" +# CHECK-TEST2: echo "INJECTED" +# CHECK-TEST2: echo "IN THE FILE" +# CHECK-TEST2: -- +# +# CHECK-TEST2: THIS WAS +# CHECK-TEST2: INJECTED +# CHECK-TEST2: IN THE FILE +# +# CHECK-TEST2: Expected Passes : 1 + +# RUN: %{lit} -j 1 %{inputs}/shtest-inject/test-many.txt --show-all | FileCheck --check-prefix=CHECK-TEST3 %s +# +# CHECK-TEST3: Script: +# CHECK-TEST3: -- +# CHECK-TEST3: echo "THIS WAS" +# CHECK-TEST3: echo "INJECTED" +# CHECK-TEST3: echo "IN THE FILE" +# CHECK-TEST3: echo "IF IT WORKS" +# CHECK-TEST3: echo "AS EXPECTED" +# CHECK-TEST3: -- +# +# CHECK-TEST3: THIS WAS +# CHECK-TEST3: INJECTED +# CHECK-TEST3: IN THE FILE +# CHECK-TEST3: IF IT WORKS +# CHECK-TEST3: AS EXPECTED +# +# CHECK-TEST3: Expected Passes : 1 diff --git a/llvm/utils/lit/tests/unit/TestRunner.py b/llvm/utils/lit/tests/unit/TestRunner.py index ceb7bef34f6a8..4f33fce648850 100644 --- a/llvm/utils/lit/tests/unit/TestRunner.py +++ b/llvm/utils/lit/tests/unit/TestRunner.py @@ -57,6 +57,7 @@ def custom_parse(line_number, line, output): IntegratedTestKeywordParser("MY_DNE_TAG.", ParserKind.TAG), IntegratedTestKeywordParser("MY_LIST:", ParserKind.LIST), IntegratedTestKeywordParser("MY_BOOL:", ParserKind.BOOLEAN_EXPR), + IntegratedTestKeywordParser("MY_INT:", ParserKind.INTEGER), IntegratedTestKeywordParser("MY_RUN:", ParserKind.COMMAND), IntegratedTestKeywordParser("MY_CUSTOM:", ParserKind.CUSTOM, custom_parse), @@ -112,6 +113,17 @@ def test_boolean(self): self.assertEqual(value[0].strip(), "a && (b)") self.assertEqual(value[1].strip(), "d") + def test_integer(self): + parsers = self.make_parsers() + self.parse_test(parsers) + int_parser = self.get_parser(parsers, 'MY_INT:') + value = int_parser.getValue() + self.assertEqual(len(value), 2) # there are only two MY_INT: lines + self.assertEqual(type(value[0]), int) + self.assertEqual(value[0], 4) + self.assertEqual(type(value[1]), int) + self.assertEqual(value[1], 6) + def test_boolean_unterminated(self): parsers = self.make_parsers() + \ [IntegratedTestKeywordParser("MY_BOOL_UNTERMINATED:", ParserKind.BOOLEAN_EXPR)] diff --git a/llvm/utils/update_analyze_test_checks.py b/llvm/utils/update_analyze_test_checks.py index e3b6dfdf620cf..2f1a842976428 100755 --- a/llvm/utils/update_analyze_test_checks.py +++ b/llvm/utils/update_analyze_test_checks.py @@ -45,10 +45,6 @@ ADVERT = '; NOTE: Assertions have been autogenerated by ' -# RegEx: this is where the magic happens. - -IR_FUNCTION_RE = re.compile('^\s*define\s+(?:internal\s+)?[^@]*@([\w-]+)\s*\(') - def main(): from argparse import RawTextHelpFormatter parser = argparse.ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter) @@ -168,7 +164,7 @@ def main(): # If it's outside a function, it just gets copied to the output. 
output_lines.append(input_line) - m = IR_FUNCTION_RE.match(input_line) + m = common.IR_FUNCTION_RE.match(input_line) if not m: continue func_name = m.group(1) diff --git a/llvm/utils/update_test_checks.py b/llvm/utils/update_test_checks.py index f6fee1437f5d4..014b55c9ae839 100755 --- a/llvm/utils/update_test_checks.py +++ b/llvm/utils/update_test_checks.py @@ -45,14 +45,6 @@ ADVERT = '; NOTE: Assertions have been autogenerated by ' -# RegEx: this is where the magic happens. - -IR_FUNCTION_RE = re.compile('^\s*define\s+(?:internal\s+)?[^@]*@([\w-]+)\s*\(') - - - - - def main(): from argparse import RawTextHelpFormatter parser = argparse.ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter) @@ -203,7 +195,7 @@ def main(): # If it's outside a function, it just gets copied to the output. output_lines.append(input_line) - m = IR_FUNCTION_RE.match(input_line) + m = common.IR_FUNCTION_RE.match(input_line) if not m: continue func_name = m.group(1) diff --git a/llvm/utils/vim/syntax/llvm.vim b/llvm/utils/vim/syntax/llvm.vim index 487a37b4b86ba..0a661e82d86a9 100644 --- a/llvm/utils/vim/syntax/llvm.vim +++ b/llvm/utils/vim/syntax/llvm.vim @@ -203,6 +203,7 @@ syn match llvmConstant /\/ syn match llvmSpecialComment /;\s*PR\d*\s*$/ syn match llvmSpecialComment /;\s*REQUIRES:.*$/ syn match llvmSpecialComment /;\s*RUN:.*$/ +syn match llvmSpecialComment /;\s*ALLOW_RETRIES:.*$/ syn match llvmSpecialComment /;\s*CHECK:.*$/ syn match llvmSpecialComment "\v;\s*CHECK-(NEXT|NOT|DAG|SAME|LABEL):.*$" syn match llvmSpecialComment /;\s*XFAIL:.*$/ diff --git a/llvm/utils/vscode/llvm/syntaxes/ll.tmLanguage.yaml b/llvm/utils/vscode/llvm/syntaxes/ll.tmLanguage.yaml index 9765cee98df81..117ec134d5738 100644 --- a/llvm/utils/vscode/llvm/syntaxes/ll.tmLanguage.yaml +++ b/llvm/utils/vscode/llvm/syntaxes/ll.tmLanguage.yaml @@ -319,6 +319,8 @@ patterns: name: string.regexp - match: ";\\s*RUN:.*$" name: string.regexp + - match: ";\\s*ALLOW_RETRIES:.*$" + name: string.regexp - match: ";\\s*CHECK:.*$" name: string.regexp - match: ";\\s*CHECK-(NEXT|NOT|DAG|SAME|LABEL):.*$" diff --git a/mlir/cmake/modules/AddMLIR.cmake b/mlir/cmake/modules/AddMLIR.cmake index 4e06a0e743fec..2adb8f2f29351 100644 --- a/mlir/cmake/modules/AddMLIR.cmake +++ b/mlir/cmake/modules/AddMLIR.cmake @@ -11,7 +11,11 @@ function(whole_archive_link target) if("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin") set(link_flags "-L${CMAKE_BINARY_DIR}/lib ") FOREACH(LIB ${ARGN}) - string(CONCAT link_flags ${link_flags} "-Wl,-force_load ${CMAKE_BINARY_DIR}/lib/lib${LIB}.a ") + if("${CMAKE_GENERATOR}" STREQUAL "Xcode") + string(CONCAT link_flags ${link_flags} "-Wl,-force_load ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib/lib${LIB}.a ") + else() + string(CONCAT link_flags ${link_flags} "-Wl,-force_load ${CMAKE_BINARY_DIR}/lib/lib${LIB}.a ") + endif() ENDFOREACH(LIB) elseif(MSVC) FOREACH(LIB ${ARGN}) @@ -28,26 +32,28 @@ function(whole_archive_link target) endfunction(whole_archive_link) # Declare a dialect in the include directory -function(add_mlir_dialect dialect dialect_namespace dialect_doc_filename) +function(add_mlir_dialect dialect dialect_namespace) set(LLVM_TARGET_DEFINITIONS ${dialect}.td) mlir_tablegen(${dialect}.h.inc -gen-op-decls) mlir_tablegen(${dialect}.cpp.inc -gen-op-defs) mlir_tablegen(${dialect}Dialect.h.inc -gen-dialect-decls -dialect=${dialect_namespace}) add_public_tablegen_target(MLIR${dialect}IncGen) add_dependencies(mlir-headers MLIR${dialect}IncGen) +endfunction() - # Generate Dialect Documentation - set(LLVM_TARGET_DEFINITIONS 
${dialect_doc_filename}.td) - tablegen(MLIR ${dialect_doc_filename}.md -gen-op-doc "-I${MLIR_MAIN_SRC_DIR}" "-I${MLIR_INCLUDE_DIR}") - set(GEN_DOC_FILE ${MLIR_BINARY_DIR}/docs/Dialects/${dialect_doc_filename}.md) +# Generate Documentation +function(add_mlir_doc doc_filename command output_file output_directory) + set(LLVM_TARGET_DEFINITIONS ${doc_filename}.td) + tablegen(MLIR ${output_file}.md ${command} "-I${MLIR_MAIN_SRC_DIR}" "-I${MLIR_INCLUDE_DIR}") + set(GEN_DOC_FILE ${MLIR_BINARY_DIR}/docs/${output_directory}${output_file}.md) add_custom_command( OUTPUT ${GEN_DOC_FILE} COMMAND ${CMAKE_COMMAND} -E copy - ${CMAKE_CURRENT_BINARY_DIR}/${dialect_doc_filename}.md + ${CMAKE_CURRENT_BINARY_DIR}/${output_file}.md ${GEN_DOC_FILE} - DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${dialect_doc_filename}.md) - add_custom_target(${dialect_doc_filename}DocGen DEPENDS ${GEN_DOC_FILE}) - add_dependencies(mlir-doc ${dialect_doc_filename}DocGen) + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${output_file}.md) + add_custom_target(${output_file}DocGen DEPENDS ${GEN_DOC_FILE}) + add_dependencies(mlir-doc ${output_file}DocGen) endfunction() # Declare a library which can be compiled in libMLIR.so diff --git a/mlir/cmake/modules/MLIRConfig.cmake.in b/mlir/cmake/modules/MLIRConfig.cmake.in index 190aa193d1db9..da518762919cd 100644 --- a/mlir/cmake/modules/MLIRConfig.cmake.in +++ b/mlir/cmake/modules/MLIRConfig.cmake.in @@ -19,7 +19,9 @@ set_property(GLOBAL PROPERTY MLIR_DIALECT_LIBS "@MLIR_DIALECT_LIBS@") set_property(GLOBAL PROPERTY MLIR_CONVERSION_LIBS "@MLIR_CONVERSION_LIBS@") # Provide all our library targets to users. -include("@MLIR_CONFIG_EXPORTS_FILE@") +if(EXISTS @MLIR_CONFIG_EXPORTS_FILE@) + include("@MLIR_CONFIG_EXPORTS_FILE@") +endif() # By creating these targets here, subprojects that depend on MLIR's # tablegen-generated headers can always depend on these targets whether building diff --git a/mlir/docs/ConversionToLLVMDialect.md b/mlir/docs/ConversionToLLVMDialect.md index 1e6fdedf933ee..f1917e8ac7069 100644 --- a/mlir/docs/ConversionToLLVMDialect.md +++ b/mlir/docs/ConversionToLLVMDialect.md @@ -1,7 +1,7 @@ # Conversion to the LLVM Dialect Conversion from the Standard to the [LLVM Dialect](Dialects/LLVM.md) can be -performed by the specialized dialect conversion pass by running +performed by the specialized dialect conversion pass by running: ```shell mlir-opt -convert-std-to-llvm @@ -19,7 +19,7 @@ described in this document. We use the terminology defined by the ### Scalar Types Scalar types are converted to their LLVM counterparts if they exist. The -following conversions are currently implemented. +following conversions are currently implemented: - `i*` converts to `!llvm.i*` - `f16` converts to `!llvm.half` @@ -52,7 +52,7 @@ x 8 x 16 x f32>` converts to `!llvm<"[4 x [8 x <16 x float>]]">`. Memref types in MLIR have both static and dynamic information associated with them. The dynamic information comprises the buffer pointer as well as sizes and -strides of any dynamically sized dimensions. Memref types are normalized and +strides of any dynamically-sized dimensions. Memref types are normalized and converted to a descriptor that is only dependent on the rank of the memref. The descriptor contains: @@ -90,7 +90,7 @@ memref<10x?x42x?x123 x f32> -> !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64 memref<1x? 
x vector<4xf32>> -> !llvm<"{ <4 x float>*, <4 x float>*, i64, [1 x i64], [1 x i64] }"> ``` -If the rank of the memref is unknown at compile time, the Memref is converted to +If the rank of the memref is unknown at compile time, the memref is converted to an unranked descriptor that contains: 1. a 64-bit integer representing the dynamic rank of the memref, followed by @@ -128,7 +128,7 @@ fact that LLVM IR functions always have a return type, which may be a Void type. The converted function always has a single result type. If the original function type had no results, the converted function will have one result of the wrapped `void` type. If the original function type had one result, the converted -function will have one result converted using these rules. Otherwise, the result +function will also have one result converted using these rules. Otherwise, the result type will be a wrapped LLVM IR structure type where each element of the structure corresponds to one of the results of the original function, converted using these rules. In high-order functions, function-typed arguments and results @@ -407,7 +407,7 @@ in the MLIR module. 1. Add a body to the original function (making it non-external) that 1. allocates a memref descriptor, 1. populates it, and - 1. passes the pointer to it into the newly declared interface function + 1. passes the pointer to it into the newly declared interface function, then 1. collects the result of the call and returns it to the caller. For (non-external) functions defined in the MLIR module. @@ -560,7 +560,7 @@ produce an address of a specific element. In particular, it holds dynamic values for static sizes, and they are expected to match at all times. It is created by the allocation operation and is updated by the conversion -operations that may change static dimensions into dynamic and vice versa. +operations that may change static dimensions into dynamic dimensions and vice versa. **Note**: LLVM IR conversion does not support `memref`s with layouts that are not amenable to the strided form. diff --git a/mlir/docs/CreatingADialect.md b/mlir/docs/CreatingADialect.md index 8757bf5e2a906..e8a8d40ff0965 100644 --- a/mlir/docs/CreatingADialect.md +++ b/mlir/docs/CreatingADialect.md @@ -39,7 +39,8 @@ is declared using add_mlir_dialect(). ```cmake -add_mlir_dialect(FooOps foo FooOps) +add_mlir_dialect(FooOps foo) +add_mlir_doc(FooOps -gen-dialect-doc FooDialect Dialects/) ``` diff --git a/mlir/docs/Dialects/Vector.md b/mlir/docs/Dialects/Vector.md index 5650607950830..10de1731a2f24 100644 --- a/mlir/docs/Dialects/Vector.md +++ b/mlir/docs/Dialects/Vector.md @@ -1,5 +1,7 @@ # Vector Dialect +[TOC] + MLIR supports multi-dimensional `vector` types and custom operations on those types. A generic, retargetable, higher-order ``vector`` type (`n-D` with `n > 1`) is a structured type, that carries semantic information useful for @@ -488,6 +490,6 @@ low-level abstraction. The use of special intrinsics in a `1-D` LLVM world is still available thanks to an explicit `vector.cast` op. +## Operations -### Operations - +[include "Dialects/VectorOps.md"] diff --git a/mlir/docs/Interfaces.md b/mlir/docs/Interfaces.md index f413cac28bb00..16422a1045827 100644 --- a/mlir/docs/Interfaces.md +++ b/mlir/docs/Interfaces.md @@ -63,7 +63,7 @@ struct AffineInlinerInterface : public DialectInlinerInterface { }; /// Register the interface with the dialect. -AffineOpsDialect::AffineOpsDialect(MLIRContext *context) ... { +AffineDialect::AffineDialect(MLIRContext *context) ... 
{ addInterfaces(); } ``` diff --git a/mlir/docs/LangRef.md b/mlir/docs/LangRef.md index a3e5738b49b51..f180075c0d731 100644 --- a/mlir/docs/LangRef.md +++ b/mlir/docs/LangRef.md @@ -944,7 +944,7 @@ multidimensional index from one index space to another. For example, the following figure shows an index map which maps a 2-dimensional index from a 2x2 index space to a 3x3 index space, using symbols `S0` and `S1` as offsets. -![Index Map Example](includes/img/index-map.svg) +![Index Map Example](/includes/img/index-map.svg) The number of domain dimensions and range dimensions of an index map can be different, but must match the number of dimensions of the input and output index diff --git a/mlir/docs/OpDefinitions.md b/mlir/docs/OpDefinitions.md index cc8761eccb158..29269e508bb08 100644 --- a/mlir/docs/OpDefinitions.md +++ b/mlir/docs/OpDefinitions.md @@ -180,6 +180,10 @@ values, including two categories: shape of type. This is mostly used for convenience interface generation or interaction with other frameworks/translation. + All derived attributes should be materializable as an Attribute. That is, + even though they are not materialized, it should be possible to store as + an attribute. + Both operands and attributes are specified inside the `dag`-typed `arguments`, led by `ins`: @@ -1222,7 +1226,7 @@ mlir-tblgen --gen-op-decls -I /path/to/mlir/include /path/to/input/td/file # To see op C++ class definition mlir-tblgen --gen-op-defs -I /path/to/mlir/include /path/to/input/td/file # To see op documentation -mlir-tblgen --gen-op-doc -I /path/to/mlir/include /path/to/input/td/file +mlir-tblgen --gen-dialect-doc -I /path/to/mlir/include /path/to/input/td/file # To see op interface C++ class declaration mlir-tblgen --gen-op-interface-decls -I /path/to/mlir/include /path/to/input/td/file @@ -1232,7 +1236,6 @@ mlir-tblgen --gen-op-interface-defs -I /path/to/mlir/include /path/to/input/td/f mlir-tblgen --gen-op-interface-doc -I /path/to/mlir/include /path/to/input/td/file ``` - ## Appendix ### Requirements and existing mechanisms analysis diff --git a/mlir/docs/Tutorials/Toy/Ch-5.md b/mlir/docs/Tutorials/Toy/Ch-5.md index dbc545c492067..8f32a7289a618 100644 --- a/mlir/docs/Tutorials/Toy/Ch-5.md +++ b/mlir/docs/Tutorials/Toy/Ch-5.md @@ -62,7 +62,7 @@ void ToyToAffineLoweringPass::runOnFunction() { // We define the specific operations, or dialects, that are legal targets for // this lowering. In our case, we are lowering to a combination of the // `Affine` and `Standard` dialects. - target.addLegalDialect(); + target.addLegalDialect(); // We also define the Toy dialect as Illegal so that the conversion will fail // if any of these operations are *not* converted. Given that we actually want diff --git a/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp index 9559402708c8e..249f17b0fc71b 100644 --- a/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp +++ b/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp @@ -15,7 +15,7 @@ #include "toy/Dialect.h" #include "toy/Passes.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" @@ -280,7 +280,7 @@ void ToyToAffineLoweringPass::runOnFunction() { // We define the specific operations, or dialects, that are legal targets for // this lowering. In our case, we are lowering to a combination of the // `Affine` and `Standard` dialects. 
- target.addLegalDialect(); + target.addLegalDialect(); // We also define the Toy dialect as Illegal so that the conversion will fail // if any of these operations are *not* converted. Given that we actually want diff --git a/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp index 9559402708c8e..249f17b0fc71b 100644 --- a/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp +++ b/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp @@ -15,7 +15,7 @@ #include "toy/Dialect.h" #include "toy/Passes.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" @@ -280,7 +280,7 @@ void ToyToAffineLoweringPass::runOnFunction() { // We define the specific operations, or dialects, that are legal targets for // this lowering. In our case, we are lowering to a combination of the // `Affine` and `Standard` dialects. - target.addLegalDialect(); + target.addLegalDialect(); // We also define the Toy dialect as Illegal so that the conversion will fail // if any of these operations are *not* converted. Given that we actually want diff --git a/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp b/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp index 5455738dff2a8..f6dcba229276f 100644 --- a/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp +++ b/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp @@ -19,7 +19,7 @@ #include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LoopOps/LoopOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" diff --git a/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp index 9559402708c8e..249f17b0fc71b 100644 --- a/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp +++ b/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp @@ -15,7 +15,7 @@ #include "toy/Dialect.h" #include "toy/Passes.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" @@ -280,7 +280,7 @@ void ToyToAffineLoweringPass::runOnFunction() { // We define the specific operations, or dialects, that are legal targets for // this lowering. In our case, we are lowering to a combination of the // `Affine` and `Standard` dialects. - target.addLegalDialect(); + target.addLegalDialect(); // We also define the Toy dialect as Illegal so that the conversion will fail // if any of these operations are *not* converted. 
Given that we actually want diff --git a/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp b/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp index 5455738dff2a8..f6dcba229276f 100644 --- a/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp +++ b/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp @@ -19,7 +19,7 @@ #include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LoopOps/LoopOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" diff --git a/mlir/include/mlir/Analysis/AffineStructures.h b/mlir/include/mlir/Analysis/AffineStructures.h index e37d6982c20e8..5d99320c28601 100644 --- a/mlir/include/mlir/Analysis/AffineStructures.h +++ b/mlir/include/mlir/Analysis/AffineStructures.h @@ -443,16 +443,17 @@ class FlatAffineConstraints { /// identifier. Returns None if it's not a constant. This method employs /// trivial (low complexity / cost) checks and detection. Symbolic identifiers /// are treated specially, i.e., it looks for constant differences between - /// affine expressions involving only the symbolic identifiers. See comments - /// at function definition for examples. 'lb' and 'lbDivisor', if provided, - /// are used to express the lower bound associated with the constant - /// difference: 'lb' has the coefficients and lbDivisor, the divisor. For eg., - /// if the lower bound is [(s0 + s2 - 1) floordiv 32] for a system with three - /// symbolic identifiers, *lb = [1, 0, 1], lbDivisor = 32. + /// affine expressions involving only the symbolic identifiers. `lb` and + /// `ub` (along with the `boundFloorDivisor`) are set to represent the lower + /// and upper bound associated with the constant difference: `lb`, `ub` have + /// the coefficients, and boundFloorDivisor, their divisor. + /// Ex: if the lower bound is [(s0 + s2 - 1) floordiv 32] for a system with + /// three symbolic identifiers, *lb = [1, 0, 1], boundDivisor = 32. See + /// comments at function definition for examples. Optional getConstantBoundOnDimSize(unsigned pos, SmallVectorImpl *lb = nullptr, - int64_t *lbFloorDivisor = nullptr, + int64_t *boundFloorDivisor = nullptr, SmallVectorImpl *ub = nullptr) const; /// Returns the constant lower bound for the pos^th identifier if there is diff --git a/mlir/include/mlir/Analysis/Passes.h b/mlir/include/mlir/Analysis/Passes.h deleted file mode 100644 index 296b3b9838c5b..0000000000000 --- a/mlir/include/mlir/Analysis/Passes.h +++ /dev/null @@ -1,33 +0,0 @@ -//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This header file defines prototypes that expose pass constructors in the -// analysis library. -// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_ANALYSIS_PASSES_H -#define MLIR_ANALYSIS_PASSES_H - -#include "mlir/Support/LLVM.h" -#include - -namespace mlir { - -class FuncOp; -template class OpPassBase; - -/// Creates a pass to check memref accesses in a Function. 
-std::unique_ptr> createTestMemRefBoundCheckPass(); - -/// Creates a pass to check memref access dependences in a Function. -std::unique_ptr> createTestMemRefDependenceCheckPass(); - -} // end namespace mlir - -#endif // MLIR_ANALYSIS_PASSES_H diff --git a/mlir/include/mlir/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.h b/mlir/include/mlir/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.h new file mode 100644 index 0000000000000..bd65970d5bf77 --- /dev/null +++ b/mlir/include/mlir/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.h @@ -0,0 +1,29 @@ +//===- ConvertAVX512ToLLVM.h - Conversion Patterns from AVX512 to LLVM ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_EDGE_CONVERSION_AVX512TOLLVM_CONVERTAVX512TOLLVM_H_ +#define MLIR_EDGE_CONVERSION_AVX512TOLLVM_CONVERTAVX512TOLLVM_H_ + +#include + +namespace mlir { +class LLVMTypeConverter; +class ModuleOp; +template class OpPassBase; +class OwningRewritePatternList; + +/// Collect a set of patterns to convert from the AVX512 dialect to LLVM. +void populateAVX512ToLLVMConversionPatterns(LLVMTypeConverter &converter, + OwningRewritePatternList &patterns); + +/// Create a pass to convert AVX512 operations to the LLVMIR dialect. +std::unique_ptr> createConvertAVX512ToLLVMPass(); + +} // namespace mlir + +#endif // MLIR_EDGE_CONVERSION_AVX512TOLLVM_CONVERTAVX512TOLLVM_H_ diff --git a/mlir/include/mlir/Conversion/StandardToStandard/StandardToStandard.h b/mlir/include/mlir/Conversion/StandardToStandard/StandardToStandard.h new file mode 100644 index 0000000000000..a384d7c22166a --- /dev/null +++ b/mlir/include/mlir/Conversion/StandardToStandard/StandardToStandard.h @@ -0,0 +1,31 @@ +//===- StandardToStandard.h - Std intra-dialect conversion -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This files contains patterns for lowering within the Standard dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_STANDARDTOSTANDARD_STANDARDTOSTANDARD_H_ +#define MLIR_CONVERSION_STANDARDTOSTANDARD_STANDARDTOSTANDARD_H_ + +namespace mlir { + +// Forward declarations. +class MLIRContext; +class OwningRewritePatternList; +class TypeConverter; + +/// Add a pattern to the given pattern list to convert the operand and result +/// types of a CallOp with the given type converter. +void populateCallOpTypeConversionPattern(OwningRewritePatternList &patterns, + MLIRContext *ctx, + TypeConverter &converter); + +} // end namespace mlir + +#endif // MLIR_CONVERSION_STANDARDTOSTANDARD_STANDARDTOSTANDARD_H_ diff --git a/mlir/include/mlir/Dialect/AVX512/AVX512.td b/mlir/include/mlir/Dialect/AVX512/AVX512.td new file mode 100644 index 0000000000000..917af2e1cc04b --- /dev/null +++ b/mlir/include/mlir/Dialect/AVX512/AVX512.td @@ -0,0 +1,99 @@ +//===-- AVX512Ops.td - AVX512 dialect operation definitions *- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the basic operations for the AVX512 dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef AVX512_OPS +#define AVX512_OPS + +include "mlir/Interfaces/SideEffects.td" + +//===----------------------------------------------------------------------===// +// AVX512 dialect definition +//===----------------------------------------------------------------------===// + +def AVX512_Dialect : Dialect { + let name = "avx512"; + let cppNamespace = "avx512"; +} + +//===----------------------------------------------------------------------===// +// AVX512 op definitions +//===----------------------------------------------------------------------===// + +class AVX512_Op traits = []> : + Op {} + +def MaskRndScaleOp : AVX512_Op<"mask.rndscale", [NoSideEffect, + AllTypesMatch<["src", "a", "dst"]>, + TypesMatchWith<"imm has the same number of bits as elements in dst", + "dst", "imm", + "IntegerType::get(($_self.cast().getShape()[0])," + " $_self.getContext())">]> { + let summary = "Masked roundscale op"; + let description = [{ + The mask.rndscale op is an AVX512 specific op that can lower to the proper + LLVMAVX512 operation: `llvm.mask.rndscale.ps.512` or + `llvm.mask.rndscale.pd.512` instruction depending on the type of vectors it + is applied to. + + From the Intel Intrinsics Guide: + ================================ + Round packed floating-point elements in `a` to the number of fraction bits + specified by `imm`, and store the results in `dst` using writemask `k` + (elements are copied from src when the corresponding mask bit is not set). + }]; + // Supports vector<16xf32> and vector<8xf64>. + let arguments = (ins VectorOfLengthAndType<[16, 8], [F32, F64]>:$src, + I32:$k, + VectorOfLengthAndType<[16, 8], [F32, F64]>:$a, + AnyTypeOf<[I16, I8]>:$imm, + // TODO(ntv): figure rounding out (optional operand?). + I32:$rounding + ); + let results = (outs VectorOfLengthAndType<[16, 8], [F32, F64]>:$dst); + let assemblyFormat = + "$src `,` $k `,` $a `,` $imm `,` $rounding attr-dict `:` type($dst)"; +} + +def MaskScaleFOp : AVX512_Op<"mask.scalef", [NoSideEffect, + AllTypesMatch<["src", "a", "b", "dst"]>, + TypesMatchWith<"k has the same number of bits as elements in dst", + "dst", "k", + "IntegerType::get(($_self.cast().getShape()[0])," + " $_self.getContext())">]> { + let summary = "ScaleF op"; + let description = [{ + The `mask.scalef` op is an AVX512 specific op that can lower to the proper + LLVMAVX512 operation: `llvm.mask.scalef.ps.512` or + `llvm.mask.scalef.pd.512` depending on the type of MLIR vectors it is + applied to. + + From the Intel Intrinsics Guide: + ================================ + Scale the packed floating-point elements in `a` using values from `b`, and + store the results in `dst` using writemask `k` (elements are copied from src + when the corresponding mask bit is not set). + }]; + // Supports vector<16xf32> and vector<8xf64>. + let arguments = (ins VectorOfLengthAndType<[16, 8], [F32, F64]>:$src, + VectorOfLengthAndType<[16, 8], [F32, F64]>:$a, + VectorOfLengthAndType<[16, 8], [F32, F64]>:$b, + AnyTypeOf<[I16, I8]>:$k, + // TODO(ntv): figure rounding out (optional operand?). + I32:$rounding + ); + let results = (outs VectorOfLengthAndType<[16, 8], [F32, F64]>:$dst); + // Fully specified by traits. 
+ let assemblyFormat = + "$src `,` $a `,` $b `,` $k `,` $rounding attr-dict `:` type($dst)"; +} + +#endif // AVX512_OPS diff --git a/mlir/include/mlir/Dialect/AVX512/AVX512Dialect.h b/mlir/include/mlir/Dialect/AVX512/AVX512Dialect.h new file mode 100644 index 0000000000000..aeec2b728a113 --- /dev/null +++ b/mlir/include/mlir/Dialect/AVX512/AVX512Dialect.h @@ -0,0 +1,31 @@ +//===- AVX512Dialect.h - MLIR Dialect for AVX512 ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the Target dialect for AVX512 in MLIR. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_AVX512_AVX512DIALECT_H_ +#define MLIR_DIALECT_AVX512_AVX512DIALECT_H_ + +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" +#include "mlir/Interfaces/SideEffects.h" + +namespace mlir { +namespace avx512 { + +#define GET_OP_CLASSES +#include "mlir/Dialect/AVX512/AVX512.h.inc" + +#include "mlir/Dialect/AVX512/AVX512Dialect.h.inc" + +} // namespace avx512 +} // namespace mlir + +#endif // MLIR_DIALECT_AVX512_AVX512DIALECT_H_ diff --git a/mlir/include/mlir/Dialect/AVX512/CMakeLists.txt b/mlir/include/mlir/Dialect/AVX512/CMakeLists.txt new file mode 100644 index 0000000000000..bc57372689b28 --- /dev/null +++ b/mlir/include/mlir/Dialect/AVX512/CMakeLists.txt @@ -0,0 +1 @@ +add_mlir_dialect(AVX512 avx512 AVX512) diff --git a/mlir/include/mlir/Dialect/Affine/CMakeLists.txt b/mlir/include/mlir/Dialect/Affine/CMakeLists.txt new file mode 100644 index 0000000000000..f33061b2d87cf --- /dev/null +++ b/mlir/include/mlir/Dialect/Affine/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(IR) diff --git a/mlir/include/mlir/Dialect/AffineOps/EDSC/Builders.h b/mlir/include/mlir/Dialect/Affine/EDSC/Builders.h similarity index 97% rename from mlir/include/mlir/Dialect/AffineOps/EDSC/Builders.h rename to mlir/include/mlir/Dialect/Affine/EDSC/Builders.h index f750a1d41f8a2..9c320ece22092 100644 --- a/mlir/include/mlir/Dialect/AffineOps/EDSC/Builders.h +++ b/mlir/include/mlir/Dialect/Affine/EDSC/Builders.h @@ -11,10 +11,10 @@ // //===----------------------------------------------------------------------===// -#ifndef MLIR_DIALECT_AFFINEOPS_EDSC_BUILDERS_H_ -#define MLIR_DIALECT_AFFINEOPS_EDSC_BUILDERS_H_ +#ifndef MLIR_DIALECT_AFFINE_EDSC_BUILDERS_H_ +#define MLIR_DIALECT_AFFINE_EDSC_BUILDERS_H_ -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/EDSC/Builders.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Types.h" @@ -203,4 +203,4 @@ ValueHandle TemplatedIndexedValue::operator>=(ValueHandle e) { } // namespace edsc } // namespace mlir -#endif // MLIR_DIALECT_AFFINEOPS_EDSC_BUILDERS_H_ +#endif // MLIR_DIALECT_AFFINE_EDSC_BUILDERS_H_ diff --git a/mlir/include/mlir/Dialect/AffineOps/EDSC/Intrinsics.h b/mlir/include/mlir/Dialect/Affine/EDSC/Intrinsics.h similarity index 87% rename from mlir/include/mlir/Dialect/AffineOps/EDSC/Intrinsics.h rename to mlir/include/mlir/Dialect/Affine/EDSC/Intrinsics.h index 67d4ac16bb0b8..392e2433b9924 100644 --- a/mlir/include/mlir/Dialect/AffineOps/EDSC/Intrinsics.h +++ b/mlir/include/mlir/Dialect/Affine/EDSC/Intrinsics.h @@ -5,10 +5,10 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // 
//===----------------------------------------------------------------------===// -#ifndef MLIR_DIALECT_AFFINEOPS_EDSC_INTRINSICS_H_ -#define MLIR_DIALECT_AFFINEOPS_EDSC_INTRINSICS_H_ +#ifndef MLIR_DIALECT_AFFINE_EDSC_INTRINSICS_H_ +#define MLIR_DIALECT_AFFINE_EDSC_INTRINSICS_H_ -#include "mlir/Dialect/AffineOps/EDSC/Builders.h" +#include "mlir/Dialect/Affine/EDSC/Builders.h" #include "mlir/EDSC/Intrinsics.h" namespace mlir { diff --git a/mlir/include/mlir/Dialect/AffineOps/AffineOps.h b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.h similarity index 99% rename from mlir/include/mlir/Dialect/AffineOps/AffineOps.h rename to mlir/include/mlir/Dialect/Affine/IR/AffineOps.h index edae534f12baf..6ce38bcddddc5 100644 --- a/mlir/include/mlir/Dialect/AffineOps/AffineOps.h +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.h @@ -11,8 +11,8 @@ // //===----------------------------------------------------------------------===// -#ifndef MLIR_DIALECT_AFFINEOPS_AFFINEOPS_H -#define MLIR_DIALECT_AFFINEOPS_AFFINEOPS_H +#ifndef MLIR_DIALECT_AFFINE_IR_AFFINEOPS_H +#define MLIR_DIALECT_AFFINE_IR_AFFINEOPS_H #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" @@ -493,10 +493,10 @@ AffineApplyOp makeComposedAffineApply(OpBuilder &b, Location loc, AffineMap map, void fullyComposeAffineMapAndOperands(AffineMap *map, SmallVectorImpl *operands); -#include "mlir/Dialect/AffineOps/AffineOpsDialect.h.inc" +#include "mlir/Dialect/Affine/IR/AffineOpsDialect.h.inc" #define GET_OP_CLASSES -#include "mlir/Dialect/AffineOps/AffineOps.h.inc" +#include "mlir/Dialect/Affine/IR/AffineOps.h.inc" /// Returns if the provided value is the induction variable of a AffineForOp. bool isForInductionVar(Value val); diff --git a/mlir/include/mlir/Dialect/AffineOps/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td similarity index 99% rename from mlir/include/mlir/Dialect/AffineOps/AffineOps.td rename to mlir/include/mlir/Dialect/Affine/IR/AffineOps.td index 307860f1622b0..6994b5f17661d 100644 --- a/mlir/include/mlir/Dialect/AffineOps/AffineOps.td +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td @@ -13,11 +13,11 @@ #ifndef AFFINE_OPS #define AFFINE_OPS -include "mlir/Dialect/AffineOps/AffineOpsBase.td" +include "mlir/Dialect/Affine/IR/AffineOpsBase.td" include "mlir/Interfaces/LoopLikeInterface.td" include "mlir/Interfaces/SideEffects.td" -def AffineOps_Dialect : Dialect { +def Affine_Dialect : Dialect { let name = "affine"; let cppNamespace = ""; let hasConstantMaterializer = 1; @@ -25,7 +25,7 @@ def AffineOps_Dialect : Dialect { // Base class for Affine dialect ops. 
class Affine_Op traits = []> : - Op { + Op { // For every affine op, there needs to be a: // * void print(OpAsmPrinter &p, ${C++ class of Op} op) // * LogicalResult verify(${C++ class of Op} op) @@ -291,7 +291,7 @@ def AffineIfOp : Affine_Op<"if", } class AffineMinMaxOpBase traits = []> : - Op { + Op { let arguments = (ins AffineMapAttr:$map, Variadic:$operands); let results = (outs Index); diff --git a/mlir/include/mlir/Dialect/AffineOps/AffineOpsBase.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOpsBase.td similarity index 100% rename from mlir/include/mlir/Dialect/AffineOps/AffineOpsBase.td rename to mlir/include/mlir/Dialect/Affine/IR/AffineOpsBase.td diff --git a/mlir/include/mlir/Dialect/AffineOps/AffineValueMap.h b/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h similarity index 96% rename from mlir/include/mlir/Dialect/AffineOps/AffineValueMap.h rename to mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h index 3ec2b85597281..ffc6d73f38259 100644 --- a/mlir/include/mlir/Dialect/AffineOps/AffineValueMap.h +++ b/mlir/include/mlir/Dialect/Affine/IR/AffineValueMap.h @@ -10,8 +10,8 @@ // analysis purposes. //===----------------------------------------------------------------------===// -#ifndef MLIR_DIALECT_AFFINEOPS_AFFINEVALUEMAP_H -#define MLIR_DIALECT_AFFINEOPS_AFFINEVALUEMAP_H +#ifndef MLIR_DIALECT_AFFINE_IR_AFFINEVALUEMAP_H +#define MLIR_DIALECT_AFFINE_IR_AFFINEVALUEMAP_H #include "mlir/IR/AffineMap.h" #include "mlir/IR/OperationSupport.h" @@ -87,4 +87,4 @@ class AffineValueMap { } // namespace mlir -#endif // MLIR_DIALECT_AFFINEOPS_AFFINEVALUEMAP_H +#endif // MLIR_DIALECT_AFFINE_IR_AFFINEVALUEMAP_H diff --git a/mlir/include/mlir/Dialect/Affine/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Affine/IR/CMakeLists.txt new file mode 100644 index 0000000000000..4a7144ef96e4b --- /dev/null +++ b/mlir/include/mlir/Dialect/Affine/IR/CMakeLists.txt @@ -0,0 +1,2 @@ +add_mlir_dialect(AffineOps affine) +add_mlir_doc(AffineOps -gen-dialect-doc AffineDialect Dialects/) diff --git a/mlir/include/mlir/Dialect/Affine/Passes.h b/mlir/include/mlir/Dialect/Affine/Passes.h new file mode 100644 index 0000000000000..735c6c0360f5d --- /dev/null +++ b/mlir/include/mlir/Dialect/Affine/Passes.h @@ -0,0 +1,73 @@ +//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file defines a set of transforms specific for the AffineOps +// dialect. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_AFFINE_TRANSFORMS_PASSES_H +#define MLIR_DIALECT_AFFINE_TRANSFORMS_PASSES_H + +#include "mlir/Support/LLVM.h" +#include +#include + +namespace mlir { + +class AffineForOp; +class FuncOp; +class ModuleOp; +class Pass; +template class OpPassBase; + +/// Creates a simplification pass for affine structures (maps and sets). In +/// addition, this pass also normalizes memrefs to have the trivial (identity) +/// layout map. +std::unique_ptr> createSimplifyAffineStructuresPass(); + +/// Creates a loop invariant code motion pass that hoists loop invariant +/// operations out of affine loops. 
+std::unique_ptr> createAffineLoopInvariantCodeMotionPass(); + +/// Performs packing (or explicit copying) of accessed memref regions into +/// buffers in the specified faster memory space through either pointwise copies +/// or DMA operations. +std::unique_ptr> createAffineDataCopyGenerationPass( + unsigned slowMemorySpace, unsigned fastMemorySpace, + unsigned tagMemorySpace = 0, int minDmaTransferSize = 1024, + uint64_t fastMemCapacityBytes = std::numeric_limits::max()); + +/// Creates a pass to perform tiling on loop nests. +std::unique_ptr> +createLoopTilingPass(uint64_t cacheSizeBytes); + +/// Creates a loop unrolling pass with the provided parameters. +/// 'getUnrollFactor' is a function callback for clients to supply a function +/// that computes an unroll factor - the callback takes precedence over unroll +/// factors supplied through other means. If -1 is passed as the unrollFactor +/// and no callback is provided, anything passed from the command-line (if at +/// all) or the default unroll factor is used (LoopUnroll::kDefaultUnrollFactor). +std::unique_ptr> createLoopUnrollPass( + int unrollFactor = -1, int unrollFull = -1, + const std::function &getUnrollFactor = nullptr); + +/// Creates a loop unroll jam pass to unroll jam by the specified factor. A +/// factor of -1 lets the pass use the default factor or the one on the command +/// line if provided. +std::unique_ptr> +createLoopUnrollAndJamPass(int unrollJamFactor = -1); + +/// Creates a pass to vectorize loops, operations and data types using a +/// target-independent, n-D super-vector abstraction. +std::unique_ptr> +createSuperVectorizePass(ArrayRef virtualVectorSize); + +} // end namespace mlir + +#endif // MLIR_DIALECT_AFFINE_TRANSFORMS_PASSES_H diff --git a/mlir/include/mlir/Dialect/AffineOps/CMakeLists.txt b/mlir/include/mlir/Dialect/AffineOps/CMakeLists.txt deleted file mode 100644 index 155e066a47259..0000000000000 --- a/mlir/include/mlir/Dialect/AffineOps/CMakeLists.txt +++ /dev/null @@ -1 +0,0 @@ -add_mlir_dialect(AffineOps affine AffineOps) diff --git a/mlir/include/mlir/Dialect/CMakeLists.txt b/mlir/include/mlir/Dialect/CMakeLists.txt index 27cbe93783469..aabb2fdc6e839 100644 --- a/mlir/include/mlir/Dialect/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/CMakeLists.txt @@ -1,4 +1,5 @@ -add_subdirectory(AffineOps) +add_subdirectory(Affine) +add_subdirectory(AVX512) add_subdirectory(FxpMathOps) add_subdirectory(GPU) add_subdirectory(Linalg) diff --git a/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt b/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt index cff7f3ea1548c..2a493d6c1b204 100644 --- a/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/FxpMathOps/CMakeLists.txt @@ -1 +1,2 @@ -add_mlir_dialect(FxpMathOps fxpmath FxpMathOps) +add_mlir_dialect(FxpMathOps fxpmath) +add_mlir_doc(FxpMathOps -gen-dialect-doc FxpMathDialect Dialects/) diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt index bb4c4f5d34c49..d341303d62da2 100644 --- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt @@ -1 +1,12 @@ add_mlir_dialect(GPUOps gpu GPUOps) +add_mlir_doc(GPUOps -gen-dialect-doc GPUDialect Dialects/) + +set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td) +mlir_tablegen(ParallelLoopMapperAttr.h.inc -gen-struct-attr-decls) +mlir_tablegen(ParallelLoopMapperAttr.cpp.inc -gen-struct-attr-defs) +add_public_tablegen_target(MLIRParallelLoopMapperAttrGen) + +set(LLVM_TARGET_DEFINITIONS
ParallelLoopMapperAttr.td) +mlir_tablegen(ParallelLoopMapperEnums.h.inc -gen-enum-decls) +mlir_tablegen(ParallelLoopMapperEnums.cpp.inc -gen-enum-defs) +add_public_tablegen_target(MLIRParallelLoopMapperEnumsGen) diff --git a/mlir/include/mlir/Dialect/GPU/GPUBase.td b/mlir/include/mlir/Dialect/GPU/GPUBase.td new file mode 100644 index 0000000000000..39e2f1a940d95 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/GPUBase.td @@ -0,0 +1,58 @@ +//===-- GPUBase.td - GPU dialect definitions ---------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the GPU dialect +// +//===----------------------------------------------------------------------===// + +#ifndef GPU_BASE +#define GPU_BASE + +include "mlir/IR/OpBase.td" + +//===----------------------------------------------------------------------===// +// GPU Dialect. +//===----------------------------------------------------------------------===// + +def GPU_Dialect : Dialect { + let name = "gpu"; + let hasOperationAttrVerify = 1; + + let extraClassDeclaration = [{ + /// Get the name of the attribute used to annotate the modules that contain + /// kernel modules. + static StringRef getContainerModuleAttrName() { + return "gpu.container_module"; + } + /// Get the name of the attribute used to annotate external kernel + /// functions. + static StringRef getKernelFuncAttrName() { return "gpu.kernel"; } + + /// Get the name of the attribute used to annotate kernel modules. + static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; } + + /// Returns whether the given function is a kernel function, i.e., has the + /// 'gpu.kernel' attribute. + static bool isKernel(Operation *op); + + /// Returns the number of workgroup (thread, block) dimensions supported in + /// the GPU dialect. + // TODO(zinenko,herhut): consider generalizing this. + static unsigned getNumWorkgroupDimensions() { return 3; } + + /// Returns the numeric value used to identify the workgroup memory address + /// space. + static unsigned getWorkgroupAddressSpace() { return 3; } + + /// Returns the numeric value used to identify the private memory address + /// space. + static unsigned getPrivateAddressSpace() { return 5; } + }]; +} + +#endif // GPU_BASE diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td index 659c10142e815..6feaf82405f0f 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -13,6 +13,7 @@ #ifndef GPU_OPS #define GPU_OPS +include "mlir/Dialect/GPU/GPUBase.td" include "mlir/Dialect/LLVMIR/LLVMOpBase.td" include "mlir/Interfaces/SideEffects.td" @@ -26,42 +27,6 @@ def IntLikeOrLLVMInt : TypeConstraint< // GPU Dialect operations. //===----------------------------------------------------------------------===// -def GPU_Dialect : Dialect { - let name = "gpu"; - let hasOperationAttrVerify = 1; - - let extraClassDeclaration = [{ - /// Get the name of the attribute used to annotate the modules that contain - /// kernel modules. - static StringRef getContainerModuleAttrName() { - return "gpu.container_module"; - } - /// Get the name of the attribute used to annotate external kernel - /// functions. 
- static StringRef getKernelFuncAttrName() { return "gpu.kernel"; } - - /// Get the name of the attribute used to annotate kernel modules. - static StringRef getKernelModuleAttrName() { return "gpu.kernel_module"; } - - /// Returns whether the given function is a kernel function, i.e., has the - /// 'gpu.kernel' attribute. - static bool isKernel(Operation *op); - - /// Returns the number of workgroup (thread, block) dimensions supported in - /// the GPU dialect. - // TODO(zinenko,herhut): consider generalizing this. - static unsigned getNumWorkgroupDimensions() { return 3; } - - /// Returns the numeric value used to identify the workgroup memory address - /// space. - static unsigned getWorkgroupAddressSpace() { return 3; } - - /// Returns the numeric value used to identify the private memory address - /// space. - static unsigned getPrivateAddressSpace() { return 5; } - }]; -} - class GPU_Op traits = []> : Op; diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h index 92fb09ff60208..6bbcafb919253 100644 --- a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h +++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h @@ -14,28 +14,48 @@ #ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H #define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H +#include "mlir/IR/Attributes.h" +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/DenseMap.h" + +#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.h.inc" + namespace mlir { +class AffineMap; +struct LogicalResult; +class Operation; class Region; +#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.h.inc" + +namespace loop { +class ParallelOp; +} + namespace gpu { /// Name of the mapping attribute produced by loop mappers. -static constexpr const char *kMappingAttributeName = "mapping"; -/// Name of the processor sub-attribute that identifies the hardware id -/// to map a loop to. -static constexpr const char *kProcessorEntryName = "processor"; -/// Name of the map sub-attribute that identifies the affine map to apply -/// to the hardware id to compute the iteration number of the loop. This -/// map is expected to be extended by step and lower bound computations: -/// index = map(hardware_id) * step + lowerbound -static constexpr const char *kIndexMapEntryName = "map"; -/// Name of the bound sub-attribute that itendities the affine map to -/// compute an upper bound of iterations for the hardware id. This is -/// applied to an upper bound on the number of iterations: -/// launchBound = bound(upperbound-lowerbound ceildiv step) -static constexpr const char *kBoundMapEntryName = "bound"; +StringRef getMappingAttrName(); +/// Get the value of the processor in the ParallelLoopDimMapping attribute. +inline Processor getProcessor(ParallelLoopDimMapping attr) { + return static_cast(attr.processor().getInt()); +} + +/// Helper function to create a ParallelDimMapperAttr. +/// TODO(ravishankarm/antiagainst): Replace its uses with an auto-gened method. +ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor, + AffineMap map, + AffineMap bound); + +/// Sets the mapping attribute of a loop.parallel operation. Verifies that the +/// mapping passed is valid. +/// - the number of DimMapperAttr provided is same as the number of loops of +/// the `ploopOp`. +/// - the mapping does not map multiple loops to the same processor. +LogicalResult setMappingAttr(loop::ParallelOp ploopOp, + ArrayRef mapping); } // end namespace gpu /// Maps the parallel loops found in the given function to workgroups. 
The first @@ -46,5 +66,4 @@ static constexpr const char *kBoundMapEntryName = "bound"; void greedilyMapParallelLoopsToGPU(Region ®ion); } // end namespace mlir - #endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td new file mode 100644 index 0000000000000..1bfdfe5ebcfc2 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td @@ -0,0 +1,51 @@ +//===-- ParallelLoopMapperAttr.td - Attribute definition ---*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines the attribute used for driving conversion from loop.parallel to +// gpu.launch operations +// +//===----------------------------------------------------------------------===// + +#ifndef PARALLEL_LOOP_MAPPER_ATTR +#define PARALLEL_LOOP_MAPPER_ATTR + +include "mlir/Dialect/Affine/IR/AffineOpsBase.td" +include "mlir/Dialect/GPU/GPUBase.td" + +def BlockX : I64EnumAttrCase<"BlockX", 0>; +def BlockY : I64EnumAttrCase<"BlockY", 1>; +def BlockZ : I64EnumAttrCase<"BlockZ", 2>; +def ThreadX : I64EnumAttrCase<"ThreadX", 3>; +def ThreadY : I64EnumAttrCase<"ThreadY", 4>; +def ThreadZ : I64EnumAttrCase<"ThreadZ", 5>; +def Sequential : I64EnumAttrCase<"Sequential", 6>; + +def ProcessorAttr : I64EnumAttr<"Processor", "processor for loop mapping", [ + BlockX, BlockY, BlockZ, ThreadX, ThreadY, ThreadZ, Sequential]> { + let cppNamespace = "::mlir::gpu"; +} + +// Attribute that drives conversion of a loop.parallel to gpu.launch +// operation. +// processor: the hardware id to map to. +// map : An affine map that is used to pre-process hardware ids before +// substitution. +// bound : An affine map that is used to compute the bound of the hardware +// id based on an upper bound of the number of iterations. 
+def ParallelLoopDimMappingAttr : + StructAttr<"ParallelLoopDimMapping", GPU_Dialect, + [StructFieldAttr<"processor", ProcessorAttr>, + StructFieldAttr<"map", AffineMapAttr>, + StructFieldAttr<"bound", AffineMapAttr>]>; + + +def ParallelLoopMappingAttr : + TypedArrayAttrBase; + +#endif // PARALLEL_LOOP_MAPPER_ATTR diff --git a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt index 796b4a68a2b14..d7e581b1b9492 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt @@ -6,8 +6,10 @@ mlir_tablegen(LLVMOpsEnums.h.inc -gen-enum-decls) mlir_tablegen(LLVMOpsEnums.cpp.inc -gen-enum-defs) add_public_tablegen_target(MLIRLLVMOpsIncGen) -add_mlir_dialect(NVVMOps nvvm NVVMOps) -add_mlir_dialect(ROCDLOps rocdl ROCDLOps) +add_mlir_dialect(NVVMOps nvvm) +add_mlir_doc(NVVMOps -gen-dialect-doc NVVMDialect Dialects/) +add_mlir_dialect(ROCDLOps rocdl) +add_mlir_doc(ROCDLOps -gen-dialect-doc ROCDLDialect Dialects/) set(LLVM_TARGET_DEFINITIONS LLVMOps.td) mlir_tablegen(LLVMConversions.inc -gen-llvmir-conversions) @@ -20,3 +22,9 @@ add_public_tablegen_target(MLIRNVVMConversionsIncGen) set(LLVM_TARGET_DEFINITIONS ROCDLOps.td) mlir_tablegen(ROCDLConversions.inc -gen-llvmir-conversions) add_public_tablegen_target(MLIRROCDLConversionsIncGen) + +add_mlir_dialect(LLVMAVX512 llvm_avx512 LLVMAVX512) + +set(LLVM_TARGET_DEFINITIONS LLVMAVX512.td) +mlir_tablegen(LLVMAVX512Conversions.inc -gen-llvmir-conversions) +add_public_tablegen_target(MLIRLLVMAVX512ConversionsIncGen) diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAVX512.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMAVX512.td new file mode 100644 index 0000000000000..12668c4da41be --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAVX512.td @@ -0,0 +1,52 @@ +//===-- LLVMAVX512.td - LLVMAVX512 dialect op definitions --*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the basic operations for the LLVMAVX512 dialect. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVMIR_AVX512_OPS +#define LLVMIR_AVX512_OPS + +include "mlir/Dialect/LLVMIR/LLVMOpBase.td" + +//===----------------------------------------------------------------------===// +// LLVMAVX512 dialect definition +//===----------------------------------------------------------------------===// + +def LLVMAVX512_Dialect : Dialect { + let name = "llvm_avx512"; + let cppNamespace = "LLVM"; +} + +//----------------------------------------------------------------------------// +// MLIR LLVM AVX512 intrinsics using the MLIR LLVM Dialect type system +//----------------------------------------------------------------------------// + +class LLVMAVX512_IntrOp traits = []> : + LLVM_IntrOpBase; + +def LLVM_x86_avx512_mask_rndscale_ps_512 : + LLVMAVX512_IntrOp<"mask.rndscale.ps.512">, + Arguments<(ins LLVM_Type, LLVM_Type, LLVM_Type, LLVM_Type, LLVM_Type)>; + +def LLVM_x86_avx512_mask_rndscale_pd_512 : + LLVMAVX512_IntrOp<"mask.rndscale.pd.512">, + Arguments<(ins LLVM_Type, LLVM_Type, LLVM_Type, LLVM_Type, LLVM_Type)>; + +def LLVM_x86_avx512_mask_scalef_ps_512 : + LLVMAVX512_IntrOp<"mask.scalef.ps.512">, + Arguments<(ins LLVM_Type, LLVM_Type, LLVM_Type, LLVM_Type, LLVM_Type)>; + +def LLVM_x86_avx512_mask_scalef_pd_512 : + LLVMAVX512_IntrOp<"mask.scalef.pd.512">, + Arguments<(ins LLVM_Type, LLVM_Type, LLVM_Type, LLVM_Type, LLVM_Type)>; + +#endif // AVX512_OPS diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h new file mode 100644 index 0000000000000..27b98fd189107 --- /dev/null +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h @@ -0,0 +1,30 @@ +//===- LLVMAVX512Dialect.h - MLIR Dialect for LLVMAVX512 --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the Target dialect for LLVMAVX512 in MLIR. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_LLVMIR_LLVMAVX512DIALECT_H_ +#define MLIR_DIALECT_LLVMIR_LLVMAVX512DIALECT_H_ + +#include "mlir/IR/Dialect.h" +#include "mlir/IR/OpDefinition.h" + +namespace mlir { +namespace LLVM { + +#define GET_OP_CLASSES +#include "mlir/Dialect/LLVMIR/LLVMAVX512.h.inc" + +#include "mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h.inc" + +} // namespace LLVM +} // namespace mlir + +#endif // MLIR_DIALECT_LLVMIR_LLVMAVX512DIALECT_H_ diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index c734526ce9430..53dea5bec65a5 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -506,6 +506,18 @@ def LLVM_ReturnOp : LLVM_TerminatorOp<"return", [NoSideEffect]>, let parser = [{ return parseReturnOp(parser, result); }]; let printer = [{ printReturnOp(p, *this); }]; } +def LLVM_ResumeOp : LLVM_TerminatorOp<"resume", []> { + let arguments = (ins LLVM_Type:$value); + string llvmBuilder = [{ builder.CreateResume($value); }]; + let verifier = [{ + if (!isa_and_nonnull(value().getDefiningOp())) + return emitOpError("expects landingpad value as operand"); + // No check for personality of function - landingpad op verifies it. 
+ return success(); + }]; + + let assemblyFormat = "$value attr-dict `:` type($value)"; +} def LLVM_UnreachableOp : LLVM_TerminatorOp<"unreachable", []> { string llvmBuilder = [{ builder.CreateUnreachable(); }]; let parser = [{ return success(); }]; @@ -650,7 +662,8 @@ def LLVM_GlobalOp def LLVM_LLVMFuncOp : LLVM_ZeroResultOp<"func", [IsolatedFromAbove, FunctionLike, Symbol]>, Arguments<(ins DefaultValuedAttr:$linkage)> { + "Linkage::External">:$linkage, + OptionalAttr:$personality)> { let summary = "LLVM dialect function, has wrapped LLVM IR function type"; let regions = (region AnyRegion:$body); diff --git a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt index 212e3022262db..2c8c33ff62a64 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Linalg/IR/CMakeLists.txt @@ -1,4 +1,5 @@ -add_mlir_dialect(LinalgOps linalg LinalgDoc) +add_mlir_dialect(LinalgOps linalg) +add_mlir_doc(LinalgDoc -gen-dialect-doc LinalgDialect Dialects/) set(LLVM_TARGET_DEFINITIONS LinalgStructuredOps.td) mlir_tablegen(LinalgStructuredOps.h.inc -gen-op-decls) mlir_tablegen(LinalgStructuredOps.cpp.inc -gen-op-defs) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td index 90489299bafef..dc0c03f26cc84 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td @@ -13,7 +13,7 @@ #ifndef LINALG_OPS #define LINALG_OPS -include "mlir/Dialect/AffineOps/AffineOpsBase.td" +include "mlir/Dialect/Affine/IR/AffineOpsBase.td" include "mlir/Dialect/Linalg/IR/LinalgBase.td" include "mlir/Interfaces/SideEffects.td" diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 457a8db7788f5..5c8590fc60635 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -14,7 +14,7 @@ #ifndef LINALG_STRUCTURED_OPS #define LINALG_STRUCTURED_OPS -include "mlir/Dialect/AffineOps/AffineOpsBase.td" +include "mlir/Dialect/Affine/IR/AffineOpsBase.td" include "mlir/Dialect/Linalg/IR/LinalgBase.td" include "mlir/Dialect/Linalg/IR/LinalgStructuredOpsInterface.td" diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td b/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td index c080ff2066d02..7fa33e4f29823 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td @@ -15,7 +15,7 @@ include "mlir/Dialect/Linalg/IR/LinalgOps.td" include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.td" -include "mlir/Dialect/AffineOps/AffineOps.td" +include "mlir/Dialect/Affine/IR/AffineOps.td" def HasNoLinalgTransformMarker : CPred<[{ !op.getAttrOfType(LinalgTransforms::kLinalgTransformMarker) diff --git a/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt b/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt index 511c32d32a47f..4a838cc1d52da 100644 --- a/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/LoopOps/CMakeLists.txt @@ -1 +1,2 @@ -add_mlir_dialect(LoopOps loop LoopOps) +add_mlir_dialect(LoopOps loop) +add_mlir_doc(LoopOps -gen-dialect-doc LoopDialect Dialects/) diff --git a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td index 84765222ce4c4..7b010729831b8 100644 --- 
a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td +++ b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td @@ -221,19 +221,23 @@ def IfOp : Loop_Op<"if", let skipDefaultBuilders = 1; let builders = [ OpBuilder<"Builder *builder, OperationState &result, " - "Value cond, bool withElseRegion"> + "Value cond, bool withElseRegion">, + OpBuilder<"Builder *builder, OperationState &result, " + "TypeRange resultTypes, Value cond, bool withElseRegion"> ]; let extraClassDeclaration = [{ OpBuilder getThenBodyBuilder() { assert(!thenRegion().empty() && "Unexpected empty 'then' region."); Block &body = thenRegion().front(); - return OpBuilder(&body, std::prev(body.end())); + return OpBuilder(&body, + results().empty() ? std::prev(body.end()) : body.end()); } OpBuilder getElseBodyBuilder() { assert(!elseRegion().empty() && "Unexpected empty 'else' region."); Block &body = elseRegion().front(); - return OpBuilder(&body, std::prev(body.end())); + return OpBuilder(&body, + results().empty() ? std::prev(body.end()) : body.end()); } }]; } diff --git a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt index 0362a631dc2e4..0df7a04f52b9e 100644 --- a/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/OpenMP/CMakeLists.txt @@ -1 +1,2 @@ -add_mlir_dialect(OpenMPOps omp OpenMPOps) +add_mlir_dialect(OpenMPOps omp) +add_mlir_doc(OpenMPOps -gen-dialect-doc OpenMPDialect Dialects/) diff --git a/mlir/include/mlir/Dialect/Quant/CMakeLists.txt b/mlir/include/mlir/Dialect/Quant/CMakeLists.txt index 87a9fd6a30662..b18726736e942 100644 --- a/mlir/include/mlir/Dialect/Quant/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Quant/CMakeLists.txt @@ -1 +1,2 @@ -add_mlir_dialect(QuantOps quant QuantOps) +add_mlir_dialect(QuantOps quant) +add_mlir_doc(QuantOps -gen-dialect-doc QuantDialect Dialects/) diff --git a/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt b/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt index 8d297cc921173..771d4c1a43bbb 100644 --- a/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/SPIRV/CMakeLists.txt @@ -1,4 +1,5 @@ -add_mlir_dialect(SPIRVOps spv SPIRVOps) +add_mlir_dialect(SPIRVOps spv) +add_mlir_doc(SPIRVOps -gen-dialect-doc SPIRVDialect Dialects/) set(LLVM_TARGET_DEFINITIONS SPIRVBase.td) mlir_tablegen(SPIRVEnums.h.inc -gen-enum-decls) diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td index 26d8f1401c32e..b6715dc9fcd7a 100644 --- a/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td @@ -52,16 +52,6 @@ def SPIRV_Dialect : Dialect { let hasRegionResultAttrVerify = 1; let extraClassDeclaration = [{ - //===------------------------------------------------------------------===// - // Type - //===------------------------------------------------------------------===// - - /// Checks if the given `type` is valid in SPIR-V dialect. - static bool isValidType(Type type); - - /// Checks if the given `scalar type` is valid in SPIR-V dialect. 
- static bool isValidScalarType(Type type); - //===------------------------------------------------------------------===// // Attribute //===------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h index 85b42eeea2915..ba0b7ea0714cf 100644 --- a/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVLowering.h @@ -13,6 +13,7 @@ #ifndef MLIR_DIALECT_SPIRV_SPIRVLOWERING_H #define MLIR_DIALECT_SPIRV_SPIRVLOWERING_H +#include "mlir/Dialect/SPIRV/SPIRVAttributes.h" #include "mlir/Dialect/SPIRV/SPIRVTypes.h" #include "mlir/Dialect/SPIRV/TargetAndABI.h" #include "mlir/Transforms/DialectConversion.h" @@ -22,15 +23,38 @@ namespace mlir { /// Type conversion from standard types to SPIR-V types for shader interface. /// -/// For composite types, this converter additionally performs type wrapping to +/// Non-32-bit scalar types require special hardware support that may not exist +/// on all GPUs. This is reflected in SPIR-V as that non-32-bit scalar types +/// require special capabilities or extensions. Right now if a scalar type of a +/// certain bitwidth is not supported in the target environment, we use 32-bit +/// ones unconditionally. This requires the runtime to also feed in data with +/// a matched bitwidth and layout for interface types. The runtime can do that +/// by inspecting the SPIR-V module. +/// +/// For memref types, this converter additionally performs type wrapping to /// satisfy shader interface requirements: shader interface types must be /// pointers to structs. +/// +/// TODO(antiagainst): We might want to introduce a way to control how +/// unsupported bitwidth are handled and explicitly fail if wanted. class SPIRVTypeConverter : public TypeConverter { public: - SPIRVTypeConverter(); + explicit SPIRVTypeConverter(spirv::TargetEnvAttr targetAttr); /// Gets the SPIR-V correspondence for the standard index type. static Type getIndexType(MLIRContext *context); + + /// Returns the corresponding memory space for memref given a SPIR-V storage + /// class. + static unsigned getMemorySpaceForStorageClass(spirv::StorageClass); + + /// Returns the SPIR-V storage class given a memory space for memref. Return + /// llvm::None if the memory space does not map to any SPIR-V storage class. + static Optional + getStorageClassForMemorySpace(unsigned space); + +private: + spirv::TargetEnv targetEnv; }; /// Base class to define a conversion pattern to lower `SourceOp` into SPIR-V. @@ -61,11 +85,10 @@ class FuncOp; class SPIRVConversionTarget : public ConversionTarget { public: /// Creates a SPIR-V conversion target for the given target environment. - static std::unique_ptr get(TargetEnvAttr targetEnv, - MLIRContext *context); + static std::unique_ptr get(TargetEnvAttr targetAttr); private: - SPIRVConversionTarget(TargetEnvAttr targetEnv, MLIRContext *context); + explicit SPIRVConversionTarget(TargetEnvAttr targetAttr); // Be explicit that instance of this class cannot be copied or moved: there // are lambdas capturing fields of the instance. @@ -78,9 +101,7 @@ class SPIRVConversionTarget : public ConversionTarget { /// environment. bool isLegalOp(Operation *op); - Version givenVersion; /// SPIR-V version to target - llvm::SmallSet givenExtensions; /// Allowed extensions - llvm::SmallSet givenCapabilities; /// Allowed capabilities + TargetEnv targetEnv; }; /// Returns the value for the given `builtin` variable. 
This function gets or diff --git a/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h b/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h index 385e79a0445eb..85b35f73f82c5 100644 --- a/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h +++ b/mlir/include/mlir/Dialect/SPIRV/SPIRVTypes.h @@ -78,6 +78,8 @@ class SPIRVType : public Type { static bool classof(Type type); + bool isScalarOrVector(); + /// The extension requirements for each type are following the /// ((Extension::A OR Extension::B) AND (Extension::C OR Extension::D)) /// convention. @@ -109,6 +111,11 @@ class ScalarType : public SPIRVType { static bool classof(Type type); + /// Returns true if the given float type is valid for the SPIR-V dialect. + static bool isValid(FloatType); + /// Returns true if the given integer type is valid for the SPIR-V dialect. + static bool isValid(IntegerType); + void getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, Optional storage = llvm::None); void getCapabilities(SPIRVType::CapabilityArrayRefVector &capabilities, @@ -122,6 +129,9 @@ class CompositeType : public SPIRVType { static bool classof(Type type); + /// Returns true if the given vector type is valid for the SPIR-V dialect. + static bool isValid(VectorType); + unsigned getNumElements() const; Type getElementType(unsigned) const; diff --git a/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.h b/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.h index 5ffd00c530c6e..3f14addd9b6bb 100644 --- a/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.h +++ b/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.h @@ -15,6 +15,7 @@ #include "mlir/Dialect/SPIRV/SPIRVAttributes.h" #include "mlir/Support/LLVM.h" +#include "llvm/ADT/SmallSet.h" namespace mlir { class Operation; @@ -22,13 +23,45 @@ class Operation; namespace spirv { enum class StorageClass : uint32_t; +/// A wrapper class around a spirv::TargetEnvAttr to provide query methods for +/// allowed version/capabilities/extensions. +class TargetEnv { +public: + explicit TargetEnv(TargetEnvAttr targetAttr); + + Version getVersion(); + + /// Returns true if the given capability is allowed. + bool allows(Capability) const; + /// Returns the first allowed one if any of the given capabilities is allowed. + /// Returns llvm::None otherwise. + Optional allows(ArrayRef) const; + + /// Returns true if the given extension is allowed. + bool allows(Extension) const; + /// Returns the first allowed one if any of the given extensions is allowed. + /// Returns llvm::None otherwise. + Optional allows(ArrayRef) const; + + /// Returns the MLIRContext. + MLIRContext *getContext() const; + + /// Allows implicitly converting to the underlying spirv::TargetEnvAttr. + operator TargetEnvAttr() const { return targetAttr; } + +private: + TargetEnvAttr targetAttr; + llvm::SmallSet givenExtensions; /// Allowed extensions + llvm::SmallSet givenCapabilities; /// Allowed capabilities +}; + /// Returns the attribute name for specifying argument ABI information. StringRef getInterfaceVarABIAttrName(); /// Gets the InterfaceVarABIAttr given its fields. InterfaceVarABIAttr getInterfaceVarABIAttr(unsigned descriptorSet, unsigned binding, - StorageClass storageClass, + Optional storageClass, MLIRContext *context); /// Returns the attribute name for specifying entry point information.
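The SPIR-V lowering pieces above are now all driven by a single spirv::TargetEnvAttr: the new spirv::TargetEnv wrapper answers version/extension/capability queries, SPIRVTypeConverter is constructed from the attribute, and SPIRVConversionTarget::get takes the same attribute. Below is a minimal, hypothetical caller-side sketch of how these might be wired together; it assumes a spirv::TargetEnvAttr named `targetAttr` has already been obtained (e.g. from the module's `spv.target_env` attribute) and omits pattern population and error handling.

```cpp
#include "mlir/Dialect/SPIRV/SPIRVLowering.h"
#include "mlir/Dialect/SPIRV/TargetAndABI.h"

using namespace mlir;

// Hypothetical helper (not from the patch): sets up target-aware SPIR-V
// lowering state from a target environment attribute.
static void prepareSPIRVLowering(spirv::TargetEnvAttr targetAttr) {
  // Query wrapper added by this change; allowed-version/extension/capability
  // checks all go through one object.
  spirv::TargetEnv targetEnv(targetAttr);
  if (!targetEnv.allows(spirv::Capability::Float64)) {
    // Without Float64, the type converter falls back to 32-bit emulation as
    // described in the SPIRVTypeConverter comment above.
  }

  // Both the type converter and the conversion target are constructed from
  // the same attribute, so they agree on what the target supports.
  SPIRVTypeConverter typeConverter(targetAttr);
  std::unique_ptr<SPIRVConversionTarget> target =
      SPIRVConversionTarget::get(targetAttr);
  (void)typeConverter;
  (void)target;
}
```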
diff --git a/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.td b/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.td index a463f0e8da95f..5d08aa1f2d7c0 100644 --- a/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.td +++ b/mlir/include/mlir/Dialect/SPIRV/TargetAndABI.td @@ -32,7 +32,7 @@ include "mlir/Dialect/SPIRV/SPIRVBase.td" def SPV_InterfaceVarABIAttr : StructAttr<"InterfaceVarABIAttr", SPIRV_Dialect, [ StructFieldAttr<"descriptor_set", I32Attr>, StructFieldAttr<"binding", I32Attr>, - StructFieldAttr<"storage_class", SPV_StorageClassAttr> + StructFieldAttr<"storage_class", OptionalAttr> ]>; // For entry functions, this attribute specifies information related to entry diff --git a/mlir/include/mlir/Dialect/Vector/CMakeLists.txt b/mlir/include/mlir/Dialect/Vector/CMakeLists.txt index 4977e117e7e03..a27eef693a288 100644 --- a/mlir/include/mlir/Dialect/Vector/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Vector/CMakeLists.txt @@ -1,4 +1,5 @@ -add_mlir_dialect(VectorOps vector VectorOps) +add_mlir_dialect(VectorOps vector) +add_mlir_doc(VectorOps -gen-op-doc VectorOps Dialects/) set(LLVM_TARGET_DEFINITIONS VectorTransformPatterns.td) mlir_tablegen(VectorTransformPatterns.h.inc -gen-rewriters) diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.h b/mlir/include/mlir/Dialect/Vector/VectorOps.h index 50fa0150ba537..2a8835102d596 100644 --- a/mlir/include/mlir/Dialect/Vector/VectorOps.h +++ b/mlir/include/mlir/Dialect/Vector/VectorOps.h @@ -53,7 +53,9 @@ void populateVectorSlicesLoweringPatterns(OwningRewritePatternList &patterns, /// Collect a set of transformation patterns that are related to contracting /// or expanding vector operations: /// ContractionOpLowering, -/// ShapeCastOp2DDownCastRewritePattern, ShapeCastOp2DUpCastRewritePattern +/// ShapeCastOp2DDownCastRewritePattern, +/// ShapeCastOp2DUpCastRewritePattern +/// TransposeOpLowering /// OuterproductOpLowering /// These transformation express higher level vector ops in terms of more /// elementary extraction, insertion, reduction, product, and broadcast ops. diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.td b/mlir/include/mlir/Dialect/Vector/VectorOps.td index 2a791365db4fa..2e895e63ba27e 100644 --- a/mlir/include/mlir/Dialect/Vector/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/VectorOps.td @@ -13,7 +13,7 @@ #ifndef VECTOR_OPS #define VECTOR_OPS -include "mlir/Dialect/AffineOps/AffineOpsBase.td" +include "mlir/Dialect/Affine/IR/AffineOpsBase.td" include "mlir/Interfaces/SideEffects.td" def Vector_Dialect : Dialect { @@ -88,7 +88,7 @@ def Vector_ContractionOp : iterator in the iterator type list, to each dimension of an N-D vector. Examples: - + ```mlir // Simple dot product (K = 0). #contraction_accesses = [ affine_map<(i) -> (i)>, @@ -139,6 +139,7 @@ def Vector_ContractionOp : %5 = vector.contract #contraction_trait %0, %1, %2, %lhs_mask, %rhs_mask : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x8x5xf32> + ``` }]; let builders = [OpBuilder< "Builder *builder, OperationState &result, Value lhs, Value rhs, " @@ -203,7 +204,7 @@ def Vector_ReductionOp : http://llvm.org/docs/LangRef.html#experimental-vector-reduction-intrinsics Examples: - ``` + ```mlir %1 = vector.reduction "add", %0 : vector<16xf32> into f32 %3 = vector.reduction "xor", %2 : vector<4xi32> into i32 @@ -247,7 +248,7 @@ def Vector_BroadcastOp : shaped vector with the same element type is always legal. 
Examples: - ``` + ```mlir %0 = constant 0.0 : f32 %1 = vector.broadcast %0 : f32 to vector<16xf32> %2 = vector.broadcast %1 : vector<16xf32> to vector<4x16xf32> @@ -290,7 +291,7 @@ def Vector_ShuffleOp : above, all mask values are in the range [0,s_1+t_1) Examples: - ``` + ```mlir %0 = vector.shuffle %a, %b[0, 3] : vector<2xf32>, vector<2xf32> ; yields vector<2xf32> %1 = vector.shuffle %c, %b[0, 1, 2] @@ -332,7 +333,7 @@ def Vector_ExtractElementOp : https://llvm.org/docs/LangRef.html#extractelement-instruction Example: - ``` + ```mlir %c = constant 15 : i32 %1 = vector.extractelement %0[%c : i32]: vector<16xf32> ``` @@ -360,7 +361,7 @@ def Vector_ExtractOp : the proper position. Degenerates to an element type in the 0-D case. Examples: - ``` + ```mlir %1 = vector.extract %0[3]: vector<4x8x16xf32> %2 = vector.extract %0[3, 3, 3]: vector<4x8x16xf32> ``` @@ -396,7 +397,7 @@ def Vector_ExtractSlicesOp : Currently, only unit strides are supported. Examples: - ``` + ```mlir %0 = vector.transfer_read ...: vector<4x2xf32> %1 = vector.extract_slices %0, [2, 2], [1, 1] @@ -448,8 +449,7 @@ def Vector_FMAOp : to the `llvm.fma.*` intrinsic. Example: - - ``` + ```mlir %3 = vector.fma %0, %1, %2: vector<8x16xf32> ``` }]; @@ -483,7 +483,7 @@ def Vector_InsertElementOp : https://llvm.org/docs/LangRef.html#insertelement-instruction Example: - ``` + ```mlir %c = constant 15 : i32 %f = constant 0.0f : f32 %1 = vector.insertelement %f, %0[%c : i32]: vector<16xf32> @@ -516,7 +516,7 @@ def Vector_InsertOp : position. Degenerates to a scalar source type when n = 0. Examples: - ``` + ```mlir %2 = vector.insert %0, %1[3]: vector<8x16xf32> into vector<4x8x16xf32> %5 = vector.insert %3, %4[3, 3, 3]: @@ -559,7 +559,7 @@ def Vector_InsertSlicesOp : Currently, only unit strides are supported. Examples: - ``` + ```mlir %0 = vector.extract_slices %0, [2, 2], [1, 1] : vector<4x2xf32> into tuple, vector<2x2xf32>> @@ -617,7 +617,7 @@ def Vector_InsertStridedSliceOp : the proper location as specified by the offsets. Examples: - ``` + ```mlir %2 = vector.insert_strided_slice %0, %1 {offsets = [0, 0, 2], strides = [1, 1]}: vector<2x4xf32> into vector<16x4x8xf32> @@ -659,8 +659,7 @@ def Vector_OuterProductOp : lower to actual `fma` instructions on x86. Examples: - - ``` + ```mlir %2 = vector.outerproduct %0, %1: vector<4xf32>, vector<8xf32> return %2: vector<4x8xf32> @@ -709,8 +708,8 @@ def Vector_ReshapeOp : In the examples below, valid data elements are represented by an alphabetic character, and undefined data elements are represented by '-'. 
- Example - + Example: + ```mlir vector<1x8xf32> with valid data shape [6], fixed vector sizes [8] input: [a, b, c, d, e, f] @@ -719,8 +718,9 @@ def Vector_ReshapeOp : vector layout: [a, b, c, d, e, f, -, -] - Example - + ``` + Example: + ```mlir vector<2x8xf32> with valid data shape [10], fixed vector sizes [8] input: [a, b, c, d, e, f, g, h, i, j] @@ -729,9 +729,9 @@ def Vector_ReshapeOp : vector layout: [[a, b, c, d, e, f, g, h], [i, j, -, -, -, -, -, -]] - - Example - + ``` + Example: + ```mlir vector<2x2x2x3xf32> with valid data shape [3, 5], fixed vector sizes [2, 3] @@ -750,9 +750,9 @@ def Vector_ReshapeOp : [-, -, -]] [[n, o, -], [-, -, -]]]] - - Example - + ``` + Example: + ```mlir %1 = vector.reshape %0, [%c3, %c6], [%c2, %c9], [4] : vector<3x2x4xf32> to vector<2x3x4xf32> @@ -776,6 +776,7 @@ def Vector_ReshapeOp : [[j, k, l, m], [n, o, p, q], [r, -, -, -]]] + ``` }]; let extraClassDeclaration = [{ @@ -828,7 +829,7 @@ def Vector_StridedSliceOp : `offsets` and ending at `offsets + sizes`. Examples: - ``` + ```mlir %1 = vector.strided_slice %0 {offsets = [0, 2], sizes = [2, 4], strides = [1, 1]}: vector<4x8x16xf32> to vector<2x4x16xf32> @@ -947,13 +948,12 @@ def Vector_TransferReadOp : implemented using a warp-shuffle if loop `j` were mapped to `threadIdx.x`. Syntax - ``` + ```mlir operation ::= ssa-id `=` `vector.transfer_read` ssa-use-list `{` attribute-entry `} :` memref-type `,` vector-type ``` Examples: - ```mlir // Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into vector<32x256xf32> // and pad with %f0 to handle the boundary case: @@ -1028,7 +1028,7 @@ def Vector_TransferWriteOp : Syntax: - ``` + ```mlir operation ::= `vector.transfer_write` ssa-use-list `{` attribute-entry `} : ` vector-type ', ' memref-type ' ``` @@ -1139,7 +1139,7 @@ def Vector_TypeCastOp : Syntax: - ``` + ```mlir operation ::= `vector.type_cast` ssa-use : memref-type to memref-type ``` @@ -1183,8 +1183,10 @@ def Vector_ConstantMaskOp : define a hyper-rectangular region within which elements values are set to 1 (otherwise element values are set to 0). - Example: create a constant vector mask of size 4x3xi1 with elements in range - 0 <= row <= 2 and 0 <= col <= 1 are set to 1 (others to 0). + Example: + ``` + create a constant vector mask of size 4x3xi1 with elements in range + 0 <= row <= 2 and 0 <= col <= 1 are set to 1 (others to 0). %1 = vector.constant_mask [3, 2] : vector<4x3xi1> @@ -1196,6 +1198,7 @@ def Vector_ConstantMaskOp : rows 1 | 1 1 0 2 | 1 1 0 3 | 0 0 0 + ``` }]; let extraClassDeclaration = [{ @@ -1217,8 +1220,10 @@ def Vector_CreateMaskOp : hyper-rectangular region within which elements values are set to 1 (otherwise element values are set to 0). - Example: create a vector mask of size 4x3xi1 where elements in range - 0 <= row <= 2 and 0 <= col <= 1 are set to 1 (others to 0). + Example: + ``` + create a vector mask of size 4x3xi1 where elements in range + 0 <= row <= 2 and 0 <= col <= 1 are set to 1 (others to 0). %1 = vector.create_mask %c3, %c2 : vector<4x3xi1> @@ -1230,6 +1235,7 @@ def Vector_CreateMaskOp : rows 1 | 1 1 0 2 | 1 1 0 3 | 0 0 0 + ``` }]; let hasCanonicalizer = 1; @@ -1248,9 +1254,8 @@ def Vector_TupleOp : transformation and should be removed before lowering to lower-level dialects. - Examples: - ``` + ```mlir %0 = vector.transfer_read ... : vector<2x2xf32> %1 = vector.transfer_read ... : vector<2x1xf32> %2 = vector.transfer_read ... 
: vector<2x2xf32> @@ -1269,6 +1274,46 @@ def Vector_TupleOp : }]; } +def Vector_TransposeOp : + Vector_Op<"transpose", [NoSideEffect, + PredOpTrait<"operand and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>]>, + Arguments<(ins AnyVector:$vector, I64ArrayAttr:$transp)>, + Results<(outs AnyVector:$result)> { + let summary = "vector transpose operation"; + let description = [{ + Takes an n-D vector and returns the transposed n-D vector defined by + the permutation of ranks in the n-sized integer array attribute. + In the operation + ```mlir + %1 = vector.transpose %0, [i_1, .., i_n] + : vector + to vector + ``` + the transp array [i_1, .., i_n] must be a permutation of [0, .., n-1]. + + Example: + ```mlir + %1 = vector.transpose %0, [1, 0] : vector<2x3xf32> to vector<3x2xf32> + + [ [a, b, c], [ [a, d], + [d, e, f] ] -> [b, e], + [c, f] ] + ``` + }]; + let extraClassDeclaration = [{ + VectorType getVectorType() { + return vector().getType().cast(); + } + VectorType getResultType() { + return result().getType().cast(); + } + }]; + let assemblyFormat = [{ + $vector `,` $transp attr-dict `:` type($vector) `to` type($result) + }]; +} + def Vector_TupleGetOp : Vector_Op<"tuple_get", [NoSideEffect]>, Arguments<(ins TupleOf<[AnyVector]>:$vectors, APIntAttr:$index)>, @@ -1282,7 +1327,7 @@ def Vector_TupleGetOp : dialects. Examples: - ``` + ```mlir %4 = vector.tuple %0, %1, %2, %3 : vector<2x2xf32>, vector<2x1xf32>, vector<2x2xf32>, vector<2x1xf32>> @@ -1312,7 +1357,7 @@ def Vector_PrintOp : format (for testing and debugging). No return value. Examples: - ``` + ```mlir %0 = constant 0.0 : f32 %1 = vector.broadcast %0 : f32 to vector<4xf32> vector.print %1 : vector<4xf32> @@ -1375,7 +1420,7 @@ def Vector_MatmulOp : Vector_Op<"matrix_multiply", [NoSideEffect, Example: - ``` + ```mlir %C = vector.matrix_multiply %A, %B { lhs_rows = 4: i32, lhs_columns = 16: i32 , rhs_columns = 3: i32 } : (vector<64xf64>, vector<48xf64>) -> vector<12xf64> diff --git a/mlir/include/mlir/Dialect/VectorOps/VectorOps.td b/mlir/include/mlir/Dialect/VectorOps/VectorOps.td new file mode 100644 index 0000000000000..ef1fde00e5ec2 --- /dev/null +++ b/mlir/include/mlir/Dialect/VectorOps/VectorOps.td @@ -0,0 +1,1402 @@ +//===- VectorOps.td - Vector op definitions ---------------*- tablegen -*-====// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines MLIR vector operations. +// +//===----------------------------------------------------------------------===// + +#ifndef VECTOR_OPS +#define VECTOR_OPS + +include "mlir/Dialect/Affine/IR/AffineOpsBase.td" +include "mlir/Interfaces/SideEffects.td" + + +def VectorOps_Dialect : Dialect { + let name = "vector"; + let cppNamespace = "vector"; + let hasConstantMaterializer = 1; +} + +// Base class for Vector dialect ops. +class Vector_Op traits = []> : + Op { + // For every vector op, there needs to be a: + // * void print(OpAsmPrinter &p, ${C++ class of Op} op) + // * LogicalResult verify(${C++ class of Op} op) + // * ParseResult parse${C++ class of Op}(OpAsmParser &parser, + // OperationState &result) + // functions.
+ let printer = [{ return ::print(p, *this); }]; + let verifier = [{ return ::verify(*this); }]; + let parser = [{ return ::parse$cppClass(parser, result); }]; +} + +// TODO(andydavis, ntv) Add an attribute to specify a different algebra +// with operators other than the current set: {*, +}. +def Vector_ContractionOp : + Vector_Op<"contract", [NoSideEffect, + PredOpTrait<"first operand lhs and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>, + PredOpTrait<"second operand rhs and result have same element type", + TCresVTEtIsSameAsOpBase<0, 1>>, + PredOpTrait<"third operand acc and result have same element type", + TCresVTEtIsSameAsOpBase<0, 1>>]>, + Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, AnyType:$acc, + Variadic>:$masks, + AffineMapArrayAttr:$indexing_maps, ArrayAttr:$iterator_types)>, + Results<(outs AnyType)> { + let summary = "vector contraction operation"; + let description = [{ + Computes the sum of products of vector elements along contracting + dimension pairs from 2 vectors of rank M and N respectively, adds this + intermediate result to the accumulator argument of rank K, and returns a + vector result of rank K (where K = num_lhs_free_dims + num_rhs_free_dims + + num_batch_dims (see dimension type descriptions below)). For K = 0 (no + free or batch dimensions), the accumulator and output are a scalar. + + Optional vector mask arguments (produced by CreateMaskOp or ConstantMaskOp) + specify the dynamic dimension sizes of valid data within the lhs/rhs vector + arguments. + + An iterator type attribute list must be specified, where each element of + the list represents an iterator with one of the following types: + + *) "reduction": reduction dimensions are present in the lhs and rhs + arguments but not in the output (and accumulator + argument). These are the dimensions along which the vector + contraction op computes the sum of products, and + contracting dimension pair dimension sizes must match + between lhs/rhs. + *) "parallel": Batch dimensions are iterator type "parallel", and + are non-contracting dimensions present in the lhs, rhs and + output. The lhs/rhs co-iterate along the batch dimensions, + which should be expressed in their indexing maps. + + Free dimensions are iterator type "parallel", and are + non-contraction, non-batch dimensions accessed by either the + lhs or rhs (but not both). The lhs and rhs free dimensions + are unrelated to each other and do not co-iterate, which + should be expressed in their indexing maps. + + An indexing map attribute list must be specified with an entry for lhs, rhs + and acc arguments. An indexing map attribute specifies a mapping from each + iterator in the iterator type list, to each dimension of an N-D vector. + + Examples: + + // Simple dot product (K = 0). + #contraction_accesses = [ + affine_map<(i) -> (i)>, + affine_map<(i) -> (i)>, + affine_map<(i) -> ()> + ] + #contraction_trait = { + indexing_maps = #contraction_accesses, + iterator_types = ["reduction"] + } + %3 = vector.contract #contraction_trait %0, %1, %2 + : vector<10xf32>, vector<10xf32> into f32 + + // 2D vector contraction with one contracting dimension (matmul, K = 2). 
+ #contraction_accesses = [ + affine_map<(i, j, k) -> (i, k)>, + affine_map<(i, j, k) -> (k, j)>, + affine_map<(i, j, k) -> (i, j)> + ] + #contraction_trait = { + indexing_maps = #contraction_accesses, + iterator_types = ["parallel", "parallel", "reduction"] + } + + %3 = vector.contract #contraction_trait %0, %1, %2 + : vector<4x3xf32>, vector<3x7xf32> into vector<4x7xf32> + + // 4D to 3D vector contraction with two contracting dimensions and + // one batch dimension (K = 3). + #contraction_accesses = [ + affine_map<(b0, f0, f1, c0, c1) -> (c0, b0, c1, f0)>, + affine_map<(b0, f0, f1, c0, c1) -> (b0, c1, c0, f1)>, + affine_map<(b0, f0, f1, c0, c1) -> (b0, f0, f1)> + ] + #contraction_trait = { + indexing_maps = #contraction_accesses, + iterator_types = ["parallel", "parallel", "parallel", + "reduction", "reduction"] + } + + %4 = vector.contract #contraction_trait %0, %1, %2 + : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x5xf32> + + // 4D vector contraction with two contracting dimensions and optional + // vector mask arguments. + %lhs_mask = vector.constant_mask [7, 8, 16, 15] : vector<7x8x16x15xi1> + %rhs_mask = vector.constant_mask [8, 16, 7, 5] : vector<8x16x7x5xi1> + + %5 = vector.contract #contraction_trait %0, %1, %2, %lhs_mask, %rhs_mask + : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x8x5xf32> + }]; + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value lhs, Value rhs, " + "Value acc, ArrayAttr indexingMaps, ArrayAttr iteratorTypes">, + OpBuilder< + "Builder *builder, OperationState &result, Value lhs, Value rhs, " + "Value acc, ArrayRef> indexingExprs, " + "ArrayRef iteratorTypes">]; + let extraClassDeclaration = [{ + VectorType getLhsType() { + return lhs().getType().cast(); + } + VectorType getRhsType() { + return rhs().getType().cast(); + } + Type getAccType() { return acc().getType(); } + VectorType getLHSVectorMaskType() { + if (llvm::size(masks()) != 2) return VectorType(); + return getOperand(3).getType().cast(); + } + VectorType getRHSVectorMaskType() { + if (llvm::size(masks()) != 2) return VectorType(); + return getOperand(4).getType().cast(); + } + Type getResultType() { return getResult().getType(); } + ArrayRef getTraitAttrNames(); + SmallVector getIndexingMaps(); + static unsigned getAccOperandIndex() { return 2; } + + // Returns the bounds of each dimension in the iteration space spanned + // by the iterator types of this operation. + void getIterationBounds(SmallVectorImpl &iterationBounds); + + // Returns a list of index maps, where there is a list entry for each + // op indexing map attribute (i.e. one for each input and output, with + // the output listed last). Each index map, maps from this operations + // iteration space, to vector dimensions of the maps input/output. + void getIterationIndexMap( + std::vector> &iterationIndexMap); + + std::vector> getContractingDimMap(); + std::vector> getBatchDimMap(); + }]; +} + +def Vector_ReductionOp : + Vector_Op<"reduction", [NoSideEffect, + PredOpTrait<"source operand and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>]>, + Arguments<(ins StrAttr:$kind, AnyVector:$vector, Variadic:$acc)>, + Results<(outs AnyType:$dest)> { + let summary = "reduction operation"; + let description = [{ + Reduces an 1-D vector "horizontally" into a scalar using the given + operation (add/mul/min/max for int/fp and and/or/xor for int only). + Some reductions (add/mul for fp) also allow an optional fused + accumulator. 
+ + Note that these operations are restricted to 1-D vectors to remain + close to the corresponding LLVM intrinsics: + + http://llvm.org/docs/LangRef.html#experimental-vector-reduction-intrinsics + + Examples: + ``` + %1 = vector.reduction "add", %0 : vector<16xf32> into f32 + + %3 = vector.reduction "xor", %2 : vector<4xi32> into i32 + + %4 = vector.reduction "mul", %0, %1 : vector<16xf32> into f32 + ``` + }]; + let extraClassDeclaration = [{ + VectorType getVectorType() { + return vector().getType().cast(); + } + }]; +} + +def Vector_BroadcastOp : + Vector_Op<"broadcast", [NoSideEffect, + PredOpTrait<"source operand and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>]>, + Arguments<(ins AnyType:$source)>, + Results<(outs AnyVector:$vector)> { + let summary = "broadcast operation"; + let description = [{ + Broadcasts the scalar or k-D vector value in the source operand + to a n-D result vector such that the broadcast makes sense, i.e., + the source operand is duplicated to match the given rank and sizes + in the result vector. The legality rules are: + * the source operand must have the same element type as the result type + * a k-D vector can be broadcast to + a n-D vector if + * k <= n, and + * the sizes in the trailing dimensions n-k < i <= n with j=i+k-n + match exactly as s_j = t_i or s_j = 1: + ``` + t_1 x .. t_n-k x t_n-k+1 x .. x t_i x .. x t_n + s_1 x .. x s_j x .. x s_k + + ``` + The source operand is duplicated over all the missing leading dimensions + and stretched over the trailing dimensions where the source has a non-equal + dimension of 1. These rules imply that any scalar broadcast (k=0) to any + shaped vector with the same element type is always legal. + + Examples: + ``` + %0 = constant 0.0 : f32 + %1 = vector.broadcast %0 : f32 to vector<16xf32> + %2 = vector.broadcast %1 : vector<16xf32> to vector<4x16xf32> + ``` + }]; + let extraClassDeclaration = [{ + Type getSourceType() { return source().getType(); } + VectorType getVectorType() { + return vector().getType().cast(); + } + }]; + let assemblyFormat = "$source attr-dict `:` type($source) `to` type($vector)"; +} + +def Vector_ShuffleOp : + Vector_Op<"shuffle", [NoSideEffect, + PredOpTrait<"first operand v1 and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>, + PredOpTrait<"second operand v2 and result have same element type", + TCresVTEtIsSameAsOpBase<0, 1>>]>, + Arguments<(ins AnyVector:$v1, AnyVector:$v2, I64ArrayAttr:$mask)>, + Results<(outs AnyVector:$vector)> { + let summary = "shuffle operation"; + let description = [{ + The shuffle operation constructs a permutation (or duplication) of elements + from two input vectors, returning a vector with the same element type as + the input and a length that is the same as the shuffle mask. The two input + vectors must have the same element type, rank, and trailing dimension sizes + and shuffles their values in the leading dimension (which may differ in size) + according to the given mask. The legality rules are: + * the two operands must have the same element type as the result + * the two operands and the result must have the same rank and trailing + dimension sizes, viz. given two k-D operands + v1 : and + v2 : + we have s_i = t_i for all 1 < i <= k + * the mask length equals the leading dimension size of the result + * numbering the input vector indices left to right across the operands, all + mask values must be within range, viz. 
given two k-D operands v1 and v2 + above, all mask values are in the range [0,s_1+t_1) + + Examples: + ``` + %0 = vector.shuffle %a, %b[0, 3] + : vector<2xf32>, vector<2xf32> ; yields vector<2xf32> + %1 = vector.shuffle %c, %b[0, 1, 2] + : vector<2x16xf32>, vector<1x16xf32> ; yields vector<3x16xf32> + %2 = vector.shuffle %a, %b[3, 2, 1, 0] + : vector<2xf32>, vector<2xf32> ; yields vector<4xf32> + + ``` + }]; + let builders = [OpBuilder<"Builder *builder, OperationState &result," + "Value v1, Value v2, ArrayRef">]; + let extraClassDeclaration = [{ + static StringRef getMaskAttrName() { return "mask"; } + VectorType getV1VectorType() { + return v1().getType().cast(); + } + VectorType getV2VectorType() { + return v2().getType().cast(); + } + VectorType getVectorType() { + return vector().getType().cast(); + } + }]; +} + +def Vector_ExtractElementOp : + Vector_Op<"extractelement", [NoSideEffect, + TypesMatchWith<"result type matches element type of vector operand", + "vector", "result", + "$_self.cast().getElementType()">]>, + Arguments<(ins AnyVector:$vector, AnySignlessInteger:$position)>, + Results<(outs AnyType:$result)> { + let summary = "extractelement operation"; + let description = [{ + Takes an 1-D vector and a dynamic index position and extracts the + scalar at that position. Note that this instruction resembles + vector.extract, but is restricted to 1-D vectors and relaxed + to dynamic indices. It is meant to be closer to LLVM's version: + https://llvm.org/docs/LangRef.html#extractelement-instruction + + Example: + ``` + %c = constant 15 : i32 + %1 = vector.extractelement %0[%c : i32]: vector<16xf32> + ``` + }]; + let extraClassDeclaration = [{ + VectorType getVectorType() { + return vector().getType().cast(); + } + }]; + + let assemblyFormat = [{ + $vector `[` $position `:` type($position) `]` attr-dict `:` type($vector) + }]; +} + +def Vector_ExtractOp : + Vector_Op<"extract", [NoSideEffect, + PredOpTrait<"operand and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>]>, + Arguments<(ins AnyVector:$vector, I64ArrayAttr:$position)>, + Results<(outs AnyType)> { + let summary = "extract operation"; + let description = [{ + Takes an n-D vector and a k-D position and extracts the (n-k)-D vector at + the proper position. Degenerates to an element type in the 0-D case. + + Examples: + ``` + %1 = vector.extract %0[3]: vector<4x8x16xf32> + %2 = vector.extract %0[3, 3, 3]: vector<4x8x16xf32> + ``` + }]; + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value source," + "ArrayRef">]; + let extraClassDeclaration = [{ + static StringRef getPositionAttrName() { return "position"; } + VectorType getVectorType() { + return vector().getType().cast(); + } + }]; +} + +def Vector_ExtractSlicesOp : + Vector_Op<"extract_slices", [NoSideEffect]>, + Arguments<(ins AnyVector:$vector, I64ArrayAttr:$sizes, + I64ArrayAttr:$strides)>, + Results<(outs TupleOf<[AnyVector]>)> { + let summary = "vector extract slices operation"; + let description = [{ + Takes an N-d vector and returns a tuple of vector slices of 'vector', + based on 'sizes' and 'strides' parameters. + + The arguments 'sizes' and 'strides' represent a specification for + generating the unrolling of 'vector' shape, which has all slices of shape + 'sizes' except for slices at dimension boundaries when 'vector' dimension + sizes are not a multiple of 'sizes'. + + Each slice is returned at the tuple element index corresponding to the + linear index of the slice w.r.t the unrolling scheme represented by 'sizes'. 
+    Currently, only unit strides are supported.
+
+    Examples:
+    ```
+      %0 = vector.transfer_read ...: vector<4x2xf32>
+
+      %1 = vector.extract_slices %0, [2, 2], [1, 1]
+        : vector<4x2xf32> into tuple<vector<2x2xf32>, vector<2x2xf32>>
+
+      // Example with partial slices at dimension boundaries.
+      %2 = vector.transfer_read ...: vector<4x3xf32>
+
+      %3 = vector.extract_slices %2, [2, 2], [1, 1]
+        : vector<4x3xf32> into tuple<vector<2x2xf32>, vector<2x1xf32>,
+                                     vector<2x2xf32>, vector<2x1xf32>>
+    ```
+  }];
+  let builders = [OpBuilder<
+    "Builder *builder, OperationState &result, TupleType tupleType, " #
+    "Value vector, ArrayRef<int64_t> sizes, " #
+    "ArrayRef<int64_t> strides">];
+  let extraClassDeclaration = [{
+    VectorType getSourceVectorType() {
+      return vector().getType().cast<VectorType>();
+    }
+    TupleType getResultTupleType() {
+      return getResult().getType().cast<TupleType>();
+    }
+    void getSizes(SmallVectorImpl<int64_t> &results);
+    void getStrides(SmallVectorImpl<int64_t> &results);
+    static StringRef getSizesAttrName() { return "sizes"; }
+    static StringRef getStridesAttrName() { return "strides"; }
+  }];
+  let assemblyFormat = [{
+    $vector `,` $sizes `,` $strides attr-dict `:` type($vector) `into`
+    type(results)
+  }];
+}
+
+def Vector_FMAOp :
+  Op<Vector_Dialect, "fma", [NoSideEffect,
+                             AllTypesMatch<["lhs", "rhs", "acc", "result"]>]>,
+    Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, AnyVector:$acc)>,
+    Results<(outs AnyVector:$result)> {
+  let summary = "vector fused multiply-add";
+  let description = [{
+    Multiply-add expressions operate on n-D vectors and compute a fused
+    pointwise multiply-and-accumulate: `$result = $lhs * $rhs + $acc`.
+    All operands and the result have the same vector type. The semantics
+    of the operation correspond to those of the `llvm.fma`
+    [intrinsic](https://llvm.org/docs/LangRef.html#int-fma). In the
+    particular case of lowering to LLVM, this is guaranteed to lower
+    to the `llvm.fma.*` intrinsic.
+
+    Example:
+
+    ```
+    %3 = vector.fma %0, %1, %2: vector<8x16xf32>
+    ```
+  }];
+  // Fully specified by traits.
+  let verifier = ?;
+  let assemblyFormat = "$lhs `,` $rhs `,` $acc attr-dict `:` type($lhs)";
+  let builders = [OpBuilder<
+    "Builder *b, OperationState &result, Value lhs, Value rhs, Value acc",
+    "build(b, result, lhs.getType(), lhs, rhs, acc);">];
+  let extraClassDeclaration = [{
+    VectorType getVectorType() { return lhs().getType().cast<VectorType>(); }
+  }];
+}
+
+def Vector_InsertElementOp :
+  Vector_Op<"insertelement", [NoSideEffect,
+     TypesMatchWith<"source operand type matches element type of result",
+                    "result", "source",
+                    "$_self.cast<VectorType>().getElementType()">,
+     AllTypesMatch<["dest", "result"]>]>,
+    Arguments<(ins AnyType:$source, AnyVector:$dest,
+                   AnySignlessInteger:$position)>,
+    Results<(outs AnyVector:$result)> {
+  let summary = "insertelement operation";
+  let description = [{
+    Takes a scalar source, a 1-D destination vector and a dynamic index
+    position and inserts the source into the destination at the proper
+    position. Note that this instruction resembles vector.insert, but
+    is restricted to 1-D vectors and relaxed to dynamic indices.
It is + meant to be closer to LLVM's version: + https://llvm.org/docs/LangRef.html#insertelement-instruction + + Example: + ``` + %c = constant 15 : i32 + %f = constant 0.0f : f32 + %1 = vector.insertelement %f, %0[%c : i32]: vector<16xf32> + ``` + }]; + let extraClassDeclaration = [{ + Type getSourceType() { return source().getType(); } + VectorType getDestVectorType() { + return dest().getType().cast(); + } + }]; + + let assemblyFormat = [{ + $source `,` $dest `[` $position `:` type($position) `]` attr-dict `:` + type($result) + }]; +} + +def Vector_InsertOp : + Vector_Op<"insert", [NoSideEffect, + PredOpTrait<"source operand and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>, + AllTypesMatch<["dest", "res"]>]>, + Arguments<(ins AnyType:$source, AnyVector:$dest, I64ArrayAttr:$position)>, + Results<(outs AnyVector:$res)> { + let summary = "insert operation"; + let description = [{ + Takes an n-D source vector, an (n+k)-D destination vector and a k-D position + and inserts the n-D source into the (n+k)-D destination at the proper + position. Degenerates to a scalar source type when n = 0. + + Examples: + ``` + %2 = vector.insert %0, %1[3]: + vector<8x16xf32> into vector<4x8x16xf32> + %5 = vector.insert %3, %4[3, 3, 3]: + f32 into vector<4x8x16xf32> + ``` + }]; + let assemblyFormat = [{ + $source `,` $dest $position attr-dict `:` type($source) `into` type($dest) + }]; + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value source, " # + "Value dest, ArrayRef">]; + let extraClassDeclaration = [{ + static StringRef getPositionAttrName() { return "position"; } + Type getSourceType() { return source().getType(); } + VectorType getDestVectorType() { + return dest().getType().cast(); + } + }]; +} + +def Vector_InsertSlicesOp : + Vector_Op<"insert_slices", [NoSideEffect]>, + Arguments<(ins TupleOf<[AnyVector]>:$vectors, I64ArrayAttr:$sizes, + I64ArrayAttr:$strides)>, + Results<(outs AnyVector)> { + let summary = "vector insert slices operation"; + let description = [{ + Takes a tuple of vector slices and inserts them into the vector result + according to the 'sizes' and 'strides' parameters. + + The arguments 'sizes' and 'strides' represent a specification for + generating the unrolling of 'vector' shape, which has all slices of shape + 'sizes' except for slices at dimension boundaries when 'vector' dimension + sizes are not a multiple of 'sizes'. + + Each slice in 'vectors' is at the tuple element index corresponding to the + linear index of the slice w.r.t the unrolling scheme represented by 'sizes'. + Currently, only unit strides are supported. + + Examples: + ``` + %0 = vector.extract_slices %0, [2, 2], [1, 1] + : vector<4x2xf32> into tuple, vector<2x2xf32>> + + %1 = vector.insert_slices %0, [2, 2], [1, 1] + : tuple, vector<2x2xf32>> into vector<4x2xf32> + + // Example with partial slices at dimension boundaries. 
+ %3 = vector.extract_slices %2, [2, 2], [1, 1] + : vector<4x3xf32> into tuple, vector<2x1xf32>, + vector<2x2xf32>, vector<2x1xf32>> + + %4 = vector.insert_slices %3, [2, 2], [1, 1] + : tuple, vector<2x1xf32>, + vector<2x2xf32>, vector<2x1xf32>> into vector<4x3xf32> + ``` + }]; + + let extraClassDeclaration = [{ + TupleType getSourceTupleType() { + return vectors().getType().cast(); + } + VectorType getResultVectorType() { + return getResult().getType().cast(); + } + void getSizes(SmallVectorImpl &results); + void getStrides(SmallVectorImpl &results); + static StringRef getSizesAttrName() { return "sizes"; } + static StringRef getStridesAttrName() { return "strides"; } + }]; + let assemblyFormat = [{ + $vectors `,` $sizes `,` $strides attr-dict `:` type($vectors) `into` + type(results) + }]; +} + +def Vector_InsertStridedSliceOp : + Vector_Op<"insert_strided_slice", [NoSideEffect, + PredOpTrait<"operand #0 and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>, + AllTypesMatch<["dest", "res"]>]>, + Arguments<(ins AnyVector:$source, AnyVector:$dest, I64ArrayAttr:$offsets, + I64ArrayAttr:$strides)>, + Results<(outs AnyVector:$res)> { + let summary = "strided_slice operation"; + let description = [{ + Takes a k-D source vector, an n-D destination vector (n >= k), n-sized + `offsets` integer array attribute, a k-sized `strides` integer array attribute + and inserts the k-D source vector as a strided subvector at the proper offset + into the n-D destination vector. + + At the moment strides must contain only 1s. + + Returns an n-D vector that is a copy of the n-D destination vector in which + the last k-D dimensions contain the k-D source vector elements strided at + the proper location as specified by the offsets. + + Examples: + ``` + %2 = vector.insert_strided_slice %0, %1 + {offsets = [0, 0, 2], strides = [1, 1]}: + vector<2x4xf32> into vector<16x4x8xf32> + ``` + }]; + + let assemblyFormat = [{ + $source `,` $dest attr-dict `:` type($source) `into` type($dest) + }]; + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value source, Value dest, " # + "ArrayRef offsets, ArrayRef strides">]; + let extraClassDeclaration = [{ + static StringRef getOffsetsAttrName() { return "offsets"; } + static StringRef getStridesAttrName() { return "strides"; } + VectorType getSourceVectorType() { + return source().getType().cast(); + } + VectorType getDestVectorType() { + return dest().getType().cast(); + } + }]; +} + +def Vector_OuterProductOp : + Vector_Op<"outerproduct", [NoSideEffect, SameOperandsAndResultElementType]>, + Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, Variadic:$acc)>, + Results<(outs AnyVector)> { + let summary = "vector outerproduct with optional fused add"; + let description = [{ + Takes 2 1-D vectors and returns the 2-D vector containing the outer-product. + + An optional extra 2-D vector argument may be specified in which case the + operation returns the sum of the outer-product and the extra vector. In this + multiply-accumulate scenario, the rounding mode is that obtained by + guaranteeing that a fused-multiply add operation is emitted. When lowered to + the LLVMIR dialect, this form emits `llvm.intr.fma`, which is guaranteed to + lower to actual `fma` instructions on x86. 
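As a rough intuition only (a sketch assuming a row-by-row view of the accumulating form; it is not the patch's normative lowering, and the value names are hypothetical), each result row behaves like a broadcast of one `lhs` element fused-multiplied against `rhs` and the matching `acc` row:

```mlir
// Hypothetical expansion of row 0 of
//   %r = vector.outerproduct %lhs, %rhs, %acc
//          : vector<4xf32>, vector<8xf32>, vector<4x8xf32>
%l0   = vector.extract %lhs[0] : vector<4xf32>      // scalar element of lhs
%b0   = vector.broadcast %l0 : f32 to vector<8xf32>
%acc0 = vector.extract %acc[0] : vector<4x8xf32>    // row 0 of the accumulator
%r0   = vector.fma %b0, %rhs, %acc0 : vector<8xf32>
```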
+ + Examples: + + ``` + %2 = vector.outerproduct %0, %1: vector<4xf32>, vector<8xf32> + return %2: vector<4x8xf32> + + %3 = vector.outerproduct %0, %1, %2: + vector<4xf32>, vector<8xf32>, vector<4x8xf32> + return %3: vector<4x8xf32> + ``` + }]; + let extraClassDeclaration = [{ + VectorType getOperandVectorTypeLHS() { + return lhs().getType().cast(); + } + VectorType getOperandVectorTypeRHS() { + return rhs().getType().cast(); + } + VectorType getOperandVectorTypeACC() { + return (llvm::size(acc()) == 0) ? VectorType() : + (*acc().begin()).getType().cast(); + } + VectorType getVectorType() { + return getResult().getType().cast(); + } + }]; +} + +// TODO(andydavis) Add transformation which decomposes ReshapeOp into an +// optimized sequence of vector rotate/shuffle/select operations. +def Vector_ReshapeOp : + Vector_Op<"reshape", [AttrSizedOperandSegments, NoSideEffect]>, + Arguments<(ins AnyVector:$vector, Variadic:$input_shape, + Variadic:$output_shape, + I64ArrayAttr:$fixed_vector_sizes)>, + Results<(outs AnyVector:$result)> { + let summary = "vector reshape operation"; + let description = [{ + Reshapes its vector operand from 'input_shape' to 'output_shape' maintaining + fixed vector dimension 'fixed_vector_sizes' on the innermost vector + dimensions. + + The parameters 'input_shape' and 'output_shape' represent valid data shapes + across fixed vector shapes. For example, if a vector has a valid data + shape [6] with fixed vector size [8], then the valid data elements are + assumed to be stored at the beginning of the vector with the remaining + vector elements undefined. + + In the examples below, valid data elements are represented by an alphabetic + character, and undefined data elements are represented by '-'. + + Example + + vector<1x8xf32> with valid data shape [6], fixed vector sizes [8] + + input: [a, b, c, d, e, f] + + layout map: (d0) -> (d0 floordiv 8, d0 mod 8) + + vector layout: [a, b, c, d, e, f, -, -] + + Example + + vector<2x8xf32> with valid data shape [10], fixed vector sizes [8] + + input: [a, b, c, d, e, f, g, h, i, j] + + layout map: (d0) -> (d0 floordiv 8, d0 mod 8) + + vector layout: [[a, b, c, d, e, f, g, h], + [i, j, -, -, -, -, -, -]] + + Example + + vector<2x2x2x3xf32> with valid data shape [3, 5], fixed vector sizes + [2, 3] + + input: [[a, b, c, d, e], + [f, g, h, i, j], + [k, l, m, n, o]] + + layout map: (d0, d1) -> (d0 floordiv 3, d1 floordiv 5, + d0 mod 3, d1 mod 5) + + vector layout: [[[[a, b, c], + [f, g, h]] + [[d, e, -], + [i, j, -]]], + [[[k, l, m], + [-, -, -]] + [[n, o, -], + [-, -, -]]]] + + Example + + %1 = vector.reshape %0, [%c3, %c6], [%c2, %c9], [4] + : vector<3x2x4xf32> to vector<2x3x4xf32> + + input: [[a, b, c, d, e, f], + [g, h, i, j, k, l], + [m, n, o, p, q, r]] + + layout map: (d0, d1) -> (d0, d1 floordiv 4, d1 mod 4) + + + Input vector: [[[a, b, c, d], + [e, f, -, -]], + [[g, h, i, j], + [k, l, -, -]], + [[m, n, o, p], + [q, r, -, -]]] + + Output vector: [[[a, b, c, d], + [e, f, g, h], + [i, -, -, -]], + [[j, k, l, m], + [n, o, p, q], + [r, -, -, -]]] + }]; + + let extraClassDeclaration = [{ + VectorType getInputVectorType() { + return vector().getType().cast(); + } + VectorType getOutputVectorType() { + return getResult().getType().cast(); + } + + /// Returns as integer value the number of input shape operands. + int64_t getNumInputShapeSizes() { return input_shape().size(); } + + /// Returns as integer value the number of output shape operands. 
+ int64_t getNumOutputShapeSizes() { return output_shape().size(); } + + void getFixedVectorSizes(SmallVectorImpl &results); + + static StringRef getFixedVectorSizesAttrName() { + return "fixed_vector_sizes"; + } + static StringRef getInputShapeAttrName() { return "input_shape"; } + static StringRef getOutputShapeAttrName() { return "output_shape"; } + }]; + + let assemblyFormat = [{ + $vector `,` `[` $input_shape `]` `,` `[` $output_shape `]` `,` + $fixed_vector_sizes attr-dict `:` type($vector) `to` type($result) + }]; +} + +def Vector_StridedSliceOp : + Vector_Op<"strided_slice", [NoSideEffect, + PredOpTrait<"operand and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>]>, + Arguments<(ins AnyVector:$vector, I64ArrayAttr:$offsets, + I64ArrayAttr:$sizes, I64ArrayAttr:$strides)>, + Results<(outs AnyVector)> { + let summary = "strided_slice operation"; + let description = [{ + Takes an n-D vector, k-D `offsets` integer array attribute, a k-sized + `sizes` integer array attribute, a k-sized `strides` integer array + attribute and extracts the n-D subvector at the proper offset. + + At the moment strides must contain only 1s. + // TODO(ntv) support non-1 strides. + + Returns an n-D vector where the first k-D dimensions match the `sizes` + attribute. The returned subvector contains the elements starting at offset + `offsets` and ending at `offsets + sizes`. + + Examples: + ``` + %1 = vector.strided_slice %0 + {offsets = [0, 2], sizes = [2, 4], strides = [1, 1]}: + vector<4x8x16xf32> to vector<2x4x16xf32> + ``` + + // TODO(ntv) Evolve to a range form syntax similar to: + %1 = vector.strided_slice %0[0:2:1][2:4:1] + vector<4x8x16xf32> to vector<2x4x16xf32> + }]; + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value source, " # + "ArrayRef offsets, ArrayRef sizes, " # + "ArrayRef strides">]; + let extraClassDeclaration = [{ + static StringRef getOffsetsAttrName() { return "offsets"; } + static StringRef getSizesAttrName() { return "sizes"; } + static StringRef getStridesAttrName() { return "strides"; } + VectorType getVectorType(){ return vector().getType().cast(); } + void getOffsets(SmallVectorImpl &results); + }]; + let hasCanonicalizer = 1; + let assemblyFormat = "$vector attr-dict `:` type($vector) `to` type(results)"; +} + +def Vector_TransferReadOp : + Vector_Op<"transfer_read">, + Arguments<(ins AnyMemRef:$memref, Variadic:$indices, + AffineMapAttr:$permutation_map, AnyType:$padding)>, + Results<(outs AnyVector:$vector)> { + + let summary = "Reads a supervector from memory into an SSA vector value."; + + let description = [{ + The `vector.transfer_read` op performs a blocking read from a slice within + a [MemRef](../LangRef.md#memref-type) supplied as its first operand + into a [vector](../LangRef.md#vector-type) of the same base elemental type. + + A memref operand with vector element type, must have its vector element + type match a suffix (shape and element type) of the vector (e.g. + memref<3x2x6x4x3xf32>, vector<1x1x4x3xf32>). + + The slice is further defined by a full-rank index within the MemRef, + supplied as the operands `2 .. 1 + rank(memref)`. The permutation_map + [attribute](../LangRef.md#attributes) is an + [affine-map](Affine.md#affine-maps) which specifies the transposition on the + slice to match the vector shape. The size of the slice is specified by the + size of the vector, given as the return type. 
An `ssa-value` of the same + elemental type as the MemRef is provided as the last operand to specify + padding in the case of out-of-bounds accesses. This operation is called + 'read' by opposition to 'load' because the super-vector granularity is + generally not representable with a single hardware register. + A `vector.transfer_read` is thus a mid-level + abstraction that supports super-vectorization with non-effecting padding for + full-tile-only code. + + More precisely, let's dive deeper into the permutation_map for the following + MLIR: + + ```mlir + vector.transfer_read %A[%expr1, %expr2, %expr3, %expr4] + { permutation_map : (d0,d1,d2,d3) -> (d2,0,d0) } : + memref, vector<3x4x5xf32> + ``` + + This operation always reads a slice starting at `%A[%expr1, %expr2, %expr3, + %expr4]`. The size of the slice is 3 along d2 and 5 along d0, so the slice + is: `%A[%expr1 : %expr1 + 5, %expr2, %expr3:%expr3 + 3, %expr4]` + + That slice needs to be read into a `vector<3x4x5xf32>`. Since the + permutation map is not full rank, there must be a broadcast along vector + dimension `1`. + + A notional lowering of vector.transfer_read could generate code resembling: + + ```mlir + // %expr1, %expr2, %expr3, %expr4 defined before this point + %tmp = alloc() : vector<3x4x5xf32> + %view_in_tmp = "element_type_cast"(%tmp) : memref<1xvector<3x4x5xf32>> + for %i = 0 to 3 { + affine.for %j = 0 to 4 { + affine.for %k = 0 to 5 { + %a = load %A[%expr1 + %k, %expr2, %expr3 + %i, %expr4] : + memref + store %tmp[%i, %j, %k] : vector<3x4x5xf32> + }}} + %c0 = constant 0 : index + %vec = load %view_in_tmp[%c0] : vector<3x4x5xf32> + ``` + + On a GPU one could then map `i`, `j`, `k` to blocks and threads. Notice that + the temporary storage footprint is `3 * 5` values but `3 * 4 * 5` values are + actually transferred between `%A` and `%tmp`. + + Alternatively, if a notional vector broadcast operation were available, the + lowered code would resemble: + + ```mlir + // %expr1, %expr2, %expr3, %expr4 defined before this point + %tmp = alloc() : vector<3x4x5xf32> + %view_in_tmp = "element_type_cast"(%tmp) : memref<1xvector<3x4x5xf32>> + for %i = 0 to 3 { + affine.for %k = 0 to 5 { + %a = load %A[%expr1 + %k, %expr2, %expr3 + %i, %expr4] : + memref + store %tmp[%i, 0, %k] : vector<3x4x5xf32> + }} + %c0 = constant 0 : index + %tmpvec = load %view_in_tmp[%c0] : vector<3x4x5xf32> + %vec = broadcast %tmpvec, 1 : vector<3x4x5xf32> + ``` + + where `broadcast` broadcasts from element 0 to all others along the + specified dimension. This time, the temporary storage footprint is `3 * 5` + values which is the same amount of data as the `3 * 5` values transferred. + An additional `1` broadcast is required. On a GPU this broadcast could be + implemented using a warp-shuffle if loop `j` were mapped to `threadIdx.x`. + + Syntax + ``` + operation ::= ssa-id `=` `vector.transfer_read` ssa-use-list + `{` attribute-entry `} :` memref-type `,` vector-type + ``` + + Examples: + + ```mlir + // Read the slice `%A[%i0, %i1:%i1+256, %i2:%i2+32]` into vector<32x256xf32> + // and pad with %f0 to handle the boundary case: + %f0 = constant 0.0f : f32 + for %i0 = 0 to %0 { + affine.for %i1 = 0 to %1 step 256 { + affine.for %i2 = 0 to %2 step 32 { + %v = vector.transfer_read %A[%i0, %i1, %i2], (%f0) + {permutation_map: (d0, d1, d2) -> (d2, d1)} : + memref, vector<32x256xf32> + }}} + + // Read the slice `%A[%i0, %i1]` (i.e. the element `%A[%i0, %i1]`) into + // vector<128xf32>. 
The underlying implementation will require a 1-D vector + // broadcast: + for %i0 = 0 to %0 { + affine.for %i1 = 0 to %1 { + %3 = vector.transfer_read %A[%i0, %i1] + {permutation_map: (d0, d1) -> (0)} : + memref, vector<128xf32> + } + } + + // Read from a memref with vector element type. + %4 = vector.transfer_read %arg1[%c3, %c3], %vf0 + {permutation_map = (d0, d1)->(d0, d1)} + : memref>, vector<1x1x4x3xf32> + ``` + }]; + + let extraClassDeclaration = [{ + MemRefType getMemRefType() { + return memref().getType().cast(); + } + VectorType getVectorType() { + return vector().getType().cast(); + } + }]; +} + +def Vector_TransferWriteOp : + Vector_Op<"transfer_write">, + Arguments<(ins AnyVector:$vector, AnyMemRef:$memref, + Variadic:$indices, + AffineMapAttr:$permutation_map)> { + + let summary = "The vector.transfer_write op writes a supervector to memory."; + + let description = [{ + The `vector.transfer_write` performs a blocking write from a + [vector](../LangRef.md#vector-type), supplied as its first operand, into a + slice within a [MemRef](../LangRef.md#memref-type) of the same base + elemental type, supplied as its second operand. + + A vector memref operand must have its vector element type match a suffix + (shape and element type) of the vector (e.g. memref<3x2x6x4x3xf32>, + vector<1x1x4x3xf32>). + + The slice is further defined by a full-rank index within the MemRef, + supplied as the operands `3 .. 2 + rank(memref)`. + The permutation_map [attribute](../LangRef.md#attributes) is an + [affine-map](Affine.md#affine-maps) which specifies the transposition on the + slice to match the vector shape. The size of the slice is specified by the + size of the vector. This operation is called 'write' by opposition to + 'store' because the super-vector granularity is generally not representable + with a single hardware register. A `vector.transfer_write` is thus a + mid-level abstraction that supports super-vectorization with non-effecting + padding for full-tile-only code. It is the responsibility of + `vector.transfer_write`'s implementation to ensure the memory writes are + valid. Different lowerings may be pertinent depending on the hardware + support. + + Syntax: + + ``` + operation ::= `vector.transfer_write` ssa-use-list `{` attribute-entry `} : + ` vector-type ', ' memref-type ' + ``` + + Examples: + + ```mlir + // write vector<16x32x64xf32> into the slice + // `%A[%i0, %i1:%i1+32, %i2:%i2+64, %i3:%i3+16]`: + for %i0 = 0 to %0 { + affine.for %i1 = 0 to %1 step 32 { + affine.for %i2 = 0 to %2 step 64 { + affine.for %i3 = 0 to %3 step 16 { + %val = `ssa-value` : vector<16x32x64xf32> + vector.transfer_write %val, %A[%i0, %i1, %i2, %i3] + {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d2)} : + vector<16x32x64xf32>, memref + }}}} + + // write to a memref with vector element type. 
+      vector.transfer_write %4, %arg1[%c3, %c3]
+        {permutation_map = (d0, d1)->(d0, d1)}
+          : vector<1x1x4x3xf32>, memref<?x?xvector<4x3xf32>>
+    ```
+  }];
+
+  let extraClassDeclaration = [{
+    VectorType getVectorType() {
+      return vector().getType().cast<VectorType>();
+    }
+    MemRefType getMemRefType() {
+      return memref().getType().cast<MemRefType>();
+    }
+  }];
+  let assemblyFormat = [{
+    $vector `,` $memref `[` $indices `]` attr-dict `:` type($vector) `,`
+    type($memref)
+  }];
+}
+
+def Vector_ShapeCastOp :
+  Vector_Op<"shape_cast", [NoSideEffect]>,
+    Arguments<(ins AnyTypeOf<[AnyVector, TupleOf<[AnyVector]>]>:$source)>,
+    Results<(outs AnyTypeOf<[AnyVector, TupleOf<[AnyVector]>]>:$result)> {
+  let summary = "shape_cast casts between vector shapes";
+  let description = [{
+    The shape_cast operation casts between an n-D source vector shape and
+    a k-D result vector shape (the element type remains the same).
+
+    If reducing rank (n > k), result dimension sizes must be a product
+    of contiguous source dimension sizes.
+    If expanding rank (n < k), source dimensions must factor into a
+    contiguous sequence of destination dimension sizes.
+    Each source dim is expanded (or a contiguous sequence of source dims is
+    combined) in source dimension list order (i.e. 0 <= i < n), to produce a
+    contiguous sequence of result dims (or a single result dim), in result
+    dimension list order (i.e. 0 <= j < k). The product of all source
+    dimension sizes and all result dimension sizes must match.
+
+    If the source/result types are a tuple of vectors, the casting operation
+    described above is applied to each source/result tuple element pair.
+
+    It is currently assumed that this operation does not require moving data,
+    and that it will be folded away before lowering vector operations.
+
+    There is an exception to the folding expectation when targeting
+    llvm.intr.matrix operations. We need a type conversion back and forth from
+    a 2-D MLIR vector to a 1-D flattened LLVM vector; in that particular case,
+    and only for now, shape_cast lowering to LLVM is supported.
+
+    Examples:
+
+    ```mlir
+    // Example casting to a lower vector rank.
+    %1 = vector.shape_cast %0 : vector<5x1x4x3xf32> to vector<20x3xf32>
+
+    // Example casting to a higher vector rank.
+    %3 = vector.shape_cast %2 : vector<10x12x8xf32> to vector<5x2x3x4x8xf32>
+
+    // Example casting a tuple of vectors of same rank, where tuple elements
+    // may have different shapes.
+    %5 = vector.shape_cast %4 : tuple<vector<3x4x2xf32>, vector<3x3x2xf32>> to
+                                tuple<vector<12x2xf32>, vector<9x2xf32>>
+    ```
+  }];
+  let extraClassDeclaration = [{
+    VectorType getSourceVectorType() {
+      return source().getType().cast<VectorType>();
+    }
+    VectorType getResultVectorType() {
+      return getResult().getType().cast<VectorType>();
+    }
+  }];
+  let assemblyFormat = "$source attr-dict `:` type($source) `to` type($result)";
+}
+
+def Vector_TypeCastOp :
+  Vector_Op<"type_cast", [NoSideEffect]>,
+    Arguments<(ins StaticShapeMemRefOf<[AnyType]>:$memref)>,
+    Results<(outs AnyMemRef)> {
+  let summary = "type_cast op converts a scalar memref to a vector memref";
+  let description = [{
+    Performs a conversion from a memref with scalar element to a memref with a
+    *single* vector element, copying the shape of the memref to the vector.
+    This is the minimal viable operation that is required to make
+    super-vectorization operational. It can be seen as a special case of the
+    `view` operation but scoped in the super-vectorization context.
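To make the effect concrete, a usage sketch with hypothetical value names; the whole-vector access at the end is shown only as a comment, since its exact form depends on the standard dialect's zero-dimensional load syntax:

```mlir
%A  = alloc() : memref<5x4x3xf32>
%VA = vector.type_cast %A : memref<5x4x3xf32> to memref<vector<5x4x3xf32>>
// %VA now holds a single vector<5x4x3xf32> element; loading through it (e.g.
// `load %VA[] : memref<vector<5x4x3xf32>>`) moves the whole super-vector at
// once instead of 5*4*3 scalar elements.
```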
+ + Syntax: + + ``` + operation ::= `vector.type_cast` ssa-use : memref-type to memref-type + ``` + + Example: + + ```mlir + %A = alloc() : memref<5x4x3xf32> + %VA = vector.type_cast %A : memref<5x4x3xf32> to memref> + ``` + }]; + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, Value source">]; + + let parser = [{ + return impl::parseCastOp(parser, result); + }]; + + let extraClassDeclaration = [{ + MemRefType getMemRefType() { + return memref().getType().cast(); + } + MemRefType getResultMemRefType() { + return getResult().getType().cast(); + } + }]; +} + +def Vector_ConstantMaskOp : + Vector_Op<"constant_mask", [NoSideEffect]>, + Arguments<(ins I64ArrayAttr:$mask_dim_sizes)>, + Results<(outs VectorOf<[I1]>)> { + let summary = "creates a constant vector mask"; + let description = [{ + Creates and returns a vector mask where elements of the result vector + are set to '0' or '1', based on whether the element indices are contained + within a hyper-rectangular region specified by the 'mask_dim_sizes' + array attribute argument. Each element of the 'mask_dim_sizes' array, + specifies an exclusive upper bound [0, mask-dim-size-element-value) + for a unique dimension in the vector result. The conjunction of the ranges + define a hyper-rectangular region within which elements values are set to 1 + (otherwise element values are set to 0). + + Example: create a constant vector mask of size 4x3xi1 with elements in range + 0 <= row <= 2 and 0 <= col <= 1 are set to 1 (others to 0). + + %1 = vector.constant_mask [3, 2] : vector<4x3xi1> + + print %1 + columns + 0 1 2 + |------------ + 0 | 1 1 0 + rows 1 | 1 1 0 + 2 | 1 1 0 + 3 | 0 0 0 + }]; + + let extraClassDeclaration = [{ + static StringRef getMaskDimSizesAttrName() { return "mask_dim_sizes"; } + }]; + let assemblyFormat = "$mask_dim_sizes attr-dict `:` type(results)"; +} + +def Vector_CreateMaskOp : + Vector_Op<"create_mask", [NoSideEffect]>, + Arguments<(ins Variadic:$operands)>, Results<(outs VectorOf<[I1]>)> { + let summary = "creates a vector mask"; + let description = [{ + Creates and returns a vector mask where elements of the result vector + are set to '0' or '1', based on whether the element indices are contained + within a hyper-rectangular region specified by the operands. Specifically, + each operand specifies a range [0, operand-value) for a unique dimension in + the vector result. The conjunction of the operand ranges define a + hyper-rectangular region within which elements values are set to 1 + (otherwise element values are set to 0). + + Example: create a vector mask of size 4x3xi1 where elements in range + 0 <= row <= 2 and 0 <= col <= 1 are set to 1 (others to 0). + + %1 = vector.create_mask %c3, %c2 : vector<4x3xi1> + + print %1 + columns + 0 1 2 + |------------ + 0 | 1 1 0 + rows 1 | 1 1 0 + 2 | 1 1 0 + 3 | 0 0 0 + }]; + + let hasCanonicalizer = 1; + let assemblyFormat = "$operands attr-dict `:` type(results)"; +} + +def Vector_TupleOp : + Vector_Op<"tuple", [NoSideEffect]>, + Arguments<(ins Variadic:$vectors)>, + Results<(outs TupleOf<[AnyVector]>)> { + let summary = "make tuple of vectors operation"; + let description = [{ + Returns a tuple of its operands 'vectors'. + + Note that this operation is used during the vector op unrolling + transformation and should be removed before lowering to lower-level + dialects. + + + Examples: + ``` + %0 = vector.transfer_read ... : vector<2x2xf32> + %1 = vector.transfer_read ... : vector<2x1xf32> + %2 = vector.transfer_read ... 
: vector<2x2xf32> + %3 = vector.transfer_read ... : vector<2x1xf32> + + %4 = vector.tuple %0, %1, %2, %3 + : vector<2x2xf32>, vector<2x1xf32>, vector<2x2xf32>, vector<2x1xf32> + + ``` + }]; + + let extraClassDeclaration = [{ + TupleType getResultTupleType() { + return getResult().getType().cast(); + } + }]; +} + +def Vector_TupleGetOp : + Vector_Op<"tuple_get", [NoSideEffect]>, + Arguments<(ins TupleOf<[AnyVector]>:$vectors, APIntAttr:$index)>, + Results<(outs AnyVector)> { + let summary = "vector tuple get operation"; + let description = [{ + Returns the tuple element of 'vectors' at 'index'. + + Note that this operation is used during the vector op unrolling + transformation and should be removed before lowering to lower-level + dialects. + + Examples: + ``` + %4 = vector.tuple %0, %1, %2, %3 + : vector<2x2xf32>, vector<2x1xf32>, vector<2x2xf32>, vector<2x1xf32>> + + %5 = vector.tuple_get %4, 1 + : tuple, vector<2x1xf32>, + vector<2x2xf32>, vector<2x1xf32>> + ``` + }]; + + let extraClassDeclaration = [{ + VectorType getResultVectorType() { + return getResult().getType().cast(); + } + int64_t getIndex() { + return getAttrOfType("index").getValue().getSExtValue(); + } + static StringRef getIndexAttrName() { return "index"; } + }]; + let hasFolder = 1; +} + +def Vector_PrintOp : + Vector_Op<"print", []>, Arguments<(ins AnyType:$source)> { + let summary = "print operation (for testing and debugging)"; + let description = [{ + Prints the source vector (or scalar) to stdout in human readable + format (for testing and debugging). No return value. + + Examples: + ``` + %0 = constant 0.0 : f32 + %1 = vector.broadcast %0 : f32 to vector<4xf32> + vector.print %1 : vector<4xf32> + + when lowered to LLVM, the vector print is unrolled into + elementary printing method calls that at runtime will yield + + ( 0.0, 0.0, 0.0, 0.0 ) + + on stdout when linked with a small runtime support library, + which only needs to provide a few printing methods (single + value for all data types, opening/closing bracket, comma, + newline). + ``` + }]; + let verifier = ?; + let extraClassDeclaration = [{ + Type getPrintType() { + return source().getType(); + } + }]; + let assemblyFormat = "$source attr-dict `:` type($source)"; +} + +//===----------------------------------------------------------------------===// +// Ops used for supporting progressive lowering and conversion type changes. +//===----------------------------------------------------------------------===// + +/// Vector dialect matrix multiplication op that operates on flattened 1-D +/// MLIR vectors. This is the counterpart of llvm.matrix.multiply in MLIR. +/// This may seem redundant with vector.contract but it serves the purposes of +/// more progressive lowering and localized type conversion on the path: +/// `vector<...x...xf32> -> vector<...xf32> -> !llvm<... x float>`. +def Vector_MatmulOp : Vector_Op<"matrix_multiply", [NoSideEffect, + PredOpTrait<"lhs operand and result have same element type", + TCresVTEtIsSameAsOpBase<0, 0>>, + PredOpTrait<"rhs operand and result have same element type", + TCresVTEtIsSameAsOpBase<0, 1>>]>, + Arguments<( + // TODO(ntv, fhahn): tighten vector element types that make sense. 
+    ins VectorOfRankAndType<[1],
+          [AnySignlessInteger, AnySignedInteger, AnyFloat]>:$lhs,
+        VectorOfRankAndType<[1],
+          [AnySignlessInteger, AnySignedInteger, AnyFloat]>:$rhs,
+        I32Attr:$lhs_rows, I32Attr:$lhs_columns, I32Attr:$rhs_columns)>,
+  Results<(
+    outs VectorOfRankAndType<[1],
+           [AnySignlessInteger, AnySignedInteger, AnyFloat]>:$res)>
+{
+  let summary = "Vector matrix multiplication op that operates on flattened 1-D"
+    " MLIR vectors";
+  let description = [{
+    This is the counterpart of llvm.matrix.multiply in MLIR. It serves the
+    purposes of more progressive lowering and localized type conversion.
+
+    The `vector.matrix_multiply` op treats `lhs` as a matrix with <lhs_rows>
+    rows and <lhs_columns> columns, treats `rhs` as a matrix with <lhs_columns>
+    rows and <rhs_columns> columns, and multiplies them. The result matrix is
+    returned embedded in the result vector.
+
+    Example:
+
+    ```
+    %C = vector.matrix_multiply %A, %B
+      { lhs_rows = 4: i32, lhs_columns = 16: i32 , rhs_columns = 3: i32 } :
+      (vector<64xf64>, vector<48xf64>) -> vector<12xf64>
+    ```
+  }];
+  let builders = [
+   OpBuilder<"Builder *builder, OperationState &result, Value lhs, Value rhs, "
+             "unsigned lhsRows, unsigned lhsColumns, unsigned rhsColumns",
+   [{
+     result.addOperands({lhs, rhs});
+     result.addAttribute("lhs_rows", builder->getI32IntegerAttr(lhsRows));
+     result.addAttribute("lhs_columns", builder->getI32IntegerAttr(lhsColumns));
+     result.addAttribute("rhs_columns", builder->getI32IntegerAttr(rhsColumns));
+     result.addTypes(VectorType::get(lhsRows * rhsColumns,
+       lhs.getType().cast<VectorType>().getElementType()));
+   }]>,
+  ];
+  let verifier = ?;
+  let assemblyFormat = "$lhs `,` $rhs attr-dict "
+    "`:` `(` type($lhs) `,` type($rhs) `)` `->` type($res)";
+}
+
+#endif // VECTOR_OPS
diff --git a/mlir/include/mlir/IR/Builders.h b/mlir/include/mlir/IR/Builders.h
index 3817b35ea7cf1..5b42132d463ab 100644
--- a/mlir/include/mlir/IR/Builders.h
+++ b/mlir/include/mlir/IR/Builders.h
@@ -116,9 +116,16 @@ class Builder {
   IntegerAttr getSI32IntegerAttr(int32_t value);
   IntegerAttr getUI32IntegerAttr(uint32_t value);
 
+  /// Vector-typed DenseIntElementsAttr getters. `values` must not be empty.
   DenseIntElementsAttr getI32VectorAttr(ArrayRef<int32_t> values);
   DenseIntElementsAttr getI64VectorAttr(ArrayRef<int64_t> values);
 
+  /// Tensor-typed DenseIntElementsAttr getters. `values` can be empty.
+  /// These are generally preferable for representing general lists of integers
+  /// as attributes.
+  DenseIntElementsAttr getI32TensorAttr(ArrayRef<int32_t> values);
+  DenseIntElementsAttr getI64TensorAttr(ArrayRef<int64_t> values);
+
   ArrayAttr getAffineMapArrayAttr(ArrayRef<AffineMap> values);
   ArrayAttr getI32ArrayAttr(ArrayRef<int32_t> values);
   ArrayAttr getI64ArrayAttr(ArrayRef<int64_t> values);
diff --git a/mlir/include/mlir/IR/Dialect.h b/mlir/include/mlir/IR/Dialect.h
index 2f17ad002b30d..f650602381719 100644
--- a/mlir/include/mlir/IR/Dialect.h
+++ b/mlir/include/mlir/IR/Dialect.h
@@ -28,6 +28,7 @@ using DialectConstantFoldHook = std::function<LogicalResult(
     Operation *, ArrayRef<Attribute>, SmallVectorImpl<Attribute> &)>;
 using DialectExtractElementHook =
     std::function<Attribute(const OpaqueElementsAttr, ArrayRef<uint64_t>)>;
+using DialectAllocatorFunction = std::function<void(MLIRContext *)>;
 
 /// Dialects are groups of MLIR operations and behavior associated with the
 /// entire group. For example, hooks into other systems for constant folding,
@@ -241,24 +242,30 @@ class Dialect {
   /// A collection of registered dialect interfaces.
   DenseMap<ClassID *, std::unique_ptr<DialectInterface>> registeredInterfaces;
-};
-
-using DialectAllocatorFunction = std::function<void(MLIRContext *)>;
-
-/// Registers a specific dialect creation function with the system, typically
-/// used through the DialectRegistration template.
-void registerDialectAllocator(const DialectAllocatorFunction &function); -/// Registers all dialects with the specified MLIRContext. + /// Registers a specific dialect creation function with the global registry. + /// Used through the registerDialect template. + /// Registrations are deduplicated by dialect ClassID and only the first + /// registration will be used. + static void + registerDialectAllocator(const ClassID *classId, + const DialectAllocatorFunction &function); + template + friend void registerDialect(); +}; +/// Registers all dialects and hooks from the global registries with the +/// specified MLIRContext. void registerAllDialects(MLIRContext *context); /// Utility to register a dialect. Client can register their dialect with the /// global registry by calling registerDialect(); template void registerDialect() { - registerDialectAllocator([](MLIRContext *ctx) { - // Just allocate the dialect, the context takes ownership of it. - new ConcreteDialect(ctx); - }); + Dialect::registerDialectAllocator(ClassID::getID(), + [](MLIRContext *ctx) { + // Just allocate the dialect, the context + // takes ownership of it. + new ConcreteDialect(ctx); + }); } /// DialectRegistration provides a global initializer that registers a Dialect diff --git a/mlir/include/mlir/IR/DialectHooks.h b/mlir/include/mlir/IR/DialectHooks.h index 2dce1c2b203a5..4e59b4953e656 100644 --- a/mlir/include/mlir/IR/DialectHooks.h +++ b/mlir/include/mlir/IR/DialectHooks.h @@ -35,36 +35,53 @@ class DialectHooks { DialectConstantDecodeHook getDecodeHook() { return nullptr; } // Returns hook to extract an element of an opaque constant tensor. DialectExtractElementHook getExtractElementHook() { return nullptr; } + +private: + /// Registers a function that will set hooks in the registered dialects. + /// Registrations are deduplicated by dialect ClassID and only the first + /// registration will be used. + static void registerDialectHooksSetter(const ClassID *classId, + const DialectHooksSetter &function); + template + friend void registerDialectHooks(StringRef dialectName); }; -/// Registers a function that will set hooks in the registered dialects -/// based on information coming from DialectHooksRegistration. -void registerDialectHooksSetter(const DialectHooksSetter &function); +void registerDialectHooksSetter(const ClassID *classId, + const DialectHooksSetter &function); + +/// Utility to register dialect hooks. Client can register their dialect hooks +/// with the global registry by calling +/// registerDialectHooks("dialect_namespace"); +template +void registerDialectHooks(StringRef dialectName) { + DialectHooks::registerDialectHooksSetter( + ClassID::getID(), [dialectName](MLIRContext *ctx) { + Dialect *dialect = ctx->getRegisteredDialect(dialectName); + if (!dialect) { + llvm::errs() << "error: cannot register hooks for unknown dialect '" + << dialectName << "'\n"; + abort(); + } + // Set hooks. + ConcreteHooks hooks; + if (auto h = hooks.getConstantFoldHook()) + dialect->constantFoldHook = h; + if (auto h = hooks.getDecodeHook()) + dialect->decodeHook = h; + if (auto h = hooks.getExtractElementHook()) + dialect->extractElementHook = h; + }); +} /// DialectHooksRegistration provides a global initializer that registers /// a dialect hooks setter routine. /// Usage: /// /// // At namespace scope. 
-/// static DialectHooksRegistration unused; +/// static DialectHooksRegistration Unused("dialect_namespace"); template struct DialectHooksRegistration { DialectHooksRegistration(StringRef dialectName) { - registerDialectHooksSetter([dialectName](MLIRContext *ctx) { - Dialect *dialect = ctx->getRegisteredDialect(dialectName); - if (!dialect) { - llvm::errs() << "error: cannot register hooks for unknown dialect '" - << dialectName << "'\n"; - abort(); - } - // Set hooks. - ConcreteHooks hooks; - if (auto h = hooks.getConstantFoldHook()) - dialect->constantFoldHook = h; - if (auto h = hooks.getDecodeHook()) - dialect->decodeHook = h; - if (auto h = hooks.getExtractElementHook()) - dialect->extractElementHook = h; - }); + registerDialectHooks(dialectName); } }; diff --git a/mlir/include/mlir/IR/OpBase.td b/mlir/include/mlir/IR/OpBase.td index d76c9d5476ef8..845fa93878a18 100644 --- a/mlir/include/mlir/IR/OpBase.td +++ b/mlir/include/mlir/IR/OpBase.td @@ -1386,6 +1386,9 @@ def SymbolRefArrayAttr : // DerivedAttr are attributes whose value is computed from properties // of the operation. They do not require additional storage and are // materialized as needed. +// Note: All derived attributes should be materializable as an Attribute. E.g., +// do not use DerivedAttr for things that could not have been stored as +// Attribute. class DerivedAttr : Attr, "derived attribute"> { let returnType = ret; code body = b; diff --git a/mlir/include/mlir/IR/OpImplementation.h b/mlir/include/mlir/IR/OpImplementation.h index dbd7b0b0b9828..f471e6ca0cc58 100644 --- a/mlir/include/mlir/IR/OpImplementation.h +++ b/mlir/include/mlir/IR/OpImplementation.h @@ -37,6 +37,7 @@ class OpAsmPrinter { /// Print implementations for various things an operation contains. virtual void printOperand(Value value) = 0; + virtual void printOperand(Value value, raw_ostream &os) = 0; /// Print a comma separated list of operands. template @@ -245,6 +246,24 @@ class OpAsmParser { return success(); } + /// Return the name of the specified result in the specified syntax, as well + /// as the sub-element in the name. It returns an empty string and ~0U for + /// invalid result numbers. For example, in this operation: + /// + /// %x, %y:2, %z = foo.op + /// + /// getResultName(0) == {"x", 0 } + /// getResultName(1) == {"y", 0 } + /// getResultName(2) == {"y", 1 } + /// getResultName(3) == {"z", 0 } + /// getResultName(4) == {"", ~0U } + virtual std::pair + getResultName(unsigned resultNo) const = 0; + + /// Return the number of declared SSA results. This returns 4 for the foo.op + /// example in the comment for `getResultName`. + virtual size_t getNumResults() const = 0; + /// Return the location of the original name token. 
virtual llvm::SMLoc getNameLoc() const = 0; @@ -510,8 +529,9 @@ class OpAsmParser { ArrayRef(type), loc, result); } template - ParseResult resolveOperands(Operands &&operands, Types &&types, - llvm::SMLoc loc, SmallVectorImpl &result) { + std::enable_if_t::value, ParseResult> + resolveOperands(Operands &&operands, Types &&types, llvm::SMLoc loc, + SmallVectorImpl &result) { size_t operandSize = std::distance(operands.begin(), operands.end()); size_t typeSize = std::distance(types.begin(), types.end()); if (operandSize != typeSize) diff --git a/mlir/include/mlir/IR/StandardTypes.h b/mlir/include/mlir/IR/StandardTypes.h index d1c31acb0a51a..1c18e21006dc4 100644 --- a/mlir/include/mlir/IR/StandardTypes.h +++ b/mlir/include/mlir/IR/StandardTypes.h @@ -342,8 +342,8 @@ class TensorType : public ShapedType { }; /// Ranked tensor types represent multi-dimensional arrays that have a shape -/// with a fixed number of dimensions. Each shape element can be a positive -/// integer or unknown (represented -1). +/// with a fixed number of dimensions. Each shape element can be a non-negative +/// integer or unknown (represented by -1). class RankedTensorType : public Type::TypeBase { @@ -416,8 +416,8 @@ class BaseMemRefType : public ShapedType { /// MemRef types represent a region of memory that have a shape with a fixed /// number of dimensions. Each shape element can be a non-negative integer or -/// unknown (represented by any negative integer). MemRef types also have an -/// affine map composition, represented as an array AffineMap pointers. +/// unknown (represented by -1). MemRef types also have an affine map +/// composition, represented as an array AffineMap pointers. class MemRefType : public Type::TypeBase { public: diff --git a/mlir/include/mlir/IR/Types.h b/mlir/include/mlir/IR/Types.h index eccc90cdae0c6..e45fa9037470a 100644 --- a/mlir/include/mlir/IR/Types.h +++ b/mlir/include/mlir/IR/Types.h @@ -169,8 +169,11 @@ class Type { /// Return true of this is a signless integer or a float type. bool isSignlessIntOrFloat(); - /// Return true of this is an integer(of any signedness) or a float type. + /// Return true if this is an integer (of any signedness) or a float type. bool isIntOrFloat(); + /// Return true if this is an integer (of any signedness), index, or float + /// type. + bool isIntOrIndexOrFloat(); /// Print the current type. void print(raw_ostream &os); diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h index c0a7ca04081f8..aba4556e4bc1f 100644 --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -14,9 +14,11 @@ #ifndef MLIR_INITALLDIALECTS_H_ #define MLIR_INITALLDIALECTS_H_ -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/AVX512/AVX512Dialect.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/FxpMathOps/FxpMathOps.h" #include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" @@ -37,9 +39,11 @@ namespace mlir { // all the possible dialects to be made available to the context automatically. 
inline void registerAllDialects() { static bool init_once = []() { - registerDialect(); + registerDialect(); + registerDialect(); registerDialect(); registerDialect(); + registerDialect(); registerDialect(); registerDialect(); registerDialect(); diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h index b358cfa8802ed..b983ecb478ddb 100644 --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -14,7 +14,7 @@ #ifndef MLIR_INITALLPASSES_H_ #define MLIR_INITALLPASSES_H_ -#include "mlir/Analysis/Passes.h" +#include "mlir/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.h" #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" @@ -24,6 +24,8 @@ #include "mlir/Conversion/LinalgToSPIRV/LinalgToSPIRVPass.h" #include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h" #include "mlir/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.h" +#include "mlir/Dialect/Affine/Passes.h" +#include "mlir/Dialect/FxpMathOps/Passes.h" #include "mlir/Dialect/FxpMathOps/Passes.h" #include "mlir/Dialect/GPU/Passes.h" #include "mlir/Dialect/LLVMIR/Transforms/LegalizeForExport.h" @@ -59,7 +61,7 @@ inline void registerAllPasses() { // Init general passes createCanonicalizerPass(); createCSEPass(); - createVectorizePass({}); + createSuperVectorizePass({}); createLoopUnrollPass(); createLoopUnrollAndJamPass(); createSimplifyAffineStructuresPass(); @@ -78,6 +80,9 @@ inline void registerAllPasses() { createSymbolDCEPass(); createLocationSnapshotPass({}); + // AVX512 + createConvertAVX512ToLLVMPass(); + // GPUtoRODCLPass createLowerGpuOpsToROCDLOpsPass(); @@ -92,10 +97,6 @@ inline void registerAllPasses() { // CUDA createConvertGpuLaunchFuncToCudaCallsPass(); -#if MLIR_CUDA_CONVERSIONS_ENABLED - createConvertGPUKernelToCubinPass( - [](const std::string &, Location, StringRef) { return nullptr; }); -#endif createLowerGpuOpsToNVVMOpsPass(); // Linalg diff --git a/mlir/include/mlir/TableGen/Operator.h b/mlir/include/mlir/TableGen/Operator.h index 08af550499a73..aaf93d3964e88 100644 --- a/mlir/include/mlir/TableGen/Operator.h +++ b/mlir/include/mlir/TableGen/Operator.h @@ -198,6 +198,10 @@ class Operator { bool hasSummary() const; StringRef getSummary() const; + // Query functions for the assembly format of the operator. + bool hasAssemblyFormat() const; + StringRef getAssemblyFormat() const; + // Returns this op's extra class declaration code. StringRef getExtraClassDeclaration() const; diff --git a/mlir/include/mlir/Transforms/FoldUtils.h b/mlir/include/mlir/Transforms/FoldUtils.h index 83ce3bf0d072e..0bab87c5e4e32 100644 --- a/mlir/include/mlir/Transforms/FoldUtils.h +++ b/mlir/include/mlir/Transforms/FoldUtils.h @@ -75,11 +75,20 @@ class OperationFolder { template void create(OpBuilder &builder, SmallVectorImpl &results, Location location, Args &&... args) { - Operation *op = builder.create(location, std::forward(args)...); - if (failed(tryToFold(op, results))) + // The op needs to be inserted only if the fold (below) fails, or the number + // of results of the op is zero (which is treated as an in-place + // fold). Using create methods of the builder will insert the op, so not + // using it here. 
+ OperationState state(location, OpTy::getOperationName()); + OpTy::build(&builder, state, std::forward(args)...); + Operation *op = Operation::create(state); + + if (failed(tryToFold(builder, op, results)) || op->getNumResults() == 0) { + builder.insert(op); results.assign(op->result_begin(), op->result_end()); - else if (op->getNumResults() != 0) - op->erase(); + return; + } + op->destroy(); } /// Overload to create or fold a single result operation. @@ -120,7 +129,7 @@ class OperationFolder { /// Tries to perform folding on the given `op`. If successful, populates /// `results` with the results of the folding. LogicalResult tryToFold( - Operation *op, SmallVectorImpl &results, + OpBuilder &builder, Operation *op, SmallVectorImpl &results, function_ref processGeneratedConstants = nullptr); /// Try to get or create a new constant entry. On success this returns the diff --git a/mlir/include/mlir/Transforms/Passes.h b/mlir/include/mlir/Transforms/Passes.h index 8b7495ec0e581..28b29eb29ad49 100644 --- a/mlir/include/mlir/Transforms/Passes.h +++ b/mlir/include/mlir/Transforms/Passes.h @@ -32,32 +32,6 @@ std::unique_ptr createCanonicalizerPass(); /// Creates a pass to perform common sub expression elimination. std::unique_ptr createCSEPass(); -/// Creates a pass to vectorize loops, operations and data types using a -/// target-independent, n-D super-vector abstraction. -std::unique_ptr> -createVectorizePass(ArrayRef virtualVectorSize); - -/// Creates a loop unrolling pass with the provided parameters. -/// 'getUnrollFactor' is a function callback for clients to supply a function -/// that computes an unroll factor - the callback takes precedence over unroll -/// factors supplied through other means. If -1 is passed as the unrollFactor -/// and no callback is provided, anything passed from the command-line (if at -/// all) or the default unroll factor is used (LoopUnroll:kDefaultUnrollFactor). -std::unique_ptr> createLoopUnrollPass( - int unrollFactor = -1, int unrollFull = -1, - const std::function &getUnrollFactor = nullptr); - -/// Creates a loop unroll jam pass to unroll jam by the specified factor. A -/// factor of -1 lets the pass use the default factor or the one on the command -/// line if provided. -std::unique_ptr> -createLoopUnrollAndJamPass(int unrollJamFactor = -1); - -/// Creates a simplification pass for affine structures (maps and sets). In -/// addition, this pass also normalizes memrefs to have the trivial (identity) -/// layout map. -std::unique_ptr> createSimplifyAffineStructuresPass(); - /// Creates a loop fusion pass which fuses loops. Buffers of size less than or /// equal to `localBufSizeThreshold` are promoted to memory space /// `fastMemorySpace'. @@ -70,10 +44,6 @@ createLoopFusionPass(unsigned fastMemorySpace = 0, /// instructions out of the loop. std::unique_ptr createLoopInvariantCodeMotionPass(); -/// Creates a loop invariant code motion pass that hoists loop invariant -/// instructions out of affine loop. -std::unique_ptr> createAffineLoopInvariantCodeMotionPass(); - /// Creates a pass to pipeline explicit movement of data across levels of the /// memory hierarchy. std::unique_ptr> createPipelineDataTransferPass(); @@ -83,22 +53,10 @@ std::unique_ptr> createPipelineDataTransferPass(); /// primitives). std::unique_ptr> createLowerAffinePass(); -/// Creates a pass to perform tiling on loop nests. 
-std::unique_ptr> -createLoopTilingPass(uint64_t cacheSizeBytes); - /// Creates a pass that transforms perfectly nested loops with independent /// bounds into a single loop. std::unique_ptr> createLoopCoalescingPass(); -/// Performs packing (or explicit copying) of accessed memref regions into -/// buffers in the specified faster memory space through either pointwise copies -/// or DMA operations. -std::unique_ptr> createAffineDataCopyGenerationPass( - unsigned slowMemorySpace, unsigned fastMemorySpace, - unsigned tagMemorySpace = 0, int minDmaTransferSize = 1024, - uint64_t fastMemCapacityBytes = std::numeric_limits::max()); - /// Creates a pass to perform optimizations relying on memref dataflow such as /// store to load forwarding, elimination of dead stores, and dead allocs. std::unique_ptr> createMemRefDataFlowOptPass(); diff --git a/mlir/lib/Analysis/AffineAnalysis.cpp b/mlir/lib/Analysis/AffineAnalysis.cpp index 6f1a21d0e5d08..56a0d72bb387c 100644 --- a/mlir/lib/Analysis/AffineAnalysis.cpp +++ b/mlir/lib/Analysis/AffineAnalysis.cpp @@ -14,8 +14,8 @@ #include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/AffineStructures.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" -#include "mlir/Dialect/AffineOps/AffineValueMap.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineValueMap.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/Builders.h" diff --git a/mlir/lib/Analysis/AffineStructures.cpp b/mlir/lib/Analysis/AffineStructures.cpp index 0b75767d62102..6ebc673c3100b 100644 --- a/mlir/lib/Analysis/AffineStructures.cpp +++ b/mlir/lib/Analysis/AffineStructures.cpp @@ -11,8 +11,8 @@ //===----------------------------------------------------------------------===// #include "mlir/Analysis/AffineStructures.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" -#include "mlir/Dialect/AffineOps/AffineValueMap.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineValueMap.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/AffineExprVisitor.h" #include "mlir/IR/IntegerSet.h" @@ -1201,17 +1201,30 @@ static bool detectAsMod(const FlatAffineConstraints &cst, unsigned pos, return false; } -/// Gather all lower and upper bounds of the identifier at `pos`. +/// Gather all lower and upper bounds of the identifier at `pos`. The bounds are +/// to be independent of [offset, offset + num) identifiers. static void getLowerAndUpperBoundIndices(const FlatAffineConstraints &cst, unsigned pos, SmallVectorImpl *lbIndices, - SmallVectorImpl *ubIndices) { + SmallVectorImpl *ubIndices, + unsigned offset = 0, + unsigned num = 0) { assert(pos < cst.getNumIds() && "invalid position"); // Gather all lower bounds and upper bounds of the variable. Since the // canonical form c_1*x_1 + c_2*x_2 + ... + c_0 >= 0, a constraint is a lower // bound for x_i if c_i >= 1, and an upper bound if c_i <= -1. for (unsigned r = 0, e = cst.getNumInequalities(); r < e; r++) { + // The bounds are to be independent of [offset, offset + num) columns. + unsigned c, f; + for (c = offset, f = offset + num; c < f; ++c) { + if (c == pos) + continue; + if (cst.atIneq(r, c) != 0) + break; + } + if (c < f) + continue; if (cst.atIneq(r, pos) >= 1) { // Lower bound. lbIndices->push_back(r); @@ -1866,7 +1879,8 @@ void FlatAffineConstraints::removeEquality(unsigned pos) { /// Finds an equality that equates the specified identifier to a constant. 
/// Returns the position of the equality row. If 'symbolic' is set to true, /// symbols are also treated like a constant, i.e., an affine function of the -/// symbols is also treated like a constant. +/// symbols is also treated like a constant. Returns -1 if such an equality +/// could not be found. static int findEqualityToConstant(const FlatAffineConstraints &cst, unsigned pos, bool symbolic = false) { assert(pos < cst.getNumIds() && "invalid position"); @@ -1937,19 +1951,15 @@ void FlatAffineConstraints::constantFoldIdRange(unsigned pos, unsigned num) { // s0 - 7 <= 8*j <= s0 returns 1 with lb = s0, lbDivisor = 8 (since lb = // ceil(s0 - 7 / 8) = floor(s0 / 8)). Optional FlatAffineConstraints::getConstantBoundOnDimSize( - unsigned pos, SmallVectorImpl *lb, int64_t *lbFloorDivisor, + unsigned pos, SmallVectorImpl *lb, int64_t *boundFloorDivisor, SmallVectorImpl *ub) const { assert(pos < getNumDimIds() && "Invalid identifier position"); assert(getNumLocalIds() == 0); - // TODO(bondhugula): eliminate all remaining dimensional identifiers (other - // than the one at 'pos' to make this more powerful. Not needed for - // hyper-rectangular spaces. - // Find an equality for 'pos'^th identifier that equates it to some function // of the symbolic identifiers (+ constant). - int eqRow = findEqualityToConstant(*this, pos, /*symbolic=*/true); - if (eqRow != -1) { + int eqPos = findEqualityToConstant(*this, pos, /*symbolic=*/true); + if (eqPos != -1) { // This identifier can only take a single value. if (lb) { // Set lb to that symbolic value. @@ -1957,18 +1967,18 @@ Optional FlatAffineConstraints::getConstantBoundOnDimSize( if (ub) ub->resize(getNumSymbolIds() + 1); for (unsigned c = 0, f = getNumSymbolIds() + 1; c < f; c++) { - int64_t v = atEq(eqRow, pos); + int64_t v = atEq(eqPos, pos); // atEq(eqRow, pos) is either -1 or 1. assert(v * v == 1); - (*lb)[c] = v < 0 ? atEq(eqRow, getNumDimIds() + c) / -v - : -atEq(eqRow, getNumDimIds() + c) / v; + (*lb)[c] = v < 0 ? atEq(eqPos, getNumDimIds() + c) / -v + : -atEq(eqPos, getNumDimIds() + c) / v; // Since this is an equality, ub = lb. if (ub) (*ub)[c] = (*lb)[c]; } - assert(lbFloorDivisor && + assert(boundFloorDivisor && "both lb and divisor or none should be provided"); - *lbFloorDivisor = 1; + *boundFloorDivisor = 1; } return 1; } @@ -1990,25 +2000,9 @@ Optional FlatAffineConstraints::getConstantBoundOnDimSize( // the bounds can only involve symbolic (and local) identifiers. Since the // canonical form c_1*x_1 + c_2*x_2 + ... + c_0 >= 0, a constraint is a lower // bound for x_i if c_i >= 1, and an upper bound if c_i <= -1. - for (unsigned r = 0, e = getNumInequalities(); r < e; r++) { - unsigned c, f; - for (c = 0, f = getNumDimIds(); c < f; c++) { - if (c != pos && atIneq(r, c) != 0) - break; - } - if (c < getNumDimIds()) - // Not a pure symbolic bound. - continue; - if (atIneq(r, pos) >= 1) - // Lower bound. - lbIndices.push_back(r); - else if (atIneq(r, pos) <= -1) - // Upper bound. - ubIndices.push_back(r); - } - - // TODO(bondhugula): eliminate other dimensional identifiers to make this more - // powerful. Not needed for hyper-rectangular iteration spaces. + getLowerAndUpperBoundIndices(*this, pos, &lbIndices, &ubIndices, + /*offset=*/0, + /*num=*/getNumDimIds()); Optional minDiff = None; unsigned minLbPosition, minUbPosition; @@ -2046,8 +2040,8 @@ Optional FlatAffineConstraints::getConstantBoundOnDimSize( // of the variable at 'pos'. We express the ceildiv equivalently as a floor // for uniformity. 
For eg., if the lower bound constraint was: 32*d0 - N + // 31 >= 0, the lower bound for d0 is ceil(N - 31, 32), i.e., floor(N, 32). - *lbFloorDivisor = atIneq(minLbPosition, pos); - assert(*lbFloorDivisor == -atIneq(minUbPosition, pos)); + *boundFloorDivisor = atIneq(minLbPosition, pos); + assert(*boundFloorDivisor == -atIneq(minUbPosition, pos)); for (unsigned c = 0, e = getNumSymbolIds() + 1; c < e; c++) { (*lb)[c] = -atIneq(minLbPosition, getNumDimIds() + c); } diff --git a/mlir/lib/Analysis/CMakeLists.txt b/mlir/lib/Analysis/CMakeLists.txt index f9c0236e3d4ec..262bc7e8a5882 100644 --- a/mlir/lib/Analysis/CMakeLists.txt +++ b/mlir/lib/Analysis/CMakeLists.txt @@ -24,7 +24,7 @@ add_mlir_library(MLIRAnalysis target_link_libraries(MLIRAnalysis PUBLIC - MLIRAffineOps + MLIRAffine MLIRCallInterfaces MLIRControlFlowInterfaces MLIRInferTypeOpInterface @@ -44,7 +44,7 @@ add_mlir_library(MLIRLoopAnalysis target_link_libraries(MLIRLoopAnalysis PUBLIC - MLIRAffineOps + MLIRAffine MLIRCallInterfaces MLIRControlFlowInterfaces MLIRInferTypeOpInterface diff --git a/mlir/lib/Analysis/LoopAnalysis.cpp b/mlir/lib/Analysis/LoopAnalysis.cpp index b5ff91b44062a..9b5725188faea 100644 --- a/mlir/lib/Analysis/LoopAnalysis.cpp +++ b/mlir/lib/Analysis/LoopAnalysis.cpp @@ -15,8 +15,8 @@ #include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/AffineStructures.h" #include "mlir/Analysis/NestedMatcher.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" -#include "mlir/Dialect/AffineOps/AffineValueMap.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineValueMap.h" #include "mlir/Support/MathExtras.h" #include "llvm/ADT/DenseSet.h" @@ -220,9 +220,9 @@ DenseSet mlir::getInvariantAccesses(Value iv, ArrayRef indices) { template static bool isContiguousAccess(Value iv, LoadOrStoreOp memoryOp, int *memRefDim) { - static_assert(std::is_same::value || - std::is_same::value, - "Must be called on either const LoadOp & or const StoreOp &"); + static_assert( + llvm::is_one_of::value, + "Must be called on either LoadOp or StoreOp"); assert(memRefDim && "memRefDim == nullptr"); auto memRefType = memoryOp.getMemRefType(); diff --git a/mlir/lib/Analysis/NestedMatcher.cpp b/mlir/lib/Analysis/NestedMatcher.cpp index 2324bbcf7b564..807e5df46d1c2 100644 --- a/mlir/lib/Analysis/NestedMatcher.cpp +++ b/mlir/lib/Analysis/NestedMatcher.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Analysis/NestedMatcher.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "llvm/ADT/ArrayRef.h" diff --git a/mlir/lib/Analysis/SliceAnalysis.cpp b/mlir/lib/Analysis/SliceAnalysis.cpp index fae41b4a027b1..b1e45d1cfe7ba 100644 --- a/mlir/lib/Analysis/SliceAnalysis.cpp +++ b/mlir/lib/Analysis/SliceAnalysis.cpp @@ -11,7 +11,7 @@ //===----------------------------------------------------------------------===// #include "mlir/Analysis/SliceAnalysis.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/LoopOps/LoopOps.h" #include "mlir/IR/Function.h" #include "mlir/IR/Operation.h" diff --git a/mlir/lib/Analysis/Utils.cpp b/mlir/lib/Analysis/Utils.cpp index 7b3cd58aa980a..9e400e4b6a3c4 100644 --- a/mlir/lib/Analysis/Utils.cpp +++ b/mlir/lib/Analysis/Utils.cpp @@ -14,8 +14,8 @@ #include "mlir/Analysis/Utils.h" #include "mlir/Analysis/AffineAnalysis.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" 
-#include "mlir/Dialect/AffineOps/AffineValueMap.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineValueMap.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/Support/Debug.h" @@ -368,16 +368,16 @@ Optional mlir::getMemRefSizeInBytes(MemRefType memRefType) { return sizeInBytes; } -template -LogicalResult mlir::boundCheckLoadOrStoreOp(LoadOrStoreOpPointer loadOrStoreOp, +template +LogicalResult mlir::boundCheckLoadOrStoreOp(LoadOrStoreOp loadOrStoreOp, bool emitError) { - static_assert(std::is_same::value || - std::is_same::value, - "argument should be either a AffineLoadOp or a AffineStoreOp"); + static_assert( + llvm::is_one_of::value, + "argument should be either a AffineLoadOp or a AffineStoreOp"); - Operation *opInst = loadOrStoreOp.getOperation(); - MemRefRegion region(opInst->getLoc()); - if (failed(region.compute(opInst, /*loopDepth=*/0, /*sliceState=*/nullptr, + Operation *op = loadOrStoreOp.getOperation(); + MemRefRegion region(op->getLoc()); + if (failed(region.compute(op, /*loopDepth=*/0, /*sliceState=*/nullptr, /*addMemRefDimBounds=*/false))) return success(); diff --git a/mlir/lib/Conversion/AVX512ToLLVM/CMakeLists.txt b/mlir/lib/Conversion/AVX512ToLLVM/CMakeLists.txt new file mode 100644 index 0000000000000..5573f6ca1618c --- /dev/null +++ b/mlir/lib/Conversion/AVX512ToLLVM/CMakeLists.txt @@ -0,0 +1,19 @@ +add_mlir_conversion_library(MLIRAVX512ToLLVM + ConvertAVX512ToLLVM.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/AVX512ToLLVM +) + +set(LIBS + MLIRAVX512 + MLIRLLVMAVX512 + MLIRLLVMIR + MLIRStandardToLLVM + MLIRTransforms + LLVMCore + LLVMSupport + ) + +add_dependencies(MLIRAVX512ToLLVM ${LIBS}) +target_link_libraries(MLIRAVX512ToLLVM PUBLIC ${LIBS}) diff --git a/mlir/lib/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.cpp b/mlir/lib/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.cpp new file mode 100644 index 0000000000000..af29714eb69a8 --- /dev/null +++ b/mlir/lib/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.cpp @@ -0,0 +1,197 @@ +//===- ConvertAVX512ToLLVM.cpp - Convert AVX512 to the LLVM dialect -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.h" + +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" +#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" +#include "mlir/Dialect/AVX512/AVX512Dialect.h" +#include "mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Dialect/Vector/VectorOps.h" +#include "mlir/EDSC/Intrinsics.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; +using namespace mlir::edsc; +using namespace mlir::edsc::intrinsics; +using namespace mlir::vector; +using namespace mlir::avx512; + +template +static Type getSrcVectorElementType(OpTy op) { + return op.src().getType().template cast().getElementType(); +} + +// TODO(ntv, zinenko): Code is currently copy-pasted and adapted from the code +// 1-1 LLVM conversion. 
It would better if it were properly exposed in core and +// reusable. +/// Basic lowering implementation for one-to-one rewriting from AVX512 Ops to +/// LLVM Dialect Ops. Convert the type of the result to an LLVM type, pass +/// operands as is, preserve attributes. +template +static LogicalResult +matchAndRewriteOneToOne(const ConvertToLLVMPattern &lowering, + LLVMTypeConverter &typeConverter, Operation *op, + ArrayRef operands, + ConversionPatternRewriter &rewriter) { + unsigned numResults = op->getNumResults(); + + Type packedType; + if (numResults != 0) { + packedType = typeConverter.packFunctionResults(op->getResultTypes()); + if (!packedType) + return failure(); + } + + auto newOp = rewriter.create(op->getLoc(), packedType, operands, + op->getAttrs()); + + // If the operation produced 0 or 1 result, return them immediately. + if (numResults == 0) + return rewriter.eraseOp(op), success(); + if (numResults == 1) + return rewriter.replaceOp(op, newOp.getOperation()->getResult(0)), + success(); + + // Otherwise, it had been converted to an operation producing a structure. + // Extract individual results from the structure and return them as list. + SmallVector results; + results.reserve(numResults); + for (unsigned i = 0; i < numResults; ++i) { + auto type = typeConverter.convertType(op->getResult(i).getType()); + results.push_back(rewriter.create( + op->getLoc(), type, newOp.getOperation()->getResult(0), + rewriter.getI64ArrayAttr(i))); + } + rewriter.replaceOp(op, results); + return success(); +} + +namespace { +// TODO(ntv): Patterns are too verbose due to the fact that we have 1 op (e.g. +// MaskRndScaleOp) and different possible target ops. It would be better to take +// a Functor so that all these conversions become 1-liners. +struct MaskRndScaleOpPS512Conversion : public ConvertToLLVMPattern { + explicit MaskRndScaleOpPS512Conversion(MLIRContext *context, + LLVMTypeConverter &typeConverter) + : ConvertToLLVMPattern(MaskRndScaleOp::getOperationName(), context, + typeConverter) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + if (!getSrcVectorElementType(cast(op)).isF32()) + return failure(); + return matchAndRewriteOneToOne( + *this, this->typeConverter, op, operands, rewriter); + } +}; + +struct MaskRndScaleOpPD512Conversion : public ConvertToLLVMPattern { + explicit MaskRndScaleOpPD512Conversion(MLIRContext *context, + LLVMTypeConverter &typeConverter) + : ConvertToLLVMPattern(MaskRndScaleOp::getOperationName(), context, + typeConverter) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + if (!getSrcVectorElementType(cast(op)).isF64()) + return failure(); + return matchAndRewriteOneToOne( + *this, this->typeConverter, op, operands, rewriter); + } +}; + +struct ScaleFOpPS512Conversion : public ConvertToLLVMPattern { + explicit ScaleFOpPS512Conversion(MLIRContext *context, + LLVMTypeConverter &typeConverter) + : ConvertToLLVMPattern(MaskScaleFOp::getOperationName(), context, + typeConverter) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + if (!getSrcVectorElementType(cast(op)).isF32()) + return failure(); + return matchAndRewriteOneToOne( + *this, this->typeConverter, op, operands, rewriter); + } +}; + +struct ScaleFOpPD512Conversion : public ConvertToLLVMPattern { + explicit ScaleFOpPD512Conversion(MLIRContext *context, + LLVMTypeConverter 
&typeConverter) + : ConvertToLLVMPattern(MaskScaleFOp::getOperationName(), context, + typeConverter) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + if (!getSrcVectorElementType(cast(op)).isF64()) + return failure(); + return matchAndRewriteOneToOne( + *this, this->typeConverter, op, operands, rewriter); + } +}; +} // namespace + +/// Populate the given list with patterns that convert from AVX512 to LLVM. +void mlir::populateAVX512ToLLVMConversionPatterns( + LLVMTypeConverter &converter, OwningRewritePatternList &patterns) { + MLIRContext *ctx = converter.getDialect()->getContext(); + // clang-format off + patterns.insert(ctx, converter); + // clang-format on +} + +namespace { +struct ConvertAVX512ToLLVMPass : public ModulePass { + void runOnModule() override; +}; +} // namespace + +void ConvertAVX512ToLLVMPass::runOnModule() { + // Convert to the LLVM IR dialect. + OwningRewritePatternList patterns; + LLVMTypeConverter converter(&getContext()); + populateAVX512ToLLVMConversionPatterns(converter, patterns); + populateVectorToLLVMConversionPatterns(converter, patterns); + populateStdToLLVMConversionPatterns(converter, patterns); + + ConversionTarget target(getContext()); + target.addLegalDialect(); + target.addLegalDialect(); + target.addIllegalDialect(); + target.addDynamicallyLegalOp( + [&](FuncOp op) { return converter.isSignatureLegal(op.getType()); }); + if (failed( + applyPartialConversion(getModule(), target, patterns, &converter))) { + signalPassFailure(); + } +} + +std::unique_ptr> mlir::createConvertAVX512ToLLVMPass() { + return std::make_unique(); +} + +static PassRegistration pass( + "convert-avx512-to-llvm", + "Convert the operations from the avx512 dialect into the LLVM dialect"); diff --git a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp index 9c100a280a64c..bb434abdc26c7 100644 --- a/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp +++ b/mlir/lib/Conversion/AffineToStandard/AffineToStandard.cpp @@ -13,7 +13,7 @@ #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/LoopOps/LoopOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/AffineExprVisitor.h" diff --git a/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt b/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt index aa916739697e4..5613b28e3418f 100644 --- a/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt +++ b/mlir/lib/Conversion/AffineToStandard/CMakeLists.txt @@ -7,7 +7,7 @@ add_mlir_conversion_library(MLIRAffineToStandard target_link_libraries( MLIRAffineToStandard PUBLIC - MLIRAffineOps + MLIRAffine MLIRLoopOps MLIRPass MLIRStandardOps diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index 4634345cf43e5..fbf3e12594935 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(AffineToStandard) +add_subdirectory(AVX512ToLLVM) add_subdirectory(GPUToCUDA) add_subdirectory(GPUToNVVM) add_subdirectory(GPUToROCDL) @@ -10,5 +11,6 @@ add_subdirectory(LoopsToGPU) add_subdirectory(LoopToStandard) add_subdirectory(StandardToLLVM) add_subdirectory(StandardToSPIRV) +add_subdirectory(StandardToStandard) add_subdirectory(VectorToLLVM) add_subdirectory(VectorToLoops) diff --git 
a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp index 140026eaf6434..1640978b3a18c 100644 --- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp @@ -49,8 +49,7 @@ static constexpr const char *kCubinAnnotation = "nvvm.cubin"; class GpuKernelToCubinPass : public OperationPass { public: - GpuKernelToCubinPass( - CubinGenerator cubinGenerator = compilePtxToCubinForTesting) + GpuKernelToCubinPass(CubinGenerator cubinGenerator) : cubinGenerator(cubinGenerator) {} void runOnOperation() override { @@ -76,9 +75,6 @@ class GpuKernelToCubinPass } private: - static OwnedCubin compilePtxToCubinForTesting(const std::string &ptx, - Location, StringRef); - std::string translateModuleToPtx(llvm::Module &module, llvm::TargetMachine &target_machine); @@ -112,13 +108,6 @@ std::string GpuKernelToCubinPass::translateModuleToPtx( return ptx; } -OwnedCubin -GpuKernelToCubinPass::compilePtxToCubinForTesting(const std::string &ptx, - Location, StringRef) { - const char data[] = "CUBIN"; - return std::make_unique>(data, data + sizeof(data) - 1); -} - OwnedCubin GpuKernelToCubinPass::convertModuleToCubin(llvm::Module &llvmModule, Location loc, StringRef name) { @@ -158,7 +147,3 @@ std::unique_ptr> mlir::createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator) { return std::make_unique(cubinGenerator); } - -static PassRegistration - pass("test-kernel-to-cubin", - "Convert all kernel functions to CUDA cubin blobs"); diff --git a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp index 533ef7f53b92c..5483c2330c20d 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp @@ -349,10 +349,15 @@ LogicalResult GPUFuncOpConversion::matchAndRewrite( if (!gpu::GPUDialect::isKernel(funcOp)) return failure(); + // TODO(antiagainst): we are dictating the ABI by ourselves here; it should be + // specified outside. 
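A minimal sketch of the decision made in the rewritten loop below, assuming the optional-storage-class overload of spirv::getInterfaceVarABIAttr used in this hunk: only scalar integer/index/float kernel arguments receive an explicit StorageBuffer storage class, while every other argument type leaves the storage class unset so the type converter can decide.

static spirv::InterfaceVarABIAttr getDefaultArgABIAttr(Type argType,
                                                       unsigned argIndex,
                                                       MLIRContext *ctx) {
  // Scalars are bound through a storage buffer; other types (e.g. memrefs)
  // keep the storage class unset.
  Optional<spirv::StorageClass> sc;
  if (argType.isIntOrIndexOrFloat())
    sc = spirv::StorageClass::StorageBuffer;
  return spirv::getInterfaceVarABIAttr(/*descriptorSet=*/0, argIndex, sc, ctx);
}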
SmallVector argABI; - for (auto argNum : llvm::seq(0, funcOp.getNumArguments())) { - argABI.push_back(spirv::getInterfaceVarABIAttr( - 0, argNum, spirv::StorageClass::StorageBuffer, rewriter.getContext())); + for (auto argIndex : llvm::seq(0, funcOp.getNumArguments())) { + Optional sc; + if (funcOp.getArgument(argIndex).getType().isIntOrIndexOrFloat()) + sc = spirv::StorageClass::StorageBuffer; + argABI.push_back( + spirv::getInterfaceVarABIAttr(0, argIndex, sc, rewriter.getContext())); } auto entryPointAttr = spirv::lookupEntryPointABI(funcOp); diff --git a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp index 4b84bc424fbdf..272eb163ab69b 100644 --- a/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp +++ b/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp @@ -52,14 +52,15 @@ void GPUToSPIRVPass::runOnModule() { kernelModules.push_back(builder.clone(*moduleOp.getOperation())); }); - SPIRVTypeConverter typeConverter; + auto targetAttr = spirv::lookupTargetEnvOrDefault(module); + std::unique_ptr target = + spirv::SPIRVConversionTarget::get(targetAttr); + + SPIRVTypeConverter typeConverter(targetAttr); OwningRewritePatternList patterns; populateGPUToSPIRVPatterns(context, typeConverter, patterns); populateStandardToSPIRVPatterns(context, typeConverter, patterns); - std::unique_ptr target = spirv::SPIRVConversionTarget::get( - spirv::lookupTargetEnvOrDefault(module), context); - if (failed(applyFullConversion(kernelModules, *target, patterns, &typeConverter))) { return signalPassFailure(); diff --git a/mlir/lib/Conversion/LinalgToSPIRV/LinalgToSPIRVPass.cpp b/mlir/lib/Conversion/LinalgToSPIRV/LinalgToSPIRVPass.cpp index 68d31ca724794..4477c070796ef 100644 --- a/mlir/lib/Conversion/LinalgToSPIRV/LinalgToSPIRVPass.cpp +++ b/mlir/lib/Conversion/LinalgToSPIRV/LinalgToSPIRVPass.cpp @@ -25,15 +25,15 @@ void LinalgToSPIRVPass::runOnModule() { MLIRContext *context = &getContext(); ModuleOp module = getModule(); - SPIRVTypeConverter typeConverter; + auto targetAttr = spirv::lookupTargetEnvOrDefault(module); + std::unique_ptr target = + spirv::SPIRVConversionTarget::get(targetAttr); + + SPIRVTypeConverter typeConverter(targetAttr); OwningRewritePatternList patterns; populateLinalgToSPIRVPatterns(context, typeConverter, patterns); populateBuiltinFuncToSPIRVPatterns(context, typeConverter, patterns); - auto targetEnv = spirv::lookupTargetEnvOrDefault(module); - std::unique_ptr target = - spirv::SPIRVConversionTarget::get(targetEnv, context); - // Allow builtin ops. 
target->addLegalOp(); target->addDynamicallyLegalOp( diff --git a/mlir/lib/Conversion/LoopToStandard/CMakeLists.txt b/mlir/lib/Conversion/LoopToStandard/CMakeLists.txt index 87188dbf3dbd3..d0401d80fe939 100644 --- a/mlir/lib/Conversion/LoopToStandard/CMakeLists.txt +++ b/mlir/lib/Conversion/LoopToStandard/CMakeLists.txt @@ -1,5 +1,5 @@ add_mlir_conversion_library(MLIRLoopToStandard - ConvertLoopToStandard.cpp + LoopToStandard.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/LoopToStandard diff --git a/mlir/lib/Conversion/LoopToStandard/ConvertLoopToStandard.cpp b/mlir/lib/Conversion/LoopToStandard/LoopToStandard.cpp similarity index 99% rename from mlir/lib/Conversion/LoopToStandard/ConvertLoopToStandard.cpp rename to mlir/lib/Conversion/LoopToStandard/LoopToStandard.cpp index 8f7c76c921e19..178759ad67d6f 100644 --- a/mlir/lib/Conversion/LoopToStandard/ConvertLoopToStandard.cpp +++ b/mlir/lib/Conversion/LoopToStandard/LoopToStandard.cpp @@ -1,4 +1,4 @@ -//===- ConvertLoopToStandard.cpp - ControlFlow to CFG conversion ----------===// +//===- LoopToStandard.cpp - ControlFlow to CFG conversion -----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/mlir/lib/Conversion/LoopsToGPU/CMakeLists.txt b/mlir/lib/Conversion/LoopsToGPU/CMakeLists.txt index dd69af418bdef..9a460bcf71655 100644 --- a/mlir/lib/Conversion/LoopsToGPU/CMakeLists.txt +++ b/mlir/lib/Conversion/LoopsToGPU/CMakeLists.txt @@ -7,7 +7,7 @@ add_mlir_conversion_library(MLIRLoopsToGPU ) target_link_libraries(MLIRLoopsToGPU PUBLIC - MLIRAffineOps + MLIRAffine MLIRAffineToStandard MLIRGPU MLIRIR diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp index 8023226bc3008..b9c81ea45592e 100644 --- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp +++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp @@ -15,7 +15,7 @@ #include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/GPU/ParallelLoopMapper.h" #include "mlir/Dialect/LoopOps/LoopOps.h" @@ -500,35 +500,8 @@ struct ParallelToGpuLaunchLowering : public OpRewritePattern { LogicalResult matchAndRewrite(ParallelOp parallelOp, PatternRewriter &rewriter) const override; }; - -struct MappingAnnotation { - unsigned processor; - AffineMap indexMap; - AffineMap boundMap; -}; - } // namespace -/// Extracts the mapping annotations from the provided attribute. The attribute -/// is expected to be of the form -/// { processor = , map = , bound = } -/// where the bound is optional. -static MappingAnnotation extractMappingAnnotation(Attribute attribute) { - DictionaryAttr dict = attribute.cast(); - unsigned processor = dict.get(gpu::kProcessorEntryName) - .cast() - .getValue() - .getSExtValue(); - AffineMap map = - dict.get(gpu::kIndexMapEntryName).cast().getValue(); - AffineMapAttr boundAttr = - dict.get(gpu::kBoundMapEntryName).dyn_cast_or_null(); - AffineMap bound; - if (boundAttr) - bound = boundAttr.getValue(); - return {processor, map, bound}; -} - /// Tries to derive a static upper bound from the defining operation of /// `upperBound`. 
static Value deriveStaticUpperBound(Value upperBound, @@ -546,6 +519,30 @@ static Value deriveStaticUpperBound(Value upperBound, return {}; } +static bool isMappedToProcessor(gpu::Processor processor) { + return processor != gpu::Processor::Sequential; +} + +static unsigned getLaunchOpArgumentNum(gpu::Processor processor) { + switch (processor) { + case gpu::Processor::BlockX: + return 0; + case gpu::Processor::BlockY: + return 1; + case gpu::Processor::BlockZ: + return 2; + case gpu::Processor::ThreadX: + return 3; + case gpu::Processor::ThreadY: + return 4; + case gpu::Processor::ThreadZ: + return 5; + default:; + } + llvm_unreachable( + "invalid processor type while retrieving launch op argument number"); +} + /// Modifies the current transformation state to capture the effect of the given /// `loop.parallel` operation on index substitutions and the operations to be /// inserted. @@ -568,16 +565,14 @@ static Value deriveStaticUpperBound(Value upperBound, /// inserted, a sentinel (the `gpu.launch` operation) is inserted into the /// worklist. This signals the processor of the worklist to pop the rewriter /// one scope-level up. -static LogicalResult processParallelLoop(ParallelOp parallelOp, - gpu::LaunchOp launchOp, - BlockAndValueMapping &cloningMap, - SmallVectorImpl &worklist, - DenseMap &bounds, - PatternRewriter &rewriter) { +static LogicalResult processParallelLoop( + ParallelOp parallelOp, gpu::LaunchOp launchOp, + BlockAndValueMapping &cloningMap, SmallVectorImpl &worklist, + DenseMap &bounds, PatternRewriter &rewriter) { // TODO(herhut): Verify that this is a valid GPU mapping. // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential ArrayAttr mapping = - parallelOp.getAttrOfType(gpu::kMappingAttributeName); + parallelOp.getAttrOfType(gpu::getMappingAttrName()); // TODO(herhut): Support reductions. if (!mapping || parallelOp.getNumResults() != 0) @@ -604,12 +599,17 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp, Attribute mappingAttribute; Value iv, lowerBound, upperBound, step; std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config; - MappingAnnotation annotation = extractMappingAnnotation(mappingAttribute); + auto annotation = mappingAttribute.dyn_cast(); + if (!annotation) + return parallelOp.emitOpError() + << "expected mapping attribute for lowering to GPU"; Value newIndex; + gpu::Processor processor = gpu::getProcessor(annotation); - if (annotation.processor < gpu::LaunchOp::kNumConfigOperands) { + if (isMappedToProcessor(processor)) { // Use the corresponding thread/grid index as replacement for the loop iv. - Value operand = launchOp.body().front().getArgument(annotation.processor); + Value operand = launchOp.body().front().getArgument( + getLaunchOpArgumentNum(processor)); // Take the indexmap and add the lower bound and step computations in. // This computes operand * step + lowerBound. // Use an affine map here so that it composes nicely with the provided @@ -619,11 +619,11 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp, rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) + rewriter.getAffineSymbolExpr(1)); newIndex = rewriter.create( - loc, annotation.indexMap.compose(lowerAndStep), + loc, annotation.map().getValue().compose(lowerAndStep), ValueRange{operand, step, lowerBound}); // If there was also a bound, insert that, too. // TODO(herhut): Check that we do not assign bounds twice. 
- if (annotation.boundMap) { + if (annotation.bound().getValue()) { // We pass as the single opererand to the bound-map the number of // iterations, which is (upperBound - lowerBound) ceilDiv step. To // support inner loops with dynamic upper bounds (as generated by e.g. @@ -663,19 +663,21 @@ static LogicalResult processParallelLoop(ParallelOp parallelOp, rewriter.getAffineSymbolExpr(1)) .ceilDiv(rewriter.getAffineSymbolExpr(2)))); Value launchBound = rewriter.create( - loc, annotation.boundMap.compose(stepMap), + loc, annotation.bound().getValue().compose(stepMap), ValueRange{ ensureLaunchIndependent( cloningMap.lookupOrDefault(upperBound)), ensureLaunchIndependent( cloningMap.lookupOrDefault(lowerBound)), ensureLaunchIndependent(cloningMap.lookupOrDefault(step))}); - if (bounds.find(annotation.processor) != bounds.end()) { + // todo(herhut,ravishankarm): Update the behavior of setMappingAttr + // when this condition is relaxed. + if (bounds.find(processor) != bounds.end()) { return parallelOp.emitOpError() << "cannot redefine the bound for processor " - << annotation.processor; + << static_cast(processor); } - bounds[annotation.processor] = launchBound; + bounds[processor] = launchBound; } if (!boundIsPrecise) { // We are using an approximation, create a surrounding conditional. @@ -757,7 +759,7 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, rewriter.setInsertionPointToStart(&launchOp.body().front()); BlockAndValueMapping cloningMap; - llvm::DenseMap launchBounds; + llvm::DenseMap launchBounds; SmallVector worklist; if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist, launchBounds, rewriter))) @@ -809,7 +811,8 @@ ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp, // Now that we succeeded creating the launch operation, also update the // bounds. 
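As a concrete check of the launch bound computed above, (upperBound - lowerBound) ceilDiv step for lowerBound 0, upperBound 35, and step 8 yields 5, i.e. five hardware ids are requested along the mapped dimension. A small sketch of that arithmetic (assuming a positive step):

// Number of iterations of a mapped loop.parallel dimension, which becomes the
// gpu.launch bound for that dimension: ceilDiv(upperBound - lowerBound, step).
static int64_t computeLaunchBound(int64_t lowerBound, int64_t upperBound,
                                  int64_t step) {
  assert(step > 0 && "sketch assumes a positive step");
  return (upperBound - lowerBound + step - 1) / step;
}
// computeLaunchBound(/*lowerBound=*/0, /*upperBound=*/35, /*step=*/8) == 5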
for (auto bound : launchBounds) - launchOp.setOperand(std::get<0>(bound), std::get<1>(bound)); + launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)), + std::get<1>(bound)); rewriter.eraseOp(parallelOp); return success(); diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp index 9a703199cba12..9a5e2a608df98 100644 --- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp +++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp @@ -8,7 +8,7 @@ #include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h" #include "mlir/Conversion/LoopsToGPU/LoopsToGPU.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/LoopOps/LoopOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" @@ -24,36 +24,17 @@ using namespace mlir; using namespace mlir::loop; -static llvm::cl::OptionCategory clOptionsCategory(PASS_NAME " options"); -static llvm::cl::opt - clNumBlockDims("gpu-block-dims", - llvm::cl::desc("Number of GPU block dimensions for mapping"), - llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u)); -static llvm::cl::opt clNumThreadDims( - "gpu-thread-dims", - llvm::cl::desc("Number of GPU thread dimensions for mapping"), - llvm::cl::cat(clOptionsCategory), llvm::cl::init(1u)); - -static llvm::cl::OptionCategory clLoopOpToGPUCategory(LOOPOP_TO_GPU_PASS_NAME - " options"); -static llvm::cl::list - clNumWorkGroups("gpu-num-workgroups", - llvm::cl::desc("Num workgroups in the GPU launch"), - llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated, - llvm::cl::cat(clLoopOpToGPUCategory)); -static llvm::cl::list - clWorkGroupSize("gpu-workgroup-size", - llvm::cl::desc("Workgroup Size in the GPU launch"), - llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated, - llvm::cl::cat(clLoopOpToGPUCategory)); - namespace { // A pass that traverses top-level loops in the function and converts them to // GPU launch operations. Nested launches are not allowed, so this does not // walk the function recursively to avoid considering nested loops. struct ForLoopMapper : public FunctionPass { - ForLoopMapper(unsigned numBlockDims, unsigned numThreadDims) - : numBlockDims(numBlockDims), numThreadDims(numThreadDims) {} + ForLoopMapper() = default; + ForLoopMapper(const ForLoopMapper &) {} + ForLoopMapper(unsigned numBlockDims, unsigned numThreadDims) { + this->numBlockDims = numBlockDims; + this->numThreadDims = numThreadDims; + } void runOnFunction() override { for (Block &block : getFunction()) @@ -70,8 +51,14 @@ struct ForLoopMapper : public FunctionPass { } } - unsigned numBlockDims; - unsigned numThreadDims; + Option numBlockDims{ + *this, "gpu-block-dims", + llvm::cl::desc("Number of GPU block dimensions for mapping"), + llvm::cl::init(1u)}; + Option numThreadDims{ + *this, "gpu-thread-dims", + llvm::cl::desc("Number of GPU thread dimensions for mapping"), + llvm::cl::init(1u)}; }; // A pass that traverses top-level loops in the function and convertes them to @@ -81,10 +68,13 @@ struct ForLoopMapper : public FunctionPass { // to be perfectly nested upto depth equal to size of `workGroupSize`. 
struct ImperfectlyNestedForLoopMapper : public FunctionPass { + ImperfectlyNestedForLoopMapper() = default; + ImperfectlyNestedForLoopMapper(const ImperfectlyNestedForLoopMapper &) {} ImperfectlyNestedForLoopMapper(ArrayRef numWorkGroups, - ArrayRef workGroupSize) - : numWorkGroups(numWorkGroups.begin(), numWorkGroups.end()), - workGroupSize(workGroupSize.begin(), workGroupSize.end()) {} + ArrayRef workGroupSize) { + this->numWorkGroups->assign(numWorkGroups.begin(), numWorkGroups.end()); + this->workGroupSize->assign(workGroupSize.begin(), workGroupSize.end()); + } void runOnFunction() override { // Insert the num work groups and workgroup sizes as constant values. This @@ -113,8 +103,14 @@ struct ImperfectlyNestedForLoopMapper } } } - SmallVector numWorkGroups; - SmallVector workGroupSize; + ListOption numWorkGroups{ + *this, "gpu-num-workgroups", + llvm::cl::desc("Num workgroups in the GPU launch"), llvm::cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated}; + ListOption workGroupSize{ + *this, "gpu-workgroup-size", + llvm::cl::desc("Workgroup Size in the GPU launch"), llvm::cl::ZeroOrMore, + llvm::cl::MiscFlags::CommaSeparated}; }; struct ParallelLoopToGpuPass : public OperationPass { @@ -123,7 +119,7 @@ struct ParallelLoopToGpuPass : public OperationPass { populateParallelLoopToGPUPatterns(patterns, &getContext()); ConversionTarget target(getContext()); target.addLegalDialect(); - target.addLegalDialect(); + target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); target.addIllegalOp(); @@ -152,20 +148,11 @@ std::unique_ptr mlir::createParallelLoopToGpuPass() { } static PassRegistration - registration(PASS_NAME, "Convert top-level loops to GPU kernels", [] { - return std::make_unique(clNumBlockDims.getValue(), - clNumThreadDims.getValue()); - }); - -static PassRegistration loopOpToGPU( - LOOPOP_TO_GPU_PASS_NAME, "Convert top-level loop::ForOp to GPU kernels", - [] { - SmallVector numWorkGroups, workGroupSize; - numWorkGroups.assign(clNumWorkGroups.begin(), clNumWorkGroups.end()); - workGroupSize.assign(clWorkGroupSize.begin(), clWorkGroupSize.end()); - return std::make_unique(numWorkGroups, - workGroupSize); - }); + registration(PASS_NAME, "Convert top-level loops to GPU kernels"); + +static PassRegistration + loopOpToGPU(LOOPOP_TO_GPU_PASS_NAME, + "Convert top-level loop::ForOp to GPU kernels"); static PassRegistration pass("convert-parallel-loops-to-gpu", "Convert mapped loop.parallel ops" diff --git a/mlir/lib/Conversion/StandardToLLVM/CMakeLists.txt b/mlir/lib/Conversion/StandardToLLVM/CMakeLists.txt index 50081d4d6de7d..3a5b2f6ba06c6 100644 --- a/mlir/lib/Conversion/StandardToLLVM/CMakeLists.txt +++ b/mlir/lib/Conversion/StandardToLLVM/CMakeLists.txt @@ -1,5 +1,5 @@ add_mlir_conversion_library(MLIRStandardToLLVM - ConvertStandardToLLVM.cpp + StandardToLLVM.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/StandardToLLVM diff --git a/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp similarity index 99% rename from mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp rename to mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp index 250ff3682653a..e353e933a8ae5 100644 --- a/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp +++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp @@ -1,4 +1,4 @@ -//===- ConvertStandardToLLVM.cpp - Standard to LLVM dialect conversion-----===// +//===- StandardToLLVM.cpp - Standard to LLVM dialect conversion 
-----------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. diff --git a/mlir/lib/Conversion/StandardToSPIRV/CMakeLists.txt b/mlir/lib/Conversion/StandardToSPIRV/CMakeLists.txt index 308f1b0074ed9..6d940eaf024e3 100644 --- a/mlir/lib/Conversion/StandardToSPIRV/CMakeLists.txt +++ b/mlir/lib/Conversion/StandardToSPIRV/CMakeLists.txt @@ -1,7 +1,3 @@ -set(LLVM_TARGET_DEFINITIONS StandardToSPIRV.td) -mlir_tablegen(StandardToSPIRV.cpp.inc -gen-rewriters) -add_public_tablegen_target(MLIRStandardToSPIRVIncGen) - add_mlir_conversion_library(MLIRStandardToSPIRVTransforms ConvertStandardToSPIRV.cpp ConvertStandardToSPIRVPass.cpp @@ -10,9 +6,6 @@ add_mlir_conversion_library(MLIRStandardToSPIRVTransforms ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/SPIRV ${MLIR_MAIN_INCLUDE_DIR}/mlir/IR - - DEPENDS - MLIRStandardToSPIRVIncGen ) target_link_libraries(MLIRStandardToSPIRVTransforms diff --git a/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp b/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp index 310dcd8a86bdb..ea8812cebdc4c 100644 --- a/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp +++ b/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRV.cpp @@ -6,52 +6,176 @@ // //===----------------------------------------------------------------------===// // -// This file implements patterns to convert Standard Ops to the SPIR-V dialect. +// This file implements patterns to convert standard ops to SPIR-V ops. // //===----------------------------------------------------------------------===// + #include "mlir/Dialect/SPIRV/LayoutUtils.h" #include "mlir/Dialect/SPIRV/SPIRVDialect.h" #include "mlir/Dialect/SPIRV/SPIRVLowering.h" #include "mlir/Dialect/SPIRV/SPIRVOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/AffineMap.h" +#include "mlir/Support/LogicalResult.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "std-to-spirv-pattern" using namespace mlir; +//===----------------------------------------------------------------------===// +// Utility functions +//===----------------------------------------------------------------------===// + +/// Returns true if the given `type` is a boolean scalar or vector type. +static bool isBoolScalarOrVector(Type type) { + if (type.isInteger(1)) + return true; + if (auto vecType = type.dyn_cast()) + return vecType.getElementType().isInteger(1); + return false; +} + +/// Converts the given `srcAttr` into a boolean attribute if it holds a integral +/// value. Returns null attribute if conversion fails. +static BoolAttr convertBoolAttr(Attribute srcAttr, Builder builder) { + if (auto boolAttr = srcAttr.dyn_cast()) + return boolAttr; + if (auto intAttr = srcAttr.dyn_cast()) + return builder.getBoolAttr(intAttr.getValue().getBoolValue()); + return BoolAttr(); +} + +/// Converts the given `srcAttr` to a new attribute of the given `dstType`. +/// Returns null attribute if conversion fails. +static IntegerAttr convertIntegerAttr(IntegerAttr srcAttr, IntegerType dstType, + Builder builder) { + // If the source number uses less active bits than the target bitwidth, then + // it should be safe to convert. + if (srcAttr.getValue().isIntN(dstType.getWidth())) + return builder.getIntegerAttr(dstType, srcAttr.getInt()); + + // XXX: Try again by interpreting the source number as a signed value. 
+ // Although integers in the standard dialect are signless, they can represent + // a signed number. It's the operation that decides how to interpret it. This is + // dangerous, but it seems there is no good way of handling this if we still + // want to change the bitwidth. Emit a message at least. + if (srcAttr.getValue().isSignedIntN(dstType.getWidth())) { + auto dstAttr = builder.getIntegerAttr(dstType, srcAttr.getInt()); + LLVM_DEBUG(llvm::dbgs() << "attribute '" << srcAttr << "' converted to '" + << dstAttr << "' for type '" << dstType << "'\n"); + return dstAttr; + } + + LLVM_DEBUG(llvm::dbgs() << "attribute '" << srcAttr + << "' illegal: cannot fit into target type '" + << dstType << "'\n"); + return IntegerAttr(); +} + +/// Converts the given `srcAttr` to a new attribute of the given `dstType`. +/// Returns null attribute if `dstType` is not 32-bit or conversion fails. +static FloatAttr convertFloatAttr(FloatAttr srcAttr, FloatType dstType, + Builder builder) { + // Only support converting to float for now. + if (!dstType.isF32()) + return FloatAttr(); + + // Try to convert the source floating-point number to single precision. + APFloat dstVal = srcAttr.getValue(); + bool losesInfo = false; + APFloat::opStatus status = + dstVal.convert(APFloat::IEEEsingle(), APFloat::rmTowardZero, &losesInfo); + if (status != APFloat::opOK || losesInfo) { + LLVM_DEBUG(llvm::dbgs() + << srcAttr << " illegal: cannot fit into converted type '" + << dstType << "'\n"); + return FloatAttr(); + } + + return builder.getF32FloatAttr(dstVal.convertToFloat()); +} + //===----------------------------------------------------------------------===// // Operation conversion //===----------------------------------------------------------------------===// +// Note that DRR cannot be used for the patterns in this file: we may need to +// convert types along the way, which requires ConversionPattern. DRR generates +// normal RewritePattern. + namespace { -/// Convert composite constant operation to SPIR-V dialect. -// TODO(denis0x0D) : move to DRR. -class ConstantCompositeOpConversion final : public SPIRVOpLowering { +/// Converts unary and binary standard operations to SPIR-V operations. +template +class UnaryAndBinaryOpPattern final : public SPIRVOpLowering { +public: + using SPIRVOpLowering::SPIRVOpLowering; + + LogicalResult + matchAndRewrite(StdOp operation, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + assert(operands.size() <= 2); + auto dstType = this->typeConverter.convertType(operation.getType()); + if (!dstType) + return failure(); + rewriter.template replaceOpWithNewOp(operation, dstType, operands, + ArrayRef()); + return success(); + } +}; + +/// Converts bitwise standard operations to SPIR-V operations. This is a special +/// pattern, rather than the generic UnaryAndBinaryOpPattern, because if the operands are +/// boolean values, SPIR-V uses different operations (`SPIRVLogicalOp`). For +/// non-boolean operands, SPIR-V should use `SPIRVBitwiseOp`.
+template +class BitwiseOpPattern final : public SPIRVOpLowering { +public: + using SPIRVOpLowering::SPIRVOpLowering; + + LogicalResult + matchAndRewrite(StdOp operation, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + assert(operands.size() == 2); + auto dstType = + this->typeConverter.convertType(operation.getResult().getType()); + if (!dstType) + return failure(); + if (isBoolScalarOrVector(operands.front().getType())) { + rewriter.template replaceOpWithNewOp( + operation, dstType, operands, ArrayRef()); + } else { + rewriter.template replaceOpWithNewOp( + operation, dstType, operands, ArrayRef()); + } + return success(); + } +}; + +/// Converts composite std.constant operation to spv.constant. +class ConstantCompositeOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; LogicalResult - matchAndRewrite(ConstantOp constCompositeOp, ArrayRef operands, + matchAndRewrite(ConstantOp constOp, ArrayRef operands, ConversionPatternRewriter &rewriter) const override; }; -/// Convert constant operation with IndexType return to SPIR-V constant -/// operation. Since IndexType is not used within SPIR-V dialect, this needs -/// special handling to make sure the result type and the type of the value -/// attribute are consistent. -// TODO(ravishankarm) : This should be moved into DRR. -class ConstantIndexOpConversion final : public SPIRVOpLowering { +/// Converts scalar std.constant operation to spv.constant. +class ConstantScalarOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; LogicalResult - matchAndRewrite(ConstantOp constIndexOp, ArrayRef operands, + matchAndRewrite(ConstantOp constOp, ArrayRef operands, ConversionPatternRewriter &rewriter) const override; }; -/// Convert floating-point comparison operations to SPIR-V dialect. -class CmpFOpConversion final : public SPIRVOpLowering { +/// Converts floating-point comparison operations to SPIR-V ops. +class CmpFOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; @@ -60,8 +184,8 @@ class CmpFOpConversion final : public SPIRVOpLowering { ConversionPatternRewriter &rewriter) const override; }; -/// Convert compare operation to SPIR-V dialect. -class CmpIOpConversion final : public SPIRVOpLowering { +/// Converts integer compare operation to SPIR-V ops. +class CmpIOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; @@ -70,33 +194,8 @@ class CmpIOpConversion final : public SPIRVOpLowering { ConversionPatternRewriter &rewriter) const override; }; -/// Convert integer binary operations to SPIR-V operations. Cannot use -/// tablegen for this. If the integer operation is on variables of IndexType, -/// the type of the return value of the replacement operation differs from -/// that of the replaced operation. This is not handled in tablegen-based -/// pattern specification. -// TODO(ravishankarm) : This should be moved into DRR. -template -class IntegerOpConversion final : public SPIRVOpLowering { -public: - using SPIRVOpLowering::SPIRVOpLowering; - - LogicalResult - matchAndRewrite(StdOp operation, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - auto resultType = - this->typeConverter.convertType(operation.getResult().getType()); - rewriter.template replaceOpWithNewOp( - operation, resultType, operands, ArrayRef()); - return success(); - } -}; - -/// Convert load -> spv.LoadOp. 
The operands of the replaced operation are of -/// IndexType while that of the replacement operation are of type i32. This is -/// not supported in tablegen based pattern specification. -// TODO(ravishankarm) : This should be moved into DRR. -class LoadOpConversion final : public SPIRVOpLowering { +/// Converts std.load to spv.Load. +class LoadOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; @@ -105,9 +204,8 @@ class LoadOpConversion final : public SPIRVOpLowering { ConversionPatternRewriter &rewriter) const override; }; -/// Convert return -> spv.Return. -// TODO(ravishankarm) : This should be moved into DRR. -class ReturnOpConversion final : public SPIRVOpLowering { +/// Converts std.return to spv.Return. +class ReturnOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; @@ -116,9 +214,8 @@ class ReturnOpConversion final : public SPIRVOpLowering { ConversionPatternRewriter &rewriter) const override; }; -/// Convert select -> spv.Select -// TODO(ravishankarm) : This should be moved into DRR. -class SelectOpConversion final : public SPIRVOpLowering { +/// Converts std.select to spv.Select. +class SelectOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; LogicalResult @@ -126,11 +223,8 @@ class SelectOpConversion final : public SPIRVOpLowering { ConversionPatternRewriter &rewriter) const override; }; -/// Convert store -> spv.StoreOp. The operands of the replaced operation are -/// of IndexType while that of the replacement operation are of type i32. This -/// is not supported in tablegen based pattern specification. -// TODO(ravishankarm) : This should be moved into DRR. -class StoreOpConversion final : public SPIRVOpLowering { +/// Converts std.store to spv.Store. +class StoreOpPattern final : public SPIRVOpLowering { public: using SPIRVOpLowering::SPIRVOpLowering; @@ -139,72 +233,179 @@ class StoreOpConversion final : public SPIRVOpLowering { ConversionPatternRewriter &rewriter) const override; }; +/// Converts type-casting standard operations to SPIR-V operations. +template +class TypeCastingOpPattern final : public SPIRVOpLowering { +public: + using SPIRVOpLowering::SPIRVOpLowering; + + LogicalResult + matchAndRewrite(StdOp operation, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + assert(operands.size() == 1); + auto dstType = + this->typeConverter.convertType(operation.getResult().getType()); + if (dstType == operands.front().getType()) { + // Due to type conversion, we are seeing the same source and target type. + // Then we can just erase this operation by forwarding its operand. + rewriter.replaceOp(operation, operands.front()); + } else { + rewriter.template replaceOpWithNewOp( + operation, dstType, operands, ArrayRef()); + } + return success(); + } +}; + +/// Converts std.xor to SPIR-V operations. +class XOrOpPattern final : public SPIRVOpLowering { +public: + using SPIRVOpLowering::SPIRVOpLowering; + + LogicalResult + matchAndRewrite(XOrOp xorOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override; +}; + } // namespace //===----------------------------------------------------------------------===// // ConstantOp with composite type. 
//===----------------------------------------------------------------------===// -LogicalResult ConstantCompositeOpConversion::matchAndRewrite( - ConstantOp constCompositeOp, ArrayRef operands, +LogicalResult ConstantCompositeOpPattern::matchAndRewrite( + ConstantOp constOp, ArrayRef operands, ConversionPatternRewriter &rewriter) const { - auto compositeType = - constCompositeOp.getResult().getType().dyn_cast(); - if (!compositeType) + auto srcType = constOp.getType().dyn_cast(); + if (!srcType) return failure(); - auto spirvCompositeType = typeConverter.convertType(compositeType); - if (!spirvCompositeType) + // std.constant should only have vector or tenor types. + assert(srcType.isa() || srcType.isa()); + + auto dstType = typeConverter.convertType(srcType); + if (!dstType) return failure(); - auto linearizedElements = - constCompositeOp.value().dyn_cast(); - if (!linearizedElements) + auto dstElementsAttr = constOp.value().dyn_cast(); + ShapedType dstAttrType = dstElementsAttr.getType(); + if (!dstElementsAttr) return failure(); - // If composite type has rank greater than one, then perform linearization. - if (compositeType.getRank() > 1) { - auto linearizedType = RankedTensorType::get(compositeType.getNumElements(), - compositeType.getElementType()); - linearizedElements = linearizedElements.reshape(linearizedType); + // If the composite type has more than one dimensions, perform linearization. + if (srcType.getRank() > 1) { + if (srcType.isa()) { + dstAttrType = RankedTensorType::get(srcType.getNumElements(), + srcType.getElementType()); + dstElementsAttr = dstElementsAttr.reshape(dstAttrType); + } else { + // TODO(antiagainst): add support for large vectors. + return failure(); + } + } + + Type srcElemType = srcType.getElementType(); + Type dstElemType; + // Tensor types are converted to SPIR-V array types; vector types are + // converted to SPIR-V vector/array types. + if (auto arrayType = dstType.dyn_cast()) + dstElemType = arrayType.getElementType(); + else + dstElemType = dstType.cast().getElementType(); + + // If the source and destination element types are different, perform + // attribute conversion. + if (srcElemType != dstElemType) { + SmallVector elements; + if (srcElemType.isa()) { + for (Attribute srcAttr : dstElementsAttr.getAttributeValues()) { + FloatAttr dstAttr = convertFloatAttr( + srcAttr.cast(), dstElemType.cast(), rewriter); + if (!dstAttr) + return failure(); + elements.push_back(dstAttr); + } + } else if (srcElemType.isInteger(1)) { + return failure(); + } else { + for (Attribute srcAttr : dstElementsAttr.getAttributeValues()) { + IntegerAttr dstAttr = + convertIntegerAttr(srcAttr.cast(), + dstElemType.cast(), rewriter); + if (!dstAttr) + return failure(); + elements.push_back(dstAttr); + } + } + + // Unfortunately, we cannot use dialect-specific types for element + // attributes; element attributes only works with standard types. So we need + // to prepare another converted standard types for the destination elements + // attribute. 
+ if (dstAttrType.isa()) + dstAttrType = RankedTensorType::get(dstAttrType.getShape(), dstElemType); + else + dstAttrType = VectorType::get(dstAttrType.getShape(), dstElemType); + + dstElementsAttr = DenseElementsAttr::get(dstAttrType, elements); } - rewriter.replaceOpWithNewOp( - constCompositeOp, spirvCompositeType, linearizedElements); + rewriter.replaceOpWithNewOp(constOp, dstType, + dstElementsAttr); return success(); } //===----------------------------------------------------------------------===// -// ConstantOp with index type. +// ConstantOp with scalar type. //===----------------------------------------------------------------------===// -LogicalResult ConstantIndexOpConversion::matchAndRewrite( - ConstantOp constIndexOp, ArrayRef operands, +LogicalResult ConstantScalarOpPattern::matchAndRewrite( + ConstantOp constOp, ArrayRef operands, ConversionPatternRewriter &rewriter) const { - if (!constIndexOp.getResult().getType().isa()) { + Type srcType = constOp.getType(); + if (!srcType.isIntOrIndexOrFloat()) return failure(); - } - // The attribute has index type which is not directly supported in - // SPIR-V. Get the integer value and create a new IntegerAttr. - auto constAttr = constIndexOp.value().dyn_cast(); - if (!constAttr) { + + Type dstType = typeConverter.convertType(srcType); + if (!dstType) return failure(); + + // Floating-point types. + if (srcType.isa()) { + auto srcAttr = constOp.value().cast(); + auto dstAttr = srcAttr; + + // Floating-point types not supported in the target environment are all + // converted to float type. + if (srcType != dstType) { + dstAttr = convertFloatAttr(srcAttr, dstType.cast(), rewriter); + if (!dstAttr) + return failure(); + } + + rewriter.replaceOpWithNewOp(constOp, dstType, dstAttr); + return success(); } - // Use the bitwidth set in the value attribute to decide the result type - // of the SPIR-V constant operation since SPIR-V does not support index - // types. - auto constVal = constAttr.getValue(); - auto constValType = constAttr.getType().dyn_cast(); - if (!constValType) { - return failure(); + // Bool type. + if (srcType.isInteger(1)) { + // std.constant can use 0/1 instead of true/false for i1 values. We need to + // handle that here. + auto dstAttr = convertBoolAttr(constOp.value(), rewriter); + if (!dstAttr) + return failure(); + rewriter.replaceOpWithNewOp(constOp, dstType, dstAttr); + return success(); } - auto spirvConstType = - typeConverter.convertType(constIndexOp.getResult().getType()); - auto spirvConstVal = - rewriter.getIntegerAttr(spirvConstType, constAttr.getInt()); - rewriter.replaceOpWithNewOp(constIndexOp, spirvConstType, - spirvConstVal); + + // IndexType or IntegerType. Index values are converted to 32-bit integer + // values when converting to SPIR-V. 
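To make the width checks performed by convertIntegerAttr concrete (a sketch only, using llvm::APInt, which backs IntegerAttr values): a small index constant fits into i32 directly, whereas a signless all-ones 64-bit value is rejected by isIntN(32) and only accepted once it is re-interpreted as the signed value -1.

llvm::APInt fortyTwo(/*numBits=*/64, 42);
llvm::APInt allOnes(/*numBits=*/64, -1, /*isSigned=*/true);
assert(fortyTwo.isIntN(32));       // few active bits: safe to narrow to i32
assert(!allOnes.isIntN(32));       // 64 active bits: rejected by the first check
assert(allOnes.isSignedIntN(32));  // accepted when read as the signed value -1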
+ auto srcAttr = constOp.value().cast(); + auto dstAttr = + convertIntegerAttr(srcAttr, dstType.cast(), rewriter); + if (!dstAttr) + return failure(); + rewriter.replaceOpWithNewOp(constOp, dstType, dstAttr); return success(); } @@ -213,8 +414,8 @@ LogicalResult ConstantIndexOpConversion::matchAndRewrite( //===----------------------------------------------------------------------===// LogicalResult -CmpFOpConversion::matchAndRewrite(CmpFOp cmpFOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { +CmpFOpPattern::matchAndRewrite(CmpFOp cmpFOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { CmpFOpOperandAdaptor cmpFOpOperands(operands); switch (cmpFOp.getPredicate()) { @@ -253,8 +454,8 @@ CmpFOpConversion::matchAndRewrite(CmpFOp cmpFOp, ArrayRef operands, //===----------------------------------------------------------------------===// LogicalResult -CmpIOpConversion::matchAndRewrite(CmpIOp cmpIOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { +CmpIOpPattern::matchAndRewrite(CmpIOp cmpIOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { CmpIOpOperandAdaptor cmpIOpOperands(operands); switch (cmpIOp.getPredicate()) { @@ -286,8 +487,8 @@ CmpIOpConversion::matchAndRewrite(CmpIOp cmpIOp, ArrayRef operands, //===----------------------------------------------------------------------===// LogicalResult -LoadOpConversion::matchAndRewrite(LoadOp loadOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { +LoadOpPattern::matchAndRewrite(LoadOp loadOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { LoadOpOperandAdaptor loadOperands(operands); auto loadPtr = spirv::getElementPtr( typeConverter, loadOp.memref().getType().cast(), @@ -301,8 +502,8 @@ LoadOpConversion::matchAndRewrite(LoadOp loadOp, ArrayRef operands, //===----------------------------------------------------------------------===// LogicalResult -ReturnOpConversion::matchAndRewrite(ReturnOp returnOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { +ReturnOpPattern::matchAndRewrite(ReturnOp returnOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { if (returnOp.getNumOperands()) { return failure(); } @@ -315,8 +516,8 @@ ReturnOpConversion::matchAndRewrite(ReturnOp returnOp, ArrayRef operands, //===----------------------------------------------------------------------===// LogicalResult -SelectOpConversion::matchAndRewrite(SelectOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { +SelectOpPattern::matchAndRewrite(SelectOp op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { SelectOpOperandAdaptor selectOperands(operands); rewriter.replaceOpWithNewOp(op, selectOperands.condition(), selectOperands.true_value(), @@ -329,8 +530,8 @@ SelectOpConversion::matchAndRewrite(SelectOp op, ArrayRef operands, //===----------------------------------------------------------------------===// LogicalResult -StoreOpConversion::matchAndRewrite(StoreOp storeOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const { +StoreOpPattern::matchAndRewrite(StoreOp storeOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { StoreOpOperandAdaptor storeOperands(operands); auto storePtr = spirv::getElementPtr( typeConverter, storeOp.memref().getType().cast(), @@ -341,25 +542,68 @@ StoreOpConversion::matchAndRewrite(StoreOp storeOp, ArrayRef operands, return success(); } -namespace { -/// Import the Standard Ops to SPIR-V Patterns. 
-#include "StandardToSPIRV.cpp.inc" -} // namespace +//===----------------------------------------------------------------------===// +// XorOp +//===----------------------------------------------------------------------===// + +LogicalResult +XOrOpPattern::matchAndRewrite(XOrOp xorOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { + assert(operands.size() == 2); + + if (isBoolScalarOrVector(operands.front().getType())) + return failure(); + + auto dstType = typeConverter.convertType(xorOp.getType()); + if (!dstType) + return failure(); + rewriter.replaceOpWithNewOp(xorOp, dstType, operands, + ArrayRef()); + + return success(); +} + +//===----------------------------------------------------------------------===// +// Pattern population +//===----------------------------------------------------------------------===// namespace mlir { void populateStandardToSPIRVPatterns(MLIRContext *context, SPIRVTypeConverter &typeConverter, OwningRewritePatternList &patterns) { - // Add patterns that lower operations into SPIR-V dialect. - populateWithGenerated(context, &patterns); - patterns.insert, - IntegerOpConversion, - IntegerOpConversion, - IntegerOpConversion, - IntegerOpConversion, LoadOpConversion, - ReturnOpConversion, SelectOpConversion, StoreOpConversion>( + patterns.insert< + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + UnaryAndBinaryOpPattern, + BitwiseOpPattern, + BitwiseOpPattern, + ConstantCompositeOpPattern, ConstantScalarOpPattern, CmpFOpPattern, + CmpIOpPattern, LoadOpPattern, ReturnOpPattern, SelectOpPattern, + StoreOpPattern, TypeCastingOpPattern, + TypeCastingOpPattern, + TypeCastingOpPattern, XOrOpPattern>( context, typeConverter); } } // namespace mlir diff --git a/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.cpp b/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.cpp index 7a3dae287d702..efccd168d6ea8 100644 --- a/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.cpp +++ b/mlir/lib/Conversion/StandardToSPIRV/ConvertStandardToSPIRVPass.cpp @@ -31,14 +31,15 @@ void ConvertStandardToSPIRVPass::runOnModule() { MLIRContext *context = &getContext(); ModuleOp module = getModule(); - SPIRVTypeConverter typeConverter; + auto targetAttr = spirv::lookupTargetEnvOrDefault(module); + std::unique_ptr target = + spirv::SPIRVConversionTarget::get(targetAttr); + + SPIRVTypeConverter typeConverter(targetAttr); OwningRewritePatternList patterns; populateStandardToSPIRVPatterns(context, typeConverter, patterns); populateBuiltinFuncToSPIRVPatterns(context, typeConverter, patterns); - std::unique_ptr target = spirv::SPIRVConversionTarget::get( - spirv::lookupTargetEnvOrDefault(module), context); - if (failed(applyPartialConversion(module, *target, patterns))) { return signalPassFailure(); } diff --git a/mlir/lib/Conversion/StandardToSPIRV/StandardToSPIRV.td b/mlir/lib/Conversion/StandardToSPIRV/StandardToSPIRV.td deleted file mode 100644 index a23ae5fe81c9d..0000000000000 
--- a/mlir/lib/Conversion/StandardToSPIRV/StandardToSPIRV.td +++ /dev/null @@ -1,53 +0,0 @@ -//==- StandardToSPIRV.td - Standard Ops to SPIR-V Patterns ---*- tablegen -*==// - -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Defines Patterns to lower standard ops to SPIR-V. -// -//===----------------------------------------------------------------------===// - -#ifndef MLIR_CONVERSION_STANDARDTOSPIRV_TD -#define MLIR_CONVERSION_STANDARDTOSPIRV_TD - -include "mlir/Dialect/StandardOps/IR/Ops.td" -include "mlir/Dialect/SPIRV/SPIRVOps.td" - -class BinaryOpPattern : - Pat<(src SPV_ScalarOrVectorOf:$l, SPV_ScalarOrVectorOf:$r), - (tgt $l, $r)>; - -class UnaryOpPattern : - Pat<(src type:$input), - (tgt $input)>; - -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; -def : BinaryOpPattern; - -def : UnaryOpPattern; -def : UnaryOpPattern; -def : UnaryOpPattern; - -// Constant Op -// TODO(ravishankarm): Handle lowering other constant types. -def : Pat<(ConstantOp:$result $valueAttr), - (SPV_ConstantOp $valueAttr), - [(SPV_ScalarOrVector $result)]>; - -#endif // MLIR_CONVERSION_STANDARDTOSPIRV_TD diff --git a/mlir/lib/Conversion/StandardToStandard/CMakeLists.txt b/mlir/lib/Conversion/StandardToStandard/CMakeLists.txt new file mode 100644 index 0000000000000..e1bc42a746ee9 --- /dev/null +++ b/mlir/lib/Conversion/StandardToStandard/CMakeLists.txt @@ -0,0 +1,13 @@ +add_mlir_conversion_library(MLIRStandardToStandard + StandardToStandard.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/StandardToStandard + ) +target_link_libraries(MLIRStandardToStandard + PUBLIC + MLIRIR + MLIRPass + MLIRStandardOps + MLIRTransforms + ) diff --git a/mlir/lib/Conversion/StandardToStandard/StandardToStandard.cpp b/mlir/lib/Conversion/StandardToStandard/StandardToStandard.cpp new file mode 100644 index 0000000000000..e4734f31fd633 --- /dev/null +++ b/mlir/lib/Conversion/StandardToStandard/StandardToStandard.cpp @@ -0,0 +1,49 @@ +//===- StandardToStandard.cpp - Std intra-dialect lowering ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/StandardToStandard/StandardToStandard.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Transforms/DialectConversion.h" + +using namespace mlir; + +namespace { +// Converts the operand and result types of the Standard's CallOp, used together +// with the FuncOpSignatureConversion. +struct CallOpSignatureConversion : public OpConversionPattern { + CallOpSignatureConversion(MLIRContext *ctx, TypeConverter &converter) + : OpConversionPattern(ctx), converter(converter) {} + + /// Hook for derived classes to implement combined matching and rewriting. 
+ LogicalResult + matchAndRewrite(CallOp callOp, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + FunctionType type = callOp.getCalleeType(); + + // Convert the original function results. + SmallVector convertedResults; + if (failed(converter.convertTypes(type.getResults(), convertedResults))) + return failure(); + + // Substitute with the new result types from the corresponding FuncType + // conversion. + rewriter.replaceOpWithNewOp(callOp, callOp.callee(), + convertedResults, operands); + return success(); + } + + /// The type converter to use when rewriting the signature. + TypeConverter &converter; +}; +} // end anonymous namespace + +void mlir::populateCallOpTypeConversionPattern( + OwningRewritePatternList &patterns, MLIRContext *ctx, + TypeConverter &converter) { + patterns.insert(ctx, converter); +} diff --git a/mlir/lib/Conversion/VectorToLoops/ConvertVectorToLoops.cpp b/mlir/lib/Conversion/VectorToLoops/ConvertVectorToLoops.cpp index b16f02ef6b9c9..b73d94562edcc 100644 --- a/mlir/lib/Conversion/VectorToLoops/ConvertVectorToLoops.cpp +++ b/mlir/lib/Conversion/VectorToLoops/ConvertVectorToLoops.cpp @@ -13,7 +13,7 @@ #include #include "mlir/Conversion/VectorToLoops/ConvertVectorToLoops.h" -#include "mlir/Dialect/AffineOps/EDSC/Intrinsics.h" +#include "mlir/Dialect/Affine/EDSC/Intrinsics.h" #include "mlir/Dialect/LoopOps/EDSC/Builders.h" #include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h" #include "mlir/Dialect/Vector/VectorOps.h" diff --git a/mlir/lib/Dialect/AVX512/CMakeLists.txt b/mlir/lib/Dialect/AVX512/CMakeLists.txt new file mode 100644 index 0000000000000..eb1e7dc5c4b51 --- /dev/null +++ b/mlir/lib/Dialect/AVX512/CMakeLists.txt @@ -0,0 +1,16 @@ +add_mlir_dialect_library(MLIRAVX512 + IR/AVX512Dialect.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AVX512 + + DEPENDS + MLIRAVX512IncGen + ) +target_link_libraries(MLIRAVX512 + PUBLIC + MLIRIR + MLIRSideEffects + MLIRVectorToLLVM + LLVMSupport + ) diff --git a/mlir/lib/Dialect/AVX512/IR/AVX512Dialect.cpp b/mlir/lib/Dialect/AVX512/IR/AVX512Dialect.cpp new file mode 100644 index 0000000000000..aade931ee4e7e --- /dev/null +++ b/mlir/lib/Dialect/AVX512/IR/AVX512Dialect.cpp @@ -0,0 +1,35 @@ +//===- AVX512Ops.cpp - MLIR AVX512 ops implementation ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the AVX512 dialect and its operations. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/AVX512/AVX512Dialect.h" +#include "mlir/Dialect/Vector/VectorOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/TypeUtilities.h" + +using namespace mlir; + +avx512::AVX512Dialect::AVX512Dialect(MLIRContext *context) + : Dialect(getDialectNamespace(), context) { + addOperations< +#define GET_OP_LIST +#include "mlir/Dialect/AVX512/AVX512.cpp.inc" + >(); +} + +namespace mlir { +namespace avx512 { +#define GET_OP_CLASSES +#include "mlir/Dialect/AVX512/AVX512.cpp.inc" +} // namespace avx512 +} // namespace mlir + diff --git a/mlir/lib/Dialect/Affine/CMakeLists.txt b/mlir/lib/Dialect/Affine/CMakeLists.txt new file mode 100644 index 0000000000000..c018b50f967f1 --- /dev/null +++ b/mlir/lib/Dialect/Affine/CMakeLists.txt @@ -0,0 +1,21 @@ +add_mlir_dialect_library(MLIRAffine + IR/AffineOps.cpp + IR/AffineValueMap.cpp + EDSC/Builders.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine + + DEPENDS + MLIRAffineOpsIncGen + ) +target_link_libraries(MLIRAffine + PUBLIC + MLIREDSC + MLIRIR + MLIRLoopLikeInterface + MLIRSideEffects + MLIRStandardOps + ) + +add_subdirectory(Transforms) diff --git a/mlir/lib/Dialect/AffineOps/EDSC/Builders.cpp b/mlir/lib/Dialect/Affine/EDSC/Builders.cpp similarity index 99% rename from mlir/lib/Dialect/AffineOps/EDSC/Builders.cpp rename to mlir/lib/Dialect/Affine/EDSC/Builders.cpp index e69f3d6c8c7a2..06f88dcec1be8 100644 --- a/mlir/lib/Dialect/AffineOps/EDSC/Builders.cpp +++ b/mlir/lib/Dialect/Affine/EDSC/Builders.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/AffineOps/EDSC/Builders.h" +#include "mlir/Dialect/Affine/EDSC/Builders.h" #include "mlir/Dialect/StandardOps/EDSC/Builders.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" diff --git a/mlir/lib/Dialect/Affine/EDSC/CMakeLists.txt b/mlir/lib/Dialect/Affine/EDSC/CMakeLists.txt new file mode 100644 index 0000000000000..751bfd351bc6a --- /dev/null +++ b/mlir/lib/Dialect/Affine/EDSC/CMakeLists.txt @@ -0,0 +1,17 @@ +add_mlir_dialect_library(MLIRAffine + EDSC/Builders.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine + + DEPENDS + MLIRAffineOpsIncGen + ) +target_link_libraries(MLIRAffine + PUBLIC + MLIREDSC + MLIRIR + MLIRLoopLikeInterface + MLIRSideEffects + MLIRStandardOps + ) diff --git a/mlir/lib/Dialect/AffineOps/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp similarity index 98% rename from mlir/lib/Dialect/AffineOps/AffineOps.cpp rename to mlir/lib/Dialect/Affine/IR/AffineOps.cpp index 0b8795947e06c..7bf8ab4e07a58 100644 --- a/mlir/lib/Dialect/AffineOps/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/AffineOps/AffineOps.h" -#include "mlir/Dialect/AffineOps/AffineValueMap.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineValueMap.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Function.h" #include "mlir/IR/IntegerSet.h" @@ -25,7 +25,7 @@ using llvm::dbgs; #define DEBUG_TYPE "affine-analysis" //===----------------------------------------------------------------------===// -// AffineOpsDialect Interfaces +// AffineDialect Interfaces //===----------------------------------------------------------------------===// 
namespace { @@ -64,21 +64,21 @@ struct AffineInlinerInterface : public DialectInlinerInterface { } // end anonymous namespace //===----------------------------------------------------------------------===// -// AffineOpsDialect +// AffineDialect //===----------------------------------------------------------------------===// -AffineOpsDialect::AffineOpsDialect(MLIRContext *context) +AffineDialect::AffineDialect(MLIRContext *context) : Dialect(getDialectNamespace(), context) { addOperations(); addInterfaces(); } /// Materialize a single constant operation from a given attribute value with /// the desired resultant type. -Operation *AffineOpsDialect::materializeConstant(OpBuilder &builder, +Operation *AffineDialect::materializeConstant(OpBuilder &builder, Attribute value, Type type, Location loc) { return builder.create(loc, type, value); @@ -629,8 +629,7 @@ static void canonicalizePromotedSymbols(MapOrSet *mapOrSet, template static void canonicalizeMapOrSetAndOperands(MapOrSet *mapOrSet, SmallVectorImpl *operands) { - static_assert(std::is_same::value || - std::is_same::value, + static_assert(llvm::is_one_of::value, "Argument must be either of AffineMap or IntegerSet type"); if (!mapOrSet || operands->empty()) @@ -729,13 +728,10 @@ struct SimplifyAffineOp : public OpRewritePattern { LogicalResult matchAndRewrite(AffineOpTy affineOp, PatternRewriter &rewriter) const override { - static_assert(std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value, - "affine load/store/apply op expected"); + static_assert(llvm::is_one_of::value, + "affine load/store/apply/prefetch/min/max op expected"); auto map = affineOp.getAffineMap(); AffineMap oldMap = map; auto oldOperands = affineOp.getMapOperands(); @@ -2369,4 +2365,4 @@ static ParseResult parseAffineParallelOp(OpAsmParser &parser, //===----------------------------------------------------------------------===// #define GET_OP_CLASSES -#include "mlir/Dialect/AffineOps/AffineOps.cpp.inc" +#include "mlir/Dialect/Affine/IR/AffineOps.cpp.inc" diff --git a/mlir/lib/Dialect/AffineOps/AffineValueMap.cpp b/mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp similarity index 97% rename from mlir/lib/Dialect/AffineOps/AffineValueMap.cpp rename to mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp index bac183505a717..c17f59323a7f0 100644 --- a/mlir/lib/Dialect/AffineOps/AffineValueMap.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineValueMap.cpp @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/AffineOps/AffineValueMap.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" using namespace mlir; diff --git a/mlir/lib/Dialect/AffineOps/CMakeLists.txt b/mlir/lib/Dialect/Affine/IR/CMakeLists.txt similarity index 57% rename from mlir/lib/Dialect/AffineOps/CMakeLists.txt rename to mlir/lib/Dialect/Affine/IR/CMakeLists.txt index bf490a5c9795f..91dcceaf3912b 100644 --- a/mlir/lib/Dialect/AffineOps/CMakeLists.txt +++ b/mlir/lib/Dialect/Affine/IR/CMakeLists.txt @@ -1,15 +1,14 @@ -add_mlir_dialect_library(MLIRAffineOps +add_mlir_dialect_library(MLIRAffine AffineOps.cpp AffineValueMap.cpp - EDSC/Builders.cpp ADDITIONAL_HEADER_DIRS - ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AffineOps + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine DEPENDS MLIRAffineOpsIncGen ) -target_link_libraries(MLIRAffineOps +target_link_libraries(MLIRAffine PUBLIC 
MLIREDSC MLIRIR diff --git a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp similarity index 99% rename from mlir/lib/Transforms/AffineDataCopyGeneration.cpp rename to mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp index 5409c557da83e..4f6d453fb56b2 100644 --- a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/AffineDataCopyGeneration.cpp @@ -20,11 +20,11 @@ //===----------------------------------------------------------------------===// #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Passes.h" #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopUtils.h" -#include "mlir/Transforms/Passes.h" #include "mlir/Transforms/Utils.h" #include "llvm/ADT/MapVector.h" #include "llvm/Support/CommandLine.h" diff --git a/mlir/lib/Transforms/AffineLoopInvariantCodeMotion.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp similarity index 98% rename from mlir/lib/Transforms/AffineLoopInvariantCodeMotion.cpp rename to mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp index a8ea4b9c1e106..746aceb8090a8 100644 --- a/mlir/lib/Transforms/AffineLoopInvariantCodeMotion.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/AffineLoopInvariantCodeMotion.cpp @@ -15,13 +15,13 @@ #include "mlir/Analysis/LoopAnalysis.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Passes.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopUtils.h" -#include "mlir/Transforms/Passes.h" #include "mlir/Transforms/Utils.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" diff --git a/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt new file mode 100644 index 0000000000000..89abbd521deca --- /dev/null +++ b/mlir/lib/Dialect/Affine/Transforms/CMakeLists.txt @@ -0,0 +1,29 @@ +add_mlir_dialect_library(MLIRAffineTransforms + AffineDataCopyGeneration.cpp + AffineLoopInvariantCodeMotion.cpp + LoopTiling.cpp + LoopUnroll.cpp + LoopUnrollAndJam.cpp + SuperVectorize.cpp + SimplifyAffineStructures.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine + + DEPENDS + MLIRAffineOpsIncGen + MLIRLoopLikeInterfaceIncGen + ) +target_link_libraries(MLIRAffineTransforms + PUBLIC + MLIRAffine + MLIREDSC + MLIRIR + MLIRPass + MLIRSideEffects + MLIRStandardOps + MLIRTransformUtils + MLIRVector + MLIRVectorToLLVM + ) + diff --git a/mlir/lib/Transforms/LoopTiling.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp similarity index 93% rename from mlir/lib/Transforms/LoopTiling.cpp rename to mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp index 0c411144df9c1..3f08315170c8f 100644 --- a/mlir/lib/Transforms/LoopTiling.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopTiling.cpp @@ -14,11 +14,11 @@ #include "mlir/Analysis/AffineStructures.h" #include "mlir/Analysis/LoopAnalysis.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Passes.h" #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" #include 
"mlir/Transforms/LoopUtils.h" -#include "mlir/Transforms/Passes.h" #include "mlir/Transforms/Utils.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -29,19 +29,20 @@ using namespace mlir; static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); static llvm::cl::opt - clCacheSizeKiB("tile-cache-size", + clCacheSizeKiB("affine-tile-cache-size", llvm::cl::desc("Set size of cache to tile for in KiB"), llvm::cl::cat(clOptionsCategory)); // Tile size to use for all loops (overrides -tile-sizes if provided). static llvm::cl::opt - clTileSize("tile-size", llvm::cl::desc("Use this tile size for all loops"), + clTileSize("affine-tile-size", + llvm::cl::desc("Use this tile size for all loops"), llvm::cl::cat(clOptionsCategory)); // List of tile sizes. If any of them aren't provided, they are filled with // clTileSize / kDefaultTileSize. static llvm::cl::list clTileSizes( - "tile-sizes", + "affine-tile-sizes", llvm::cl::desc( "List of tile sizes for each perfect nest (overridden by -tile-size)"), llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory)); @@ -176,13 +177,12 @@ constructTiledIndexSetHyperRect(MutableArrayRef origLoops, // TODO(bondhugula): handle non hyper-rectangular spaces. LogicalResult mlir::tileCodeGen(MutableArrayRef band, ArrayRef tileSizes) { - assert(!band.empty()); - assert(band.size() == tileSizes.size() && "Incorrect number of tile sizes"); + assert(!band.empty() && "no loops in band"); + assert(band.size() == tileSizes.size() && "Too few/many tile sizes"); // Check if the supplied for op's are all successively nested. - for (unsigned i = 1, e = band.size(); i < e; i++) { - assert(band[i].getParentOp() == band[i - 1].getOperation()); - } + for (unsigned i = 1, e = band.size(); i < e; i++) + assert(band[i].getParentOp() == band[i - 1] && "not a perfect nest / band"); auto origLoops = band; @@ -191,11 +191,11 @@ LogicalResult mlir::tileCodeGen(MutableArrayRef band, // Note that width is at least one since band isn't empty. unsigned width = band.size(); - SmallVector newLoops(2 * width); - AffineForOp innermostPointLoop; + SmallVector tiledLoops(2 * width); // The outermost among the loops as we add more.. auto *topLoop = rootAffineForOp.getOperation(); + AffineForOp innermostPointLoop; // Add intra-tile (or point) loops. 
for (unsigned i = 0; i < width; i++) { @@ -205,7 +205,7 @@ LogicalResult mlir::tileCodeGen(MutableArrayRef band, pointLoop.getBody()->getOperations().splice( pointLoop.getBody()->begin(), topLoop->getBlock()->getOperations(), topLoop); - newLoops[2 * width - 1 - i] = pointLoop; + tiledLoops[2 * width - 1 - i] = pointLoop; topLoop = pointLoop.getOperation(); if (i == 0) innermostPointLoop = pointLoop; @@ -219,7 +219,7 @@ LogicalResult mlir::tileCodeGen(MutableArrayRef band, tileSpaceLoop.getBody()->getOperations().splice( tileSpaceLoop.getBody()->begin(), topLoop->getBlock()->getOperations(), topLoop); - newLoops[2 * width - i - 1] = tileSpaceLoop; + tiledLoops[2 * width - i - 1] = tileSpaceLoop; topLoop = tileSpaceLoop.getOperation(); } @@ -233,16 +233,17 @@ LogicalResult mlir::tileCodeGen(MutableArrayRef band, getIndexSet(band, &cst); if (!cst.isHyperRectangular(0, width)) { - rootAffineForOp.emitError("tiled code generation unimplemented for the " - "non-hyperrectangular case"); + llvm::dbgs() << "tiled code generation unimplemented for the " + "non-hyperrectangular case, op:" + << *rootAffineForOp << "\n"; return failure(); } - constructTiledIndexSetHyperRect(origLoops, newLoops, tileSizes); - // In this case, the point loop IVs just replace the original ones. - for (unsigned i = 0; i < width; i++) { - origLoopIVs[i].replaceAllUsesWith(newLoops[i + width].getInductionVar()); - } + constructTiledIndexSetHyperRect(origLoops, tiledLoops, tileSizes); + + // Replace original IVs with intra-tile loop IVs. + for (unsigned i = 0; i < width; i++) + origLoopIVs[i].replaceAllUsesWith(tiledLoops[i + width].getInductionVar()); // Erase the old loop nest. rootAffineForOp.erase(); @@ -380,6 +381,7 @@ void LoopTiling::runOnFunction() { std::vector> bands; getTileableBands(getFunction(), &bands); + // Tile each band. for (auto &band : bands) { // Set up tile sizes; fill missing tile sizes at the end with default tile // size or clTileSize if one was provided. @@ -388,7 +390,7 @@ void LoopTiling::runOnFunction() { if (llvm::DebugFlag) { auto diag = band[0].emitRemark("using tile sizes ["); for (auto tSize : tileSizes) - diag << tSize << " "; + diag << tSize << ' '; diag << "]\n"; } if (failed(tileCodeGen(band, tileSizes))) diff --git a/mlir/lib/Transforms/LoopUnroll.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp similarity index 98% rename from mlir/lib/Transforms/LoopUnroll.cpp rename to mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp index 2083a1226879e..40187a7ebae7f 100644 --- a/mlir/lib/Transforms/LoopUnroll.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnroll.cpp @@ -10,10 +10,9 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Transforms/Passes.h" - #include "mlir/Analysis/LoopAnalysis.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Passes.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" diff --git a/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp b/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp new file mode 100644 index 0000000000000..218a6a2ce23db --- /dev/null +++ b/mlir/lib/Dialect/Affine/Transforms/LoopUnrollAndJam.cpp @@ -0,0 +1,105 @@ +//===- LoopUnrollAndJam.cpp - Code to perform loop unroll and jam ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements loop unroll and jam. Unroll and jam is a transformation +// that improves locality, in particular, register reuse, while also improving +// operation level parallelism. The example below shows what it does in nearly +// the general case. Loop unroll and jam currently works if the bounds of the +// loops inner to the loop being unroll-jammed do not depend on the latter. +// +// Before After unroll and jam of i by factor 2: +// +// for i, step = 2 +// for i S1(i); +// S1; S2(i); +// S2; S1(i+1); +// for j S2(i+1); +// S3; for j +// S4; S3(i, j); +// S5; S4(i, j); +// S6; S3(i+1, j) +// S4(i+1, j) +// S5(i); +// S6(i); +// S5(i+1); +// S6(i+1); +// +// Note: 'if/else' blocks are not jammed. So, if there are loops inside if +// op's, bodies of those loops will not be jammed. +//===----------------------------------------------------------------------===// +#include "mlir/Analysis/LoopAnalysis.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Passes.h" +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/BlockAndValueMapping.h" +#include "mlir/IR/Builders.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/LoopUtils.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/CommandLine.h" + +using namespace mlir; + +#define DEBUG_TYPE "affine-loop-unroll-jam" + +static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); + +// Loop unroll and jam factor. +static llvm::cl::opt + clUnrollJamFactor("unroll-jam-factor", llvm::cl::Hidden, + llvm::cl::desc("Use this unroll jam factor for all loops" + " (default 4)"), + llvm::cl::cat(clOptionsCategory)); + +namespace { +/// Loop unroll jam pass. Currently, this just unroll jams the first +/// outer loop in a Function. +struct LoopUnrollAndJam : public FunctionPass { + Optional unrollJamFactor; + static const unsigned kDefaultUnrollJamFactor = 4; + + explicit LoopUnrollAndJam(Optional unrollJamFactor = None) + : unrollJamFactor(unrollJamFactor) {} + + void runOnFunction() override; + LogicalResult runOnAffineForOp(AffineForOp forOp); +}; +} // end anonymous namespace + +std::unique_ptr> +mlir::createLoopUnrollAndJamPass(int unrollJamFactor) { + return std::make_unique( + unrollJamFactor == -1 ? None : Optional(unrollJamFactor)); +} + +void LoopUnrollAndJam::runOnFunction() { + // Currently, just the outermost loop from the first loop nest is + // unroll-and-jammed by this pass. However, runOnAffineForOp can be called on + // any for operation. + auto &entryBlock = getFunction().front(); + if (auto forOp = dyn_cast(entryBlock.front())) + runOnAffineForOp(forOp); +} + +/// Unroll and jam a 'affine.for' op. Default unroll jam factor is +/// kDefaultUnrollJamFactor. Return failure if nothing was done. +LogicalResult LoopUnrollAndJam::runOnAffineForOp(AffineForOp forOp) { + // Unroll and jam by the factor that was passed if any. + if (unrollJamFactor.hasValue()) + return loopUnrollJamByFactor(forOp, unrollJamFactor.getValue()); + // Otherwise, unroll jam by the command-line factor if one was specified. + if (clUnrollJamFactor.getNumOccurrences() > 0) + return loopUnrollJamByFactor(forOp, clUnrollJamFactor); + + // Unroll and jam by four otherwise. 
+ return loopUnrollJamByFactor(forOp, kDefaultUnrollJamFactor); +} + +static PassRegistration pass("affine-loop-unroll-jam", + "Unroll and jam loops"); diff --git a/mlir/lib/Transforms/SimplifyAffineStructures.cpp b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp similarity index 95% rename from mlir/lib/Transforms/SimplifyAffineStructures.cpp rename to mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp index 671b3eab18dfe..60ad1545d3502 100644 --- a/mlir/lib/Transforms/SimplifyAffineStructures.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SimplifyAffineStructures.cpp @@ -13,7 +13,10 @@ #include "mlir/Analysis/AffineStructures.h" #include "mlir/IR/IntegerSet.h" #include "mlir/Pass/Pass.h" -#include "mlir/Transforms/Passes.h" +#include "mlir/Dialect/Affine/IR/AffineValueMap.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Passes.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Transforms/Utils.h" #define DEBUG_TYPE "simplify-affine-structure" diff --git a/mlir/lib/Transforms/Vectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp similarity index 96% rename from mlir/lib/Transforms/Vectorize.cpp rename to mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp index 75a7d4d5cf6ba..d9a9ad969e511 100644 --- a/mlir/lib/Transforms/Vectorize.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp @@ -1,4 +1,4 @@ -//===- Vectorize.cpp - Vectorize Pass Impl --------------------------------===// +//===- SuperVectorize.cpp - Vectorize Pass Impl ---------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -15,7 +15,8 @@ #include "mlir/Analysis/NestedMatcher.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Affine/Passes.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/Vector/VectorOps.h" #include "mlir/Dialect/Vector/VectorUtils.h" @@ -27,7 +28,6 @@ #include "mlir/Support/Functional.h" #include "mlir/Support/LLVM.h" #include "mlir/Transforms/FoldUtils.h" -#include "mlir/Transforms/Passes.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -414,7 +414,7 @@ using namespace mlir; /// /// The -affine-vectorize pass with the following arguments: /// ``` -/// -affine-vectorize -virtual-vector-size 256 --test-fastest-varying=0 +/// -affine-vectorize="virtual-vector-size=256 test-fastest-varying=0" /// ``` /// /// produces this standard innermost-loop vectorized code: @@ -468,8 +468,7 @@ using namespace mlir; /// /// The -affine-vectorize pass with the following arguments: /// ``` -/// -affine-vectorize -virtual-vector-size 32 -virtual-vector-size 256 -/// --test-fastest-varying=1 --test-fastest-varying=0 +/// -affine-vectorize="virtual-vector-size=32,256 test-fastest-varying=1,0" /// ``` /// /// produces this more interesting mixed outer-innermost-loop vectorized code: @@ -531,21 +530,6 @@ using functional::map; using llvm::dbgs; using llvm::SetVector; -static llvm::cl::OptionCategory clOptionsCategory("vectorize options"); - -static llvm::cl::list clVirtualVectorSize( - "virtual-vector-size", - llvm::cl::desc("Specify an n-D virtual vector size for vectorization"), - llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory)); - -static llvm::cl::list clFastestVaryingPattern( - "test-fastest-varying", - llvm::cl::desc( 
- "Specify a 1-D, 2-D or 3-D pattern of fastest varying memory" - " dimensions to match. See defaultPatterns in Vectorize.cpp for a" - " description and examples. This is used for testing purposes"), - llvm::cl::ZeroOrMore, llvm::cl::cat(clOptionsCategory)); - /// Forward declaration. static FilterFunctionType isVectorizableLoopPtrFactory(const DenseSet ¶llelLoops, @@ -590,33 +574,35 @@ namespace { /// Base state for the vectorize pass. /// Command line arguments are preempted by non-empty pass arguments. struct Vectorize : public FunctionPass { - Vectorize(); + Vectorize() = default; + Vectorize(const Vectorize &) {} Vectorize(ArrayRef virtualVectorSize); void runOnFunction() override; - // The virtual vector size that we vectorize to. - SmallVector vectorSizes; - // Optionally, the fixed mapping from loop to fastest varying MemRef dimension - // for all the MemRefs within a loop pattern: - // the index represents the loop depth, the value represents the k^th - // fastest varying memory dimension. - // This is voluntarily restrictive and is meant to precisely target a - // particular loop/op pair, for testing purposes. - SmallVector fastestVaryingPattern; + /// The virtual vector size that we vectorize to. + ListOption vectorSizes{ + *this, "virtual-vector-size", + llvm::cl::desc("Specify an n-D virtual vector size for vectorization"), + llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated}; + /// Optionally, the fixed mapping from loop to fastest varying MemRef + /// dimension for all the MemRefs within a loop pattern: + /// the index represents the loop depth, the value represents the k^th + /// fastest varying memory dimension. + /// This is voluntarily restrictive and is meant to precisely target a + /// particular loop/op pair, for testing purposes. + ListOption fastestVaryingPattern{ + *this, "test-fastest-varying", + llvm::cl::desc( + "Specify a 1-D, 2-D or 3-D pattern of fastest varying memory" + " dimensions to match. See defaultPatterns in Vectorize.cpp for a" + " description and examples. This is used for testing purposes"), + llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated}; }; } // end anonymous namespace -Vectorize::Vectorize() - : vectorSizes(clVirtualVectorSize.begin(), clVirtualVectorSize.end()), - fastestVaryingPattern(clFastestVaryingPattern.begin(), - clFastestVaryingPattern.end()) {} - -Vectorize::Vectorize(ArrayRef virtualVectorSize) : Vectorize() { - if (!virtualVectorSize.empty()) { - this->vectorSizes.assign(virtualVectorSize.begin(), - virtualVectorSize.end()); - } +Vectorize::Vectorize(ArrayRef virtualVectorSize) { + vectorSizes->assign(virtualVectorSize.begin(), virtualVectorSize.end()); } /////// TODO(ntv): Hoist to a VectorizationStrategy.cpp when appropriate. 
@@ -1282,10 +1268,10 @@ void Vectorize::runOnFunction() { } std::unique_ptr> -mlir::createVectorizePass(ArrayRef virtualVectorSize) { +mlir::createSuperVectorizePass(ArrayRef virtualVectorSize) { return std::make_unique(virtualVectorSize); } static PassRegistration - pass("affine-vectorize", + pass("affine-super-vectorize", "Vectorize to a target independent n-D vector abstraction"); diff --git a/mlir/lib/Dialect/CMakeLists.txt b/mlir/lib/Dialect/CMakeLists.txt index fe99044a90e65..ddc147fe2657b 100644 --- a/mlir/lib/Dialect/CMakeLists.txt +++ b/mlir/lib/Dialect/CMakeLists.txt @@ -1,4 +1,5 @@ -add_subdirectory(AffineOps) +add_subdirectory(Affine) +add_subdirectory(AVX512) add_subdirectory(FxpMathOps) add_subdirectory(GPU) add_subdirectory(Linalg) diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index b83c3ca6a4213..e71a018a451c3 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -10,6 +10,8 @@ add_mlir_dialect_library(MLIRGPU DEPENDS MLIRGPUOpsIncGen + MLIRParallelLoopMapperAttrGen + MLIRParallelLoopMapperEnumsGen ) target_link_libraries(MLIRGPU PUBLIC diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp index f85a0c7027295..9697688ac850c 100644 --- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp @@ -23,6 +23,43 @@ using namespace mlir; using namespace mlir::gpu; using namespace mlir::loop; +#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.cpp.inc" +namespace mlir { + +#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.cpp.inc" +namespace gpu { + +StringRef getMappingAttrName() { return "mapping"; } + +ParallelLoopDimMapping getParallelLoopDimMappingAttr(Processor processor, + AffineMap map, + AffineMap bound) { + MLIRContext *context = map.getContext(); + OpBuilder builder(context); + return ParallelLoopDimMapping::get( + builder.getI64IntegerAttr(static_cast(processor)), + AffineMapAttr::get(map), AffineMapAttr::get(bound), context); +} + +LogicalResult setMappingAttr(loop::ParallelOp ploopOp, + ArrayRef mapping) { + // Verify that each processor is mapped to only once. + llvm::DenseSet specifiedMappings; + for (auto dimAttr : mapping) { + gpu::Processor processor = getProcessor(dimAttr); + if (processor != gpu::Processor::Sequential && + specifiedMappings.count(processor)) + return ploopOp.emitError( + "invalid mapping multiple loops to same processor"); + } + ArrayRef mappingAsAttrs(mapping.data(), mapping.size()); + ploopOp.setAttr(getMappingAttrName(), + ArrayAttr::get(mappingAsAttrs, ploopOp.getContext())); + return success(); +} +} // namespace gpu +} // namespace mlir + namespace { enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 }; @@ -43,10 +80,41 @@ MappingLevel &operator++(MappingLevel &mappingLevel) { /// Computed the hardware id to use for a given mapping level. Will /// assign x,y and z hardware ids for the first 3 dimensions and use /// sequential after. -static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) { +/// TODO(ravishankarm/herhut) : Make this use x for the inner-most loop that is +/// distributed to map to x, the next innermost to y and the next innermost to +/// z. 
+static gpu::Processor getHardwareIdForMapping(MappingLevel level, + int dimension) { + if (dimension >= kNumHardwareIds || level == Sequential) - return Sequential * kNumHardwareIds; - return (level * kNumHardwareIds) + dimension; + return Processor::Sequential; + switch (level) { + case MapGrid: + switch (dimension) { + case 0: + return Processor::BlockX; + case 1: + return Processor::BlockY; + case 2: + return Processor::BlockZ; + default: + return Processor::Sequential; + } + break; + case MapBlock: + switch (dimension) { + case 0: + return Processor::ThreadX; + case 1: + return Processor::ThreadY; + case 2: + return Processor::ThreadZ; + default: + return Processor::Sequential; + } + default:; + } + return Processor::Sequential; } /// Add mapping information to the given parallel loop. Do not add @@ -55,26 +123,20 @@ static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) { static void mapParallelOp(ParallelOp parallelOp, MappingLevel mappingLevel = MapGrid) { // Do not try to add a mapping to already mapped loops or nested loops. - if (parallelOp.getAttr(gpu::kMappingAttributeName) || + if (parallelOp.getAttr(getMappingAttrName()) || ((mappingLevel == MapGrid) && parallelOp.getParentOfType())) return; MLIRContext *ctx = parallelOp.getContext(); Builder b(ctx); - SmallVector attrs; + SmallVector attrs; attrs.reserve(parallelOp.getNumInductionVars()); for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) { - SmallVector entries; - entries.emplace_back(b.getNamedAttr( - kProcessorEntryName, - b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i)))); - entries.emplace_back(b.getNamedAttr( - kIndexMapEntryName, AffineMapAttr::get(b.getDimIdentityMap()))); - entries.emplace_back(b.getNamedAttr( - kBoundMapEntryName, AffineMapAttr::get(b.getDimIdentityMap()))); - attrs.push_back(DictionaryAttr::get(entries, ctx)); + attrs.push_back(getParallelLoopDimMappingAttr( + getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(), + b.getDimIdentityMap())); } - parallelOp.setAttr(kMappingAttributeName, ArrayAttr::get(attrs, ctx)); + setMappingAttr(parallelOp, attrs); ++mappingLevel; // Parallel loop operations are immediately nested, so do not use // walk but just iterate over the operations. diff --git a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt index 2e53d29f768da..148bc4bef3e88 100644 --- a/mlir/lib/Dialect/LLVMIR/CMakeLists.txt +++ b/mlir/lib/Dialect/LLVMIR/CMakeLists.txt @@ -24,6 +24,26 @@ target_link_libraries(MLIRLLVMIR MLIRSupport ) +add_mlir_dialect_library(MLIRLLVMAVX512 + IR/LLVMAVX512Dialect.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/LLVMIR + + DEPENDS + MLIRLLVMAVX512IncGen + MLIRLLVMAVX512ConversionsIncGen + ) +target_link_libraries(MLIRLLVMAVX512 + PUBLIC + LLVMAsmParser + MLIRIR + MLIRLLVMIR + MLIRSideEffects + LLVMSupport + LLVMCore + ) + add_mlir_dialect_library(MLIRNVVMIR IR/NVVMDialect.cpp diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMAVX512Dialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMAVX512Dialect.cpp new file mode 100644 index 0000000000000..bde81144fb54b --- /dev/null +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMAVX512Dialect.cpp @@ -0,0 +1,36 @@ +//===- LLVMAVX512Dialect.cpp - MLIR LLVMAVX512 ops implementation ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the LLVMAVX512 dialect and its operations. +// +//===----------------------------------------------------------------------===// + +#include "llvm/IR/IntrinsicsX86.h" + +#include "mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/OpImplementation.h" +#include "mlir/IR/TypeUtilities.h" + +using namespace mlir; + +LLVM::LLVMAVX512Dialect::LLVMAVX512Dialect(MLIRContext *context) + : Dialect(getDialectNamespace(), context) { + addOperations< +#define GET_OP_LIST +#include "mlir/Dialect/LLVMIR/LLVMAVX512.cpp.inc" + >(); +} + +namespace mlir { +namespace LLVM { +#define GET_OP_CLASSES +#include "mlir/Dialect/LLVMIR/LLVMAVX512.cpp.inc" +} // namespace LLVM +} // namespace mlir diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 2a2e6699fee5e..c69530b28e29c 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -407,6 +407,11 @@ static ParseResult parseInvokeOp(OpAsmParser &parser, OperationState &result) { static LogicalResult verify(LandingpadOp op) { Value value; + if (LLVMFuncOp func = op.getParentOfType()) { + if (!func.personality().hasValue()) + return op.emitError( + "llvm.landingpad needs to be in a function with a personality"); + } if (!op.cleanup() && op.getOperands().empty()) return op.emitError("landingpad instruction expects at least one clause or " diff --git a/mlir/lib/Dialect/Linalg/EDSC/Builders.cpp b/mlir/lib/Dialect/Linalg/EDSC/Builders.cpp index e2c64f0501584..198c7fc698dd2 100644 --- a/mlir/lib/Dialect/Linalg/EDSC/Builders.cpp +++ b/mlir/lib/Dialect/Linalg/EDSC/Builders.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "mlir/IR/Builders.h" -#include "mlir/Dialect/AffineOps/EDSC/Intrinsics.h" +#include "mlir/Dialect/Affine/EDSC/Intrinsics.h" #include "mlir/Dialect/Linalg/EDSC/Intrinsics.h" #include "mlir/Dialect/LoopOps/EDSC/Builders.h" #include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h" diff --git a/mlir/lib/Dialect/Linalg/EDSC/CMakeLists.txt b/mlir/lib/Dialect/Linalg/EDSC/CMakeLists.txt index 85ecf2802b25f..bc9e244d4ad54 100644 --- a/mlir/lib/Dialect/Linalg/EDSC/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/EDSC/CMakeLists.txt @@ -12,7 +12,7 @@ target_link_libraries(MLIRLinalgEDSC PUBLIC MLIREDSC MLIRIR - MLIRAffineOps + MLIRAffine MLIRLinalgOps MLIRLoopOps MLIRStandardOps diff --git a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt index 82e36f8a00b14..349c2d6980acf 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Transforms/CMakeLists.txt @@ -14,7 +14,7 @@ add_mlir_dialect_library(MLIRLinalgTransforms ) target_link_libraries(MLIRLinalgTransforms PUBLIC - MLIRAffineOps + MLIRAffine MLIRAnalysis MLIREDSC MLIRIR diff --git a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp index 8da000fa5260d..b6af16c979c35 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp @@ -55,14 +55,6 @@ using llvm::dbgs; /// More advanced use cases, analyses as well as profitability heuristics are /// left for future work. 
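Editor's note on the LLVMDialect.cpp hunk above: the new llvm.landingpad check is an instance of the usual parent-lookup verification idiom. A minimal sketch follows; the helper name is invented, while LLVMFuncOp, personality(), and getParentOfType come from that hunk.

#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Operation.h"
#include "mlir/Support/LogicalResult.h"

using namespace mlir;

// Reject `op` when its enclosing LLVM function does not declare a
// personality, mirroring the llvm.landingpad verifier added above.
static LogicalResult verifyHasPersonality(Operation *op) {
  if (auto func = op->getParentOfType<LLVM::LLVMFuncOp>())
    if (!func.personality().hasValue())
      return op->emitError(
          "expected the enclosing llvm.func to declare a personality");
  return success();
}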
-static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); -static llvm::cl::list clTileSizes( - "linalg-fusion-tile-sizes", - llvm::cl::desc( - "Tile sizes by which to tile linalg operations during linalg fusion"), - llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated, - llvm::cl::cat(clOptionsCategory)); - // Return a cloned version of `op` that operates on `loopRanges`, assumed to be // a subset of the original loop ranges of `op`. // This is achieved by applying the `loopToOperandRangesMaps` permutation maps diff --git a/mlir/lib/Dialect/Linalg/Transforms/LinalgToLoops.cpp b/mlir/lib/Dialect/Linalg/Transforms/LinalgToLoops.cpp index 4dc41e2c87ae0..5e1748cc47c00 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/LinalgToLoops.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/LinalgToLoops.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/AffineOps/EDSC/Intrinsics.h" +#include "mlir/Dialect/Affine/EDSC/Intrinsics.h" #include "mlir/Dialect/Linalg/EDSC/Intrinsics.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/Linalg/IR/LinalgTypes.h" @@ -33,6 +33,7 @@ using namespace mlir::linalg; using edsc::op::operator+; using edsc::op::operator==; +using mlir::edsc::intrinsics::detail::ValueHandleArray; static SmallVector makeCanonicalAffineApplies(OpBuilder &b, Location loc, AffineMap map, @@ -81,6 +82,30 @@ SmallVector emitLoopRanges(OpBuilder &b, Location loc, AffineMap map, return res; } +template +static void inlineRegionAndEmitStdStore(OpType op, + ArrayRef indexedValues, + ArrayRef indexing, + ArrayRef outputBuffers) { + auto &b = ScopedContext::getBuilder(); + auto &block = op.region().front(); + BlockAndValueMapping map; + map.map(block.getArguments(), indexedValues); + for (auto &op : block.without_terminator()) { + assert(op.getNumRegions() == 0 && "expected a non-nested region"); + auto *newOp = b.clone(op, map); + map.map(op.getResults(), newOp->getResults()); + } + + Operation &terminator = block.back(); + assert(isa(terminator) && + "expected an yield op in the end of the region"); + for (unsigned i = 0, e = terminator.getNumOperands(); i < e; ++i) { + std_store(map.lookup(terminator.getOperand(i)), outputBuffers[i], + indexing[i]); + } +} + namespace { template class LinalgScopedEmitter {}; @@ -185,9 +210,9 @@ class LinalgScopedEmitter { if (!convOp.padding()) return im(imIdx); + auto *context = ScopedContext::getContext(); ValueHandle zeroIndex = std_constant_index(0); - SmallVector conds = { - std_constant_int(/*value=*/1, /*width=*/1)}; + SmallVector conds; SmallVector clampedImIdx; for (auto iter : llvm::enumerate(imIdx)) { int idx = iter.index(); @@ -201,13 +226,16 @@ class LinalgScopedEmitter { using edsc::op::operator<; using edsc::op::operator>=; using edsc::op::operator||; - conds.push_back(conds.back() || (dim < zeroIndex)); - ValueHandle bound = std_dim(convOp.input(), idx); - conds.push_back(conds.back() || (dim >= bound)); + ValueHandle leftOutOfBound = dim < zeroIndex; + if (conds.empty()) + conds.push_back(leftOutOfBound); + else + conds.push_back(conds.back() || leftOutOfBound); + ValueHandle rightBound = std_dim(convOp.input(), idx); + conds.push_back(conds.back() || (dim >= rightBound)); // When padding is involed, the indices will only be shifted to negative, // so having a max op is enough. 
- auto *context = ScopedContext::getContext(); auto maxMap = AffineMap::get(/*dimCount=*/1, 0, {getAffineDimExpr(/*position=*/0, context), getAffineConstantExpr(0, context)}); @@ -219,7 +247,8 @@ class LinalgScopedEmitter { Type type = convOp.input().getType().cast().getElementType(); ValueHandle zero = std_constant(type, b.getZeroAttr(type)); ValueHandle readInput = im(clampedImIdx); - return std_select(conds.back(), zero, readInput); + return conds.empty() ? readInput + : std_select(conds.back(), zero, readInput); } static void emitScalarImplementation(ArrayRef allIvs, ConvOp convOp) { @@ -296,6 +325,8 @@ class LinalgScopedEmitter { } // 1.b. Emit std_load from output views. + // TODO(mravishankar): Avoid the loads if the corresponding argument of the + // region has no uses. for (unsigned i = 0; i < nOutputs; ++i) { Value output = genericOp.getOutputBuffer(i); ValueHandleArray indexing(makeCanonicalAffineApplies( @@ -320,24 +351,16 @@ class LinalgScopedEmitter { } // TODO(ntv): When a region inliner exists, use it. // 2. Inline region, currently only works for a single basic block. - BlockAndValueMapping map; - auto &block = genericOp.region().front(); - map.map(block.getArguments(), indexedValues); - for (auto &op : block.without_terminator()) { - assert(op.getNumRegions() == 0); - auto *newOp = b.clone(op, map); - map.map(op.getResults(), newOp->getResults()); - } - // 3. Emit std_store. - auto *yieldOp = cast(block.back()).getOperation(); - assert(yieldOp->getNumOperands() == nOutputs); + SmallVector indexing; + SmallVector outputBuffers; for (unsigned i = 0; i < nOutputs; ++i) { - ValueHandleArray indexing(makeCanonicalAffineApplies( + indexing.emplace_back(makeCanonicalAffineApplies( b, loc, genericOp.getOutputIndexingMap(i), allIvs)); - std_store(map.lookup(yieldOp->getOperand(i)), - genericOp.getOutputBuffer(i), indexing); + outputBuffers.push_back(genericOp.getOutputBuffer(i)); } + inlineRegionAndEmitStdStore(genericOp, indexedValues, indexing, + outputBuffers); } }; @@ -393,25 +416,17 @@ class LinalgScopedEmitter { // 1.a. Emit std_load from input views. for (unsigned i = 0; i < nInputs; ++i) { Value input = indexedGenericOp.getInput(i); - if (input.getType().cast().getRank()) { - ValueHandleArray indexing(makeCanonicalAffineApplies( - b, loc, indexedGenericOp.getInputIndexingMap(i), allIvs)); - indexedValues[nLoops + i] = std_load(input, indexing); - } else { - indexedValues[nLoops + i] = std_load(input); - } + ValueHandleArray indexing(makeCanonicalAffineApplies( + b, loc, indexedGenericOp.getInputIndexingMap(i), allIvs)); + indexedValues[nLoops + i] = std_load(input, indexing); } // 1.b. Emit std_load from output views. for (unsigned i = 0; i < nOutputs; ++i) { Value output = indexedGenericOp.getOutputBuffer(i); - if (output.getType().cast().getRank()) { - ValueHandleArray indexing(makeCanonicalAffineApplies( - b, loc, indexedGenericOp.getOutputIndexingMap(i), allIvs)); - indexedValues[nLoops + nInputs + i] = std_load(output, indexing); - } else { - indexedValues[nLoops + nInputs + i] = std_load(output); - } + ValueHandleArray indexing(makeCanonicalAffineApplies( + b, loc, indexedGenericOp.getOutputIndexingMap(i), allIvs)); + indexedValues[nLoops + nInputs + i] = std_load(output, indexing); } if (auto funcOp = indexedGenericOp.getFunction()) { @@ -422,40 +437,24 @@ class LinalgScopedEmitter { // 3. Emit std_store. 
for (unsigned i = 0; i < nOutputs; ++i) { Value output = indexedGenericOp.getOutputBuffer(i); - if (output.getType().cast().getRank()) { - ValueHandleArray indexing(makeCanonicalAffineApplies( - b, loc, indexedGenericOp.getOutputIndexingMap(i), allIvs)); - std_store(callOp->getResult(i), output, indexing); - } else { - std_store(callOp->getResult(i), output); - } + ValueHandleArray indexing(makeCanonicalAffineApplies( + b, loc, indexedGenericOp.getOutputIndexingMap(i), allIvs)); + std_store(callOp->getResult(i), output, indexing); } return; } // TODO(ntv): When a region inliner exists, use it. // 2. Inline region, currently only works for a single basic block. - BlockAndValueMapping map; - auto &block = indexedGenericOp.region().front(); - map.map(block.getArguments(), indexedValues); - for (auto &op : block.without_terminator()) { - assert(op.getNumRegions() == 0); - auto *newOp = b.clone(op, map); - map.map(op.getResults(), newOp->getResults()); - } - // 3. Emit std_store. - auto *yieldOp = cast(block.back()).getOperation(); - assert(yieldOp->getNumOperands() == nOutputs); + SmallVector indexing; + SmallVector outputBuffers; for (unsigned i = 0; i < nOutputs; ++i) { - Value output = indexedGenericOp.getOutputBuffer(i); - if (output.getType().cast().getRank()) { - ValueHandleArray indexing(makeCanonicalAffineApplies( - b, loc, indexedGenericOp.getOutputIndexingMap(i), allIvs)); - std_store(map.lookup(yieldOp->getOperand(i)), output, indexing); - } else { - std_store(map.lookup(yieldOp->getOperand(i)), output); - } + indexing.emplace_back(makeCanonicalAffineApplies( + b, loc, indexedGenericOp.getOutputIndexingMap(i), allIvs)); + outputBuffers.push_back(indexedGenericOp.getOutputBuffer(i)); } + inlineRegionAndEmitStdStore(indexedGenericOp, indexedValues, indexing, + outputBuffers); } }; diff --git a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp index 54a4290e6e361..8a6b5cf8b5daf 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/AffineOps/EDSC/Intrinsics.h" +#include "mlir/Dialect/Affine/EDSC/Intrinsics.h" #include "mlir/Dialect/Linalg/EDSC/Intrinsics.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/Linalg/IR/LinalgTypes.h" @@ -43,12 +43,6 @@ using folded_linalg_range = folded::ValueBuilder; #define DEBUG_TYPE "linalg-promotion" -static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); -static llvm::cl::opt clPromoteDynamic( - "test-linalg-promote-dynamic", - llvm::cl::desc("Test generation of dynamic promoted buffers"), - llvm::cl::cat(clOptionsCategory), llvm::cl::init(false)); - static Value allocBuffer(Type elementType, Value size, bool dynamicBuffers) { auto *ctx = size.getContext(); auto width = llvm::divideCeil(elementType.getIntOrFloatBitWidth(), 8); @@ -238,13 +232,19 @@ static void promoteSubViews(FuncOp f, bool dynamicBuffers) { namespace { struct LinalgPromotionPass : public FunctionPass { LinalgPromotionPass() = default; - LinalgPromotionPass(bool dynamicBuffers) : dynamicBuffers(dynamicBuffers) {} + LinalgPromotionPass(const LinalgPromotionPass &) {} + LinalgPromotionPass(bool dynamicBuffers) { + this->dynamicBuffers = dynamicBuffers; + } void runOnFunction() override { promoteSubViews(getFunction(), dynamicBuffers); } - bool dynamicBuffers; + Option dynamicBuffers{ + *this, 
"test-promote-dynamic", + llvm::cl::desc("Test generation of dynamic promoted buffers"), + llvm::cl::init(false)}; }; } // namespace @@ -254,6 +254,4 @@ mlir::createLinalgPromotionPass(bool dynamicBuffers) { } static PassRegistration - pass("linalg-promote-subviews", "promote subview ops to local buffers", [] { - return std::make_unique(clPromoteDynamic); - }); + pass("linalg-promote-subviews", "promote subview ops to local buffers"); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp index cabdd7497cafe..2d9ca16c63b69 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/AffineOps/EDSC/Intrinsics.h" +#include "mlir/Dialect/Affine/EDSC/Intrinsics.h" #include "mlir/Dialect/Linalg/EDSC/Intrinsics.h" #include "mlir/Dialect/Linalg/IR/LinalgTypes.h" #include "mlir/Dialect/Linalg/Passes.h" @@ -39,13 +39,6 @@ using folded_affine_min = folded::ValueBuilder; #define DEBUG_TYPE "linalg-tiling" -static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); -static llvm::cl::list - clTileSizes("linalg-tile-sizes", - llvm::cl::desc("Tile sizes by which to tile linalg operations"), - llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated, - llvm::cl::cat(clOptionsCategory)); - static bool isZero(Value v) { return isa_and_nonnull(v.getDefiningOp()) && cast(v.getDefiningOp()).getValue() == 0; @@ -513,15 +506,19 @@ namespace { template struct LinalgTilingPass : public FunctionPass> { LinalgTilingPass() = default; + LinalgTilingPass(const LinalgTilingPass &) {} LinalgTilingPass(ArrayRef sizes) { - this->tileSizes.assign(sizes.begin(), sizes.end()); + this->tileSizes->assign(sizes.begin(), sizes.end()); } void runOnFunction() override { tileLinalgOps(this->getFunction(), tileSizes); } - SmallVector tileSizes; + Pass::ListOption tileSizes{ + *this, "linalg-tile-sizes", + llvm::cl::desc("Tile sizes by which to tile linalg operations"), + llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated}; }; } // namespace @@ -537,17 +534,9 @@ mlir::createLinalgTilingToParallelLoopsPass(ArrayRef tileSizes) { } static PassRegistration> - tiling_pass("linalg-tile", "Tile operations in the linalg dialect", [] { - auto pass = std::make_unique>(); - pass->tileSizes.assign(clTileSizes.begin(), clTileSizes.end()); - return pass; - }); + tiling_pass("linalg-tile", "Tile operations in the linalg dialect"); static PassRegistration> tiling_to_parallel_loops( "linalg-tile-to-parallel-loops", - "Tile operations in the linalg dialect to parallel loops", [] { - auto pass = std::make_unique>(); - pass->tileSizes.assign(clTileSizes.begin(), clTileSizes.end()); - return pass; - }); + "Tile operations in the linalg dialect to parallel loops"); diff --git a/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt index 681a47d31271e..f9ad613f2a175 100644 --- a/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt @@ -9,7 +9,7 @@ add_mlir_dialect_library(MLIRLinalgUtils target_link_libraries(MLIRLinalgUtils PUBLIC - MLIRAffineOps + MLIRAffine MLIREDSC MLIRIR MLIRLinalgOps diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp index 9cc6aa48c9665..c572be4d132e3 100644 --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -11,7 +11,7 @@ 
//===----------------------------------------------------------------------===// #include "mlir/Dialect/Linalg/Utils/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/Linalg/IR/LinalgTypes.h" #include "mlir/Dialect/LoopOps/LoopOps.h" diff --git a/mlir/lib/Dialect/LoopOps/LoopOps.cpp b/mlir/lib/Dialect/LoopOps/LoopOps.cpp index 4d1533aba8595..3d7ee3846c22d 100644 --- a/mlir/lib/Dialect/LoopOps/LoopOps.cpp +++ b/mlir/lib/Dialect/LoopOps/LoopOps.cpp @@ -201,12 +201,25 @@ ForOp mlir::loop::getForInductionVarOwner(Value val) { void IfOp::build(Builder *builder, OperationState &result, Value cond, bool withElseRegion) { + build(builder, result, /*resultTypes=*/llvm::None, cond, withElseRegion); +} + +void IfOp::build(Builder *builder, OperationState &result, + TypeRange resultTypes, Value cond, bool withElseRegion) { result.addOperands(cond); + result.addTypes(resultTypes); + Region *thenRegion = result.addRegion(); + thenRegion->push_back(new Block()); + if (resultTypes.empty()) + IfOp::ensureTerminator(*thenRegion, *builder, result.location); + Region *elseRegion = result.addRegion(); - IfOp::ensureTerminator(*thenRegion, *builder, result.location); - if (withElseRegion) - IfOp::ensureTerminator(*elseRegion, *builder, result.location); + if (withElseRegion) { + elseRegion->push_back(new Block()); + if (resultTypes.empty()) + IfOp::ensureTerminator(*elseRegion, *builder, result.location); + } } static LogicalResult verify(IfOp op) { diff --git a/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt b/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt index 1b6e6d2327118..2ec44b4722987 100644 --- a/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/LoopOps/Transforms/CMakeLists.txt @@ -8,7 +8,7 @@ add_mlir_dialect_library(MLIRLoopOpsTransforms ) target_link_libraries(MLIRLoopOpsTransforms PUBLIC - MLIRAffineOps + MLIRAffine MLIRIR MLIRPass MLIRLoopOps diff --git a/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopSpecialization.cpp b/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopSpecialization.cpp index 8cb49f3428d6f..c692c0174f0c6 100644 --- a/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopSpecialization.cpp +++ b/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopSpecialization.cpp @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/LoopOps/LoopOps.h" #include "mlir/Dialect/LoopOps/Passes.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" diff --git a/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopTiling.cpp b/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopTiling.cpp index 6bced3761afb2..85fd241cee7e3 100644 --- a/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopTiling.cpp +++ b/mlir/lib/Dialect/LoopOps/Transforms/ParallelLoopTiling.cpp @@ -10,7 +10,7 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/LoopOps/LoopOps.h" #include "mlir/Dialect/LoopOps/Passes.h" #include "mlir/Dialect/LoopOps/Transforms.h" diff --git a/mlir/lib/Dialect/Quant/Utils/UniformSupport.cpp b/mlir/lib/Dialect/Quant/Utils/UniformSupport.cpp index 4d5db81e326de..991d7c179f900 100644 --- a/mlir/lib/Dialect/Quant/Utils/UniformSupport.cpp +++ 
b/mlir/lib/Dialect/Quant/Utils/UniformSupport.cpp @@ -46,8 +46,8 @@ Type ExpressedToQuantizedConverter::convert(QuantizedType elementalType) const { switch (inputType.getKind()) { default: - if (isQuantizablePrimitiveType(elementalType)) { - // For primitives, just use the new elemental type. + if (elementalType.getExpressedType() == expressedType) { + // If the expressed types match, just use the new elemental type. return elementalType; } // Unsupported. diff --git a/mlir/lib/Dialect/SPIRV/LayoutUtils.cpp b/mlir/lib/Dialect/SPIRV/LayoutUtils.cpp index 44930b91e0ffd..d4ce17c93706d 100644 --- a/mlir/lib/Dialect/SPIRV/LayoutUtils.cpp +++ b/mlir/lib/Dialect/SPIRV/LayoutUtils.cpp @@ -59,7 +59,7 @@ VulkanLayoutUtils::decorateType(spirv::StructType structType, Type VulkanLayoutUtils::decorateType(Type type, VulkanLayoutUtils::Size &size, VulkanLayoutUtils::Size &alignment) { - if (spirv::SPIRVDialect::isValidScalarType(type)) { + if (type.isa()) { alignment = VulkanLayoutUtils::getScalarTypeAlignment(type); // Vulkan spec does not specify any padding for a scalar type. size = alignment; diff --git a/mlir/lib/Dialect/SPIRV/SPIRVCanonicalization.cpp b/mlir/lib/Dialect/SPIRV/SPIRVCanonicalization.cpp index f378047f36eac..953d95b449d15 100644 --- a/mlir/lib/Dialect/SPIRV/SPIRVCanonicalization.cpp +++ b/mlir/lib/Dialect/SPIRV/SPIRVCanonicalization.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/CommonFolders.h" #include "mlir/Dialect/SPIRV/SPIRVDialect.h" +#include "mlir/Dialect/SPIRV/SPIRVTypes.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Support/Functional.h" @@ -358,15 +359,6 @@ struct ConvertSelectionOpToSelect rhs.getOperation()->getAttrList().getDictionary(); } - // Checks that given type is valid for `spv.SelectOp`. - // According to SPIR-V spec: - // "Before version 1.4, Result Type must be a pointer, scalar, or vector. - // Starting with version 1.4, Result Type can additionally be a composite type - // other than a vector." - bool isValidType(Type type) const { - return spirv::SPIRVDialect::isValidScalarType(type) || - type.isa(); - } // Returns a source value for the given block. Value getSrcValue(Block *block) const { @@ -401,11 +393,20 @@ LogicalResult ConvertSelectionOpToSelect::canCanonicalizeSelection( return failure(); } + // Checks that given type is valid for `spv.SelectOp`. + // According to SPIR-V spec: + // "Before version 1.4, Result Type must be a pointer, scalar, or vector. + // Starting with version 1.4, Result Type can additionally be a composite type + // other than a vector." + bool isScalarOrVector = trueBrStoreOp.value() + .getType() + .cast() + .isScalarOrVector(); + // Check that each `spv.Store` uses the same pointer, memory access // attributes and a valid type of the value. if ((trueBrStoreOp.ptr() != falseBrStoreOp.ptr()) || - !isSameAttrList(trueBrStoreOp, falseBrStoreOp) || - !isValidType(trueBrStoreOp.value().getType())) { + !isSameAttrList(trueBrStoreOp, falseBrStoreOp) || !isScalarOrVector) { return failure(); } diff --git a/mlir/lib/Dialect/SPIRV/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/SPIRVDialect.cpp index 50ecf9ef7cbda..8ed417cad58d8 100644 --- a/mlir/lib/Dialect/SPIRV/SPIRVDialect.cpp +++ b/mlir/lib/Dialect/SPIRV/SPIRVDialect.cpp @@ -61,7 +61,7 @@ struct SPIRVInlinerInterface : public DialectInlinerInterface { BlockAndValueMapping &) const final { // Return true here when inlining into spv.func, spv.selection, and // spv.loop operations. 
- auto op = dest->getParentOp(); + auto *op = dest->getParentOp(); return isa(op) || isa(op) || isa(op); } @@ -152,42 +152,6 @@ template <> Optional parseAndVerify(SPIRVDialect const &dialect, DialectAsmParser &parser); -static bool isValidSPIRVIntType(IntegerType type) { - return llvm::is_contained(ArrayRef({1, 8, 16, 32, 64}), - type.getWidth()); -} - -bool SPIRVDialect::isValidScalarType(Type type) { - if (type.isa()) { - return !type.isBF16(); - } - if (auto intType = type.dyn_cast()) { - return isValidSPIRVIntType(intType); - } - return false; -} - -static bool isValidSPIRVVectorType(VectorType type) { - return type.getRank() == 1 && - SPIRVDialect::isValidScalarType(type.getElementType()) && - type.getNumElements() >= 2 && type.getNumElements() <= 4; -} - -bool SPIRVDialect::isValidType(Type type) { - // Allow SPIR-V dialect types - if (type.getKind() >= Type::FIRST_SPIRV_TYPE && - type.getKind() <= TypeKind::LAST_SPIRV_TYPE) { - return true; - } - if (SPIRVDialect::isValidScalarType(type)) { - return true; - } - if (auto vectorType = type.dyn_cast()) { - return isValidSPIRVVectorType(vectorType); - } - return false; -} - static Type parseAndVerifyType(SPIRVDialect const &dialect, DialectAsmParser &parser) { Type type; @@ -206,7 +170,7 @@ static Type parseAndVerifyType(SPIRVDialect const &dialect, return Type(); } } else if (auto t = type.dyn_cast()) { - if (!isValidSPIRVIntType(t)) { + if (!ScalarType::isValid(t)) { parser.emitError(typeLoc, "only 1/8/16/32/64-bit integer type allowed but found ") << type; @@ -383,7 +347,8 @@ namespace { // parseAndVerify does the actual parsing and verification of individual // elements. This is a functor since parsing the last element of the list // (termination condition) needs partial specialization. -template struct parseCommaSeparatedList { +template +struct ParseCommaSeparatedList { Optional> operator()(SPIRVDialect const &dialect, DialectAsmParser &parser) const { auto parseVal = parseAndVerify(dialect, parser); @@ -393,7 +358,7 @@ template struct parseCommaSeparatedList { auto numArgs = std::tuple_size>::value; if (numArgs != 0 && failed(parser.parseComma())) return llvm::None; - auto remainingValues = parseCommaSeparatedList{}(dialect, parser); + auto remainingValues = ParseCommaSeparatedList{}(dialect, parser); if (!remainingValues) return llvm::None; return std::tuple_cat(std::tuple(parseVal.getValue()), @@ -403,7 +368,8 @@ template struct parseCommaSeparatedList { // Partial specialization of the function to parse a comma separated list of // specs to parse the last element of the list. 
-template struct parseCommaSeparatedList { +template +struct ParseCommaSeparatedList { Optional> operator()(SPIRVDialect const &dialect, DialectAsmParser &parser) const { if (auto value = parseAndVerify(dialect, parser)) @@ -434,7 +400,7 @@ static Type parseImageType(SPIRVDialect const &dialect, return Type(); auto value = - parseCommaSeparatedList{}(dialect, parser); if (!value) @@ -597,10 +563,10 @@ static void print(StructType type, DialectAsmPrinter &os) { if (!decorations.empty()) os << ", "; } - auto each_fn = [&os](spirv::Decoration decoration) { + auto eachFn = [&os](spirv::Decoration decoration) { os << stringifyDecoration(decoration); }; - interleaveComma(decorations, os, each_fn); + interleaveComma(decorations, os, eachFn); os << "]"; } }; @@ -865,39 +831,44 @@ LogicalResult SPIRVDialect::verifyOperationAttribute(Operation *op, return success(); } -// Verifies the given SPIR-V `attribute` attached to a region's argument or -// result and reports error to the given location if invalid. -static LogicalResult -verifyRegionAttribute(Location loc, NamedAttribute attribute, bool forArg) { +/// Verifies the given SPIR-V `attribute` attached to a value of the given +/// `valueType` is valid. +static LogicalResult verifyRegionAttribute(Location loc, Type valueType, + NamedAttribute attribute) { StringRef symbol = attribute.first.strref(); Attribute attr = attribute.second; if (symbol != spirv::getInterfaceVarABIAttrName()) return emitError(loc, "found unsupported '") - << symbol << "' attribute on region " - << (forArg ? "argument" : "result"); + << symbol << "' attribute on region argument"; - if (!attr.isa()) + auto varABIAttr = attr.dyn_cast(); + if (!varABIAttr) return emitError(loc, "'") << symbol - << "' attribute must be a dictionary attribute containing three " - "32-bit integer attributes: 'descriptor_set', 'binding', and " - "'storage_class'"; + << "' attribute must be a dictionary attribute containing two or " + "three 32-bit integer attributes: 'descriptor_set', 'binding', " + "and optional 'storage_class'"; + if (varABIAttr.storage_class() && !valueType.isIntOrIndexOrFloat()) + return emitError(loc, "'") << symbol + << "' attribute cannot specify storage class " + "when attaching to a non-scalar value"; return success(); } LogicalResult SPIRVDialect::verifyRegionArgAttribute(Operation *op, - unsigned /*regionIndex*/, - unsigned /*argIndex*/, + unsigned regionIndex, + unsigned argIndex, NamedAttribute attribute) { - return verifyRegionAttribute(op->getLoc(), attribute, - /*forArg=*/true); + return verifyRegionAttribute( + op->getLoc(), + op->getRegion(regionIndex).front().getArgument(argIndex).getType(), + attribute); } LogicalResult SPIRVDialect::verifyRegionResultAttribute( Operation *op, unsigned /*regionIndex*/, unsigned /*resultIndex*/, NamedAttribute attribute) { - return verifyRegionAttribute(op->getLoc(), attribute, - /*forArg=*/false); + return op->emitError("cannot attach SPIR-V attributes to region result"); } diff --git a/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp b/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp index 4adabdaa597ea..3fd987b0e5657 100644 --- a/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp +++ b/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp @@ -1,4 +1,4 @@ -//===- SPIRVLowering.cpp - Standard to SPIR-V dialect conversion--===// +//===- SPIRVLowering.cpp - SPIR-V lowering utilities ----------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -15,6 +15,7 @@ #include "mlir/Dialect/SPIRV/SPIRVDialect.h" #include "mlir/Dialect/SPIRV/SPIRVOps.h" #include "llvm/ADT/Sequence.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Support/Debug.h" #include @@ -23,6 +24,64 @@ using namespace mlir; +//===----------------------------------------------------------------------===// +// Utility functions +//===----------------------------------------------------------------------===// + +/// Checks that `candidates` extension requirements are possible to be satisfied +/// with the given `targetEnv`. +/// +/// `candidates` is a vector of vector for extension requirements following +/// ((Extension::A OR Extension::B) AND (Extension::C OR Extension::D)) +/// convention. +template +static LogicalResult checkExtensionRequirements( + LabelT label, const spirv::TargetEnv &targetEnv, + const spirv::SPIRVType::ExtensionArrayRefVector &candidates) { + for (const auto &ors : candidates) { + if (targetEnv.allows(ors)) + continue; + + SmallVector extStrings; + for (spirv::Extension ext : ors) + extStrings.push_back(spirv::stringifyExtension(ext)); + + LLVM_DEBUG(llvm::dbgs() + << label << " illegal: requires at least one extension in [" + << llvm::join(extStrings, ", ") + << "] but none allowed in target environment\n"); + return failure(); + } + return success(); + } + +/// Checks that `candidates` capability requirements are possible to be satisfied +/// with the given `targetEnv`. +/// +/// `candidates` is a vector of vector for capability requirements following +/// ((Capability::A OR Capability::B) AND (Capability::C OR Capability::D)) +/// convention. +template +static LogicalResult checkCapabilityRequirements( + LabelT label, const spirv::TargetEnv &targetEnv, + const spirv::SPIRVType::CapabilityArrayRefVector &candidates) { + for (const auto &ors : candidates) { + if (targetEnv.allows(ors)) + continue; + + SmallVector capStrings; + for (spirv::Capability cap : ors) + capStrings.push_back(spirv::stringifyCapability(cap)); + + LLVM_DEBUG(llvm::dbgs() + << label << " illegal: requires at least one capability in [" + << llvm::join(capStrings, ", ") + << "] but none allowed in target environment\n"); + return failure(); + } + return success(); +} + //===----------------------------------------------------------------------===// // Type Conversion //===----------------------------------------------------------------------===// @@ -38,10 +97,67 @@ Type SPIRVTypeConverter::getIndexType(MLIRContext *context) { return IntegerType::get(32, context); } +/// Mapping from SPIR-V storage classes to memref memory spaces. +/// +/// Note: memref does not have a defined semantics for each memory space; it +/// depends on the context where it is used. There are no particular reasons +/// behind the number assignments; we try to follow NVVM conventions and largely +/// give common storage classes a smaller number. The hope is to use symbolic +/// memory space representation eventually after memref supports it. +// TODO(antiagainst): swap Generic and StorageBuffer assignment to be more akin +// to NVVM.
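The extension and capability requirement lists above follow an AND-of-ORs convention: every inner list must have at least one member enabled in the target environment. A minimal standalone sketch of that check, using a plain enum and std::set instead of the SPIR-V dialect types (names here are illustrative only, not part of the patch):

#include <algorithm>
#include <set>
#include <vector>

enum class Ext { A, B, C, D };

// True if at least one member of the OR-clause is enabled.
static bool allowsAnyOf(const std::set<Ext> &allowed,
                        const std::vector<Ext> &ors) {
  return std::any_of(ors.begin(), ors.end(),
                     [&](Ext e) { return allowed.count(e) != 0; });
}

// Mirrors checkExtensionRequirements/checkCapabilityRequirements: the whole
// requirement is satisfied only if every OR-clause is satisfied.
static bool checkRequirements(const std::set<Ext> &allowed,
                              const std::vector<std::vector<Ext>> &candidates) {
  for (const auto &ors : candidates)
    if (!allowsAnyOf(allowed, ors))
      return false;
  return true;
}

int main() {
  std::set<Ext> targetEnv = {Ext::A, Ext::C};
  bool ok = checkRequirements(targetEnv, {{Ext::A, Ext::B}, {Ext::C, Ext::D}}); // (A OR B) AND (C OR D)
  bool bad = checkRequirements(targetEnv, {{Ext::B}, {Ext::D}});                // B AND D
  return ok && !bad ? 0 : 1;
}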
+#define STORAGE_SPACE_MAP_LIST(MAP_FN) \ + MAP_FN(spirv::StorageClass::Generic, 1) \ + MAP_FN(spirv::StorageClass::StorageBuffer, 0) \ + MAP_FN(spirv::StorageClass::Workgroup, 3) \ + MAP_FN(spirv::StorageClass::Uniform, 4) \ + MAP_FN(spirv::StorageClass::Private, 5) \ + MAP_FN(spirv::StorageClass::Function, 6) \ + MAP_FN(spirv::StorageClass::PushConstant, 7) \ + MAP_FN(spirv::StorageClass::UniformConstant, 8) \ + MAP_FN(spirv::StorageClass::Input, 9) \ + MAP_FN(spirv::StorageClass::Output, 10) \ + MAP_FN(spirv::StorageClass::CrossWorkgroup, 11) \ + MAP_FN(spirv::StorageClass::AtomicCounter, 12) \ + MAP_FN(spirv::StorageClass::Image, 13) \ + MAP_FN(spirv::StorageClass::CallableDataNV, 14) \ + MAP_FN(spirv::StorageClass::IncomingCallableDataNV, 15) \ + MAP_FN(spirv::StorageClass::RayPayloadNV, 16) \ + MAP_FN(spirv::StorageClass::HitAttributeNV, 17) \ + MAP_FN(spirv::StorageClass::IncomingRayPayloadNV, 18) \ + MAP_FN(spirv::StorageClass::ShaderRecordBufferNV, 19) \ + MAP_FN(spirv::StorageClass::PhysicalStorageBuffer, 20) + +unsigned +SPIRVTypeConverter::getMemorySpaceForStorageClass(spirv::StorageClass storage) { +#define STORAGE_SPACE_MAP_FN(storage, space) \ + case storage: \ + return space; + + switch (storage) { STORAGE_SPACE_MAP_LIST(STORAGE_SPACE_MAP_FN) } +#undef STORAGE_SPACE_MAP_FN +} + +Optional +SPIRVTypeConverter::getStorageClassForMemorySpace(unsigned space) { +#define STORAGE_SPACE_MAP_FN(storage, space) \ + case space: \ + return storage; + + switch (space) { + STORAGE_SPACE_MAP_LIST(STORAGE_SPACE_MAP_FN) + default: + return llvm::None; + } +#undef STORAGE_SPACE_MAP_FN +} + +#undef STORAGE_SPACE_MAP_LIST + // TODO(ravishankarm): This is a utility function that should probably be // exposed by the SPIR-V dialect. Keeping it local till the use case arises. static Optional getTypeNumBytes(Type t) { - if (spirv::SPIRVDialect::isValidScalarType(t)) { + if (t.isa()) { auto bitWidth = t.getIntOrFloatBitWidth(); // According to the SPIR-V spec: // "There is no physical size or bit pattern defined for values with boolean @@ -101,68 +217,212 @@ static Optional getTypeNumBytes(Type t) { return llvm::None; } -SPIRVTypeConverter::SPIRVTypeConverter() { - addConversion([](Type type) -> Optional { - // If the type is already valid in SPIR-V, directly return. - return spirv::SPIRVDialect::isValidType(type) ? type : Optional(); - }); +/// Converts a scalar `type` to a suitable type under the given `targetEnv`. +static Optional +convertScalarType(const spirv::TargetEnv &targetEnv, spirv::ScalarType type, + Optional storageClass = {}) { + // Get extension and capability requirements for the given type. + SmallVector, 1> extensions; + SmallVector, 2> capabilities; + type.getExtensions(extensions, storageClass); + type.getCapabilities(capabilities, storageClass); + + // If all requirements are met, then we can accept this type as-is. + if (succeeded(checkCapabilityRequirements(type, targetEnv, capabilities)) && + succeeded(checkExtensionRequirements(type, targetEnv, extensions))) + return type; + + // Otherwise we need to adjust the type, which really means adjusting the + // bitwidth given this is a scalar type. + // TODO(antiagainst): We are unconditionally converting the bitwidth here; + // this might be okay for non-interface types (i.e., types used in + // Private/Function storage classes), but not for interface types (i.e., + // types used in StorageBuffer/Uniform/PushConstant/etc. storage classes). + // This is because the latter actually affects the ABI contract with the + // runtime.
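The storage-class/memory-space mapping above is written as an X-macro so the forward and reverse lookups are generated from a single list and cannot drift apart. A self-contained sketch of the same technique with a trimmed-down, illustrative enum (the real mapping and numbering are the ones in the patch):

#include <optional>

enum class StorageClass { StorageBuffer, Generic, Workgroup };

#define SPACE_MAP_LIST(MAP_FN)                                                 \
  MAP_FN(StorageClass::StorageBuffer, 0)                                       \
  MAP_FN(StorageClass::Generic, 1)                                             \
  MAP_FN(StorageClass::Workgroup, 3)

// Forward lookup: storage class -> memory space.
unsigned getMemorySpace(StorageClass sc) {
#define MAP_FN(storage, space)                                                 \
  case storage:                                                                \
    return space;
  switch (sc) { SPACE_MAP_LIST(MAP_FN) }
#undef MAP_FN
  return ~0u; // Unreachable when the list covers every enumerator.
}

// Reverse lookup: memory space -> storage class, empty for unknown spaces.
std::optional<StorageClass> getStorageClass(unsigned space) {
#define MAP_FN(storage, value)                                                 \
  case value:                                                                  \
    return storage;
  switch (space) {
    SPACE_MAP_LIST(MAP_FN)
  default:
    return std::nullopt;
  }
#undef MAP_FN
}

// Usage: getMemorySpace(StorageClass::StorageBuffer) == 0 and getStorageClass(0)
// yields StorageClass::StorageBuffer, both generated from the same list.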
So we may want to expose a control on SPIRVTypeConverter to fail + conversion if we cannot change the bitwidth there. + + if (auto floatType = type.dyn_cast()) { + LLVM_DEBUG(llvm::dbgs() << type << " converted to 32-bit for SPIR-V\n"); + return Builder(targetEnv.getContext()).getF32Type(); + } + + auto intType = type.cast(); + LLVM_DEBUG(llvm::dbgs() << type << " converted to 32-bit for SPIR-V\n"); + return IntegerType::get(/*width=*/32, intType.getSignedness(), + targetEnv.getContext()); +} + +/// Converts a vector `type` to a suitable type under the given `targetEnv`. +static Optional +convertVectorType(const spirv::TargetEnv &targetEnv, VectorType type, + Optional storageClass = {}) { + if (!spirv::CompositeType::isValid(type)) { + // TODO(antiagainst): One-element vector types can be translated into scalar + // types. Vector types with more than four elements can be translated into + // array types. + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: 1- and > 4-element unimplemented\n"); + return llvm::None; + } + + // Get extension and capability requirements for the given type. + SmallVector, 1> extensions; + SmallVector, 2> capabilities; + type.cast().getExtensions(extensions, storageClass); + type.cast().getCapabilities(capabilities, storageClass); + + // If all requirements are met, then we can accept this type as-is. + if (succeeded(checkCapabilityRequirements(type, targetEnv, capabilities)) && + succeeded(checkExtensionRequirements(type, targetEnv, extensions))) + return type; + + auto elementType = convertScalarType( + targetEnv, type.getElementType().cast(), storageClass); + if (elementType) + return VectorType::get(type.getShape(), *elementType); + return llvm::None; +} + +/// Converts a tensor `type` to a suitable type under the given `targetEnv`. +/// +/// Note that this is mainly for lowering constant tensors. In SPIR-V one can +/// create composite constants with OpConstantComposite to embed relatively large +/// constant values and use OpCompositeExtract and OpCompositeInsert to +/// manipulate them, like what we do for vectors. +static Optional convertTensorType(const spirv::TargetEnv &targetEnv, + TensorType type) { + // TODO(ravishankarm) : Handle dynamic shapes. + if (!type.hasStaticShape()) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: dynamic shape unimplemented\n"); + return llvm::None; + } + + auto scalarType = type.getElementType().dyn_cast(); + if (!scalarType) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: cannot convert non-scalar element type\n"); + return llvm::None; + } + + Optional scalarSize = getTypeNumBytes(scalarType); + Optional tensorSize = getTypeNumBytes(type); + if (!scalarSize || !tensorSize) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: cannot deduce element count\n"); + return llvm::None; + } + + auto arrayElemCount = *tensorSize / *scalarSize; + auto arrayElemType = convertScalarType(targetEnv, scalarType); + if (!arrayElemType) + return llvm::None; + Optional arrayElemSize = getTypeNumBytes(*arrayElemType); + if (!arrayElemSize) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: cannot deduce converted element size\n"); + return llvm::None; + } + + return spirv::ArrayType::get(*arrayElemType, arrayElemCount, *arrayElemSize); +} + +static Optional convertMemrefType(const spirv::TargetEnv &targetEnv, + MemRefType type) { + // TODO(ravishankarm) : Handle dynamic shapes.
+ if (!type.hasStaticShape()) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: dynamic shape unimplemented\n"); + return llvm::None; + } + + auto scalarType = type.getElementType().dyn_cast(); + if (!scalarType) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: cannot convert non-scalar element type\n"); + return llvm::None; + } + + Optional scalarSize = getTypeNumBytes(scalarType); + Optional memrefSize = getTypeNumBytes(type); + if (!scalarSize || !memrefSize) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: cannot deduce element count\n"); + return llvm::None; + } + + auto arrayElemCount = *memrefSize / *scalarSize; + + auto storageClass = + SPIRVTypeConverter::getStorageClassForMemorySpace(type.getMemorySpace()); + if (!storageClass) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: cannot convert memory space\n"); + return llvm::None; + } + + auto arrayElemType = convertScalarType(targetEnv, scalarType, storageClass); + if (!arrayElemType) + return llvm::None; + Optional arrayElemSize = getTypeNumBytes(*arrayElemType); + if (!arrayElemSize) { + LLVM_DEBUG(llvm::dbgs() + << type << " illegal: cannot deduce converted element size\n"); + return llvm::None; + } + + auto arrayType = + spirv::ArrayType::get(*arrayElemType, arrayElemCount, *arrayElemSize); + + // Wrap in a struct to satisfy Vulkan interface requirements. + auto structType = spirv::StructType::get(arrayType, 0); + return spirv::PointerType::get(structType, *storageClass); +} + +SPIRVTypeConverter::SPIRVTypeConverter(spirv::TargetEnvAttr targetAttr) + : targetEnv(targetAttr) { + // Add conversions. The order matters here: later ones will be tried earlier. + + // All other cases failed. Then we cannot convert this type. + addConversion([](Type type) { return llvm::None; }); + + // Allow all SPIR-V dialect specific types. This assumes all standard types + // adopted in the SPIR-V dialect (i.e., IntegerType, FloatType, VectorType) + // were tried before. + // + // TODO(antiagainst): this assumes that the SPIR-V types are valid to use in + // the given target environment, which should be the case if the whole + // pipeline is driven by the same target environment. Still, we probably still + // want to validate and convert to be safe. + addConversion([](spirv::SPIRVType type) { return type; }); + addConversion([](IndexType indexType) { return SPIRVTypeConverter::getIndexType(indexType.getContext()); }); - addConversion([this](MemRefType memRefType) -> Type { - // TODO(ravishankarm): For now only support default memory space. The memory - // space description is not set is stone within MLIR, i.e. it depends on the - // context it is being used. To map this to SPIR-V storage classes, we - // should rely on the ABI attributes, and not on the memory space. This is - // still evolving, and needs to be revisited when there is more clarity. - if (memRefType.getMemorySpace()) - return Type(); - - auto elementType = convertType(memRefType.getElementType()); - if (!elementType) - return Type(); - - auto elementSize = getTypeNumBytes(elementType); - if (!elementSize) - return Type(); - - // TODO(ravishankarm) : Handle dynamic shapes. - if (memRefType.hasStaticShape()) { - auto arraySize = getTypeNumBytes(memRefType); - if (!arraySize) - return Type(); - - auto arrayType = spirv::ArrayType::get( - elementType, arraySize.getValue() / elementSize.getValue(), - elementSize.getValue()); - auto structType = spirv::StructType::get(arrayType, 0); - // For now initialize the storage class to StorageBuffer. 
This will be - // updated later based on whats passed in w.r.t to the ABI attributes. - return spirv::PointerType::get(structType, - spirv::StorageClass::StorageBuffer); - } - return Type(); + + addConversion([this](IntegerType intType) -> Optional { + if (auto scalarType = intType.dyn_cast()) + return convertScalarType(targetEnv, scalarType); + return llvm::None; + }); + + addConversion([this](FloatType floatType) -> Optional { + if (auto scalarType = floatType.dyn_cast()) + return convertScalarType(targetEnv, scalarType); + return llvm::None; + }); + + addConversion([this](VectorType vectorType) { + return convertVectorType(targetEnv, vectorType); + }); + + addConversion([this](TensorType tensorType) { + return convertTensorType(targetEnv, tensorType); }); - addConversion([this](TensorType tensorType) -> Type { - // TODO(ravishankarm) : Handle dynamic shapes. - if (!tensorType.hasStaticShape()) - return Type(); - - auto elementType = convertType(tensorType.getElementType()); - if (!elementType) - return Type(); - - auto elementSize = getTypeNumBytes(elementType); - if (!elementSize) - return Type(); - - auto tensorSize = getTypeNumBytes(tensorType); - if (!tensorSize) - return Type(); - - return spirv::ArrayType::get(elementType, - tensorSize.getValue() / elementSize.getValue(), - elementSize.getValue()); + + addConversion([this](MemRefType memRefType) { + return convertMemrefType(targetEnv, memRefType); }); } @@ -360,11 +620,10 @@ mlir::spirv::setABIAttrs(spirv::FuncOp funcOp, //===----------------------------------------------------------------------===// std::unique_ptr -spirv::SPIRVConversionTarget::get(spirv::TargetEnvAttr targetEnv, - MLIRContext *context) { +spirv::SPIRVConversionTarget::get(spirv::TargetEnvAttr targetAttr) { std::unique_ptr target( // std::make_unique does not work here because the constructor is private. - new SPIRVConversionTarget(targetEnv, context)); + new SPIRVConversionTarget(targetAttr)); SPIRVConversionTarget *targetPtr = target.get(); target->addDynamicallyLegalDialect( Optional( @@ -375,30 +634,15 @@ spirv::SPIRVConversionTarget::get(spirv::TargetEnvAttr targetEnv, } spirv::SPIRVConversionTarget::SPIRVConversionTarget( - spirv::TargetEnvAttr targetEnv, MLIRContext *context) - : ConversionTarget(*context), givenVersion(targetEnv.getVersion()) { - for (spirv::Extension ext : targetEnv.getExtensions()) - givenExtensions.insert(ext); - - // Add extensions implied by the current version. - for (spirv::Extension ext : spirv::getImpliedExtensions(givenVersion)) - givenExtensions.insert(ext); - - for (spirv::Capability cap : targetEnv.getCapabilities()) { - givenCapabilities.insert(cap); - - // Add capabilities implied by the current capability. - for (spirv::Capability c : spirv::getRecursiveImpliedCapabilities(cap)) - givenCapabilities.insert(c); - } -} + spirv::TargetEnvAttr targetAttr) + : ConversionTarget(*targetAttr.getContext()), targetEnv(targetAttr) {} bool spirv::SPIRVConversionTarget::isLegalOp(Operation *op) { // Make sure this op is available at the given version. Ops not implementing // QueryMinVersionInterface/QueryMaxVersionInterface are available to all // SPIR-V versions. 
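The constructor above registers a catch-all failure first and the specific conversions afterwards, relying on the documented ordering that later-registered conversions are tried first. A toy dispatcher (an editor's illustration of that last-registered-first-tried ordering only, not MLIR's TypeConverter; an empty optional here simply means "not handled"):

#include <functional>
#include <optional>
#include <string>
#include <vector>

struct ToyConverter {
  using Fn = std::function<std::optional<std::string>(int)>;
  std::vector<Fn> conversions;

  void addConversion(Fn fn) { conversions.push_back(std::move(fn)); }

  // Walk the registered conversions starting from the most recently added one.
  std::optional<std::string> convert(int type) {
    for (auto it = conversions.rbegin(), e = conversions.rend(); it != e; ++it)
      if (auto result = (*it)(type))
        return result;
    return std::nullopt;
  }
};

int main() {
  ToyConverter converter;
  // Registered first, so tried last: the fallback that rejects everything.
  converter.addConversion([](int) { return std::optional<std::string>(); });
  // Registered later, so tried first: a specific rule.
  converter.addConversion([](int type) -> std::optional<std::string> {
    if (type == 32)
      return std::string("i32");
    return std::nullopt;
  });
  return converter.convert(32).has_value() ? 0 : 1;
}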
if (auto minVersion = dyn_cast(op)) - if (minVersion.getMinVersion() > givenVersion) { + if (minVersion.getMinVersion() > this->targetEnv.getVersion()) { LLVM_DEBUG(llvm::dbgs() << op->getName() << " illegal: requiring min version " << spirv::stringifyVersion(minVersion.getMinVersion()) @@ -406,7 +650,7 @@ bool spirv::SPIRVConversionTarget::isLegalOp(Operation *op) { return false; } if (auto maxVersion = dyn_cast(op)) - if (maxVersion.getMaxVersion() < givenVersion) { + if (maxVersion.getMaxVersion() < this->targetEnv.getVersion()) { LLVM_DEBUG(llvm::dbgs() << op->getName() << " illegal: requiring max version " << spirv::stringifyVersion(maxVersion.getMaxVersion()) @@ -414,38 +658,47 @@ bool spirv::SPIRVConversionTarget::isLegalOp(Operation *op) { return false; } - // Make sure this op's required extensions are allowed to use. For each op, - // we return a vector of vector for its extension requirements following - // ((Extension::A OR Extension::B) AND (Extension::C OR Extension::D)) - // convention. Ops not implementing QueryExtensionInterface do not require - // extensions to be available. - if (auto extensions = dyn_cast(op)) { - auto exts = extensions.getExtensions(); - for (const auto &ors : exts) - if (llvm::all_of(ors, [this](spirv::Extension ext) { - return this->givenExtensions.count(ext) == 0; - })) { - LLVM_DEBUG(llvm::dbgs() << op->getName() - << " illegal: missing required extension\n"); - return false; - } - } + // Make sure this op's required extensions are allowed to use. Ops not + // implementing QueryExtensionInterface do not require extensions to be + // available. + if (auto extensions = dyn_cast(op)) + if (failed(checkExtensionRequirements(op->getName(), this->targetEnv, + extensions.getExtensions()))) + return false; - // Make sure this op's required extensions are allowed to use. For each op, - // we return a vector of vector for its capability requirements following - // ((Capability::A OR Extension::B) AND (Capability::C OR Capability::D)) - // convention. Ops not implementing QueryExtensionInterface do not require - // extensions to be available. - if (auto capabilities = dyn_cast(op)) { - auto caps = capabilities.getCapabilities(); - for (const auto &ors : caps) - if (llvm::all_of(ors, [this](spirv::Capability cap) { - return this->givenCapabilities.count(cap) == 0; - })) { - LLVM_DEBUG(llvm::dbgs() << op->getName() - << " illegal: missing required capability\n"); - return false; - } + // Make sure this op's required capabilities are allowed to use. Ops not + // implementing QueryCapabilityInterface do not require capabilities to be + // available. + if (auto capabilities = dyn_cast(op)) + if (failed(checkCapabilityRequirements(op->getName(), this->targetEnv, + capabilities.getCapabilities()))) + return false; + + SmallVector valueTypes; + valueTypes.append(op->operand_type_begin(), op->operand_type_end()); + valueTypes.append(op->result_type_begin(), op->result_type_end()); + + // Special treatment for global variables, whose type requirements are + // conveyed by type attributes. + if (auto globalVar = dyn_cast(op)) + valueTypes.push_back(globalVar.type()); + + // Make sure the op's operands/results use types that are allowed by the + // target environment.
+ SmallVector, 4> typeExtensions; + SmallVector, 8> typeCapabilities; + for (Type valueType : valueTypes) { + typeExtensions.clear(); + valueType.cast().getExtensions(typeExtensions); + if (failed(checkExtensionRequirements(op->getName(), this->targetEnv, + typeExtensions))) + return false; + + typeCapabilities.clear(); + valueType.cast().getCapabilities(typeCapabilities); + if (failed(checkCapabilityRequirements(op->getName(), this->targetEnv, + typeCapabilities))) + return false; } return true; diff --git a/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp b/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp index 377242482b2a7..f6b862156c49e 100644 --- a/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp +++ b/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp @@ -1373,7 +1373,7 @@ static LogicalResult verify(spirv::ConstantOp constOp) { bool spirv::ConstantOp::isBuildableWith(Type type) { // Must be valid SPIR-V type first. - if (!SPIRVDialect::isValidType(type)) + if (!type.isa()) return false; if (type.getKind() >= Type::FIRST_SPIRV_TYPE && @@ -2460,7 +2460,7 @@ static LogicalResult verify(spirv::SpecConstantOp constOp) { case StandardAttributes::Integer: case StandardAttributes::Float: { // Make sure bitwidth is allowed. - if (!spirv::SPIRVDialect::isValidType(value.getType())) + if (!value.getType().isa()) return constOp.emitOpError("default value bitwidth disallowed"); return success(); } diff --git a/mlir/lib/Dialect/SPIRV/SPIRVTypes.cpp b/mlir/lib/Dialect/SPIRV/SPIRVTypes.cpp index 92dc5b82bb8af..3f963bd1d8a87 100644 --- a/mlir/lib/Dialect/SPIRV/SPIRVTypes.cpp +++ b/mlir/lib/Dialect/SPIRV/SPIRVTypes.cpp @@ -163,13 +163,19 @@ bool CompositeType::classof(Type type) { case TypeKind::Array: case TypeKind::RuntimeArray: case TypeKind::Struct: - case StandardTypes::Vector: return true; + case StandardTypes::Vector: + return isValid(type.cast()); default: return false; } } +bool CompositeType::isValid(VectorType type) { + return type.getRank() == 1 && type.getElementType().isa() && + type.getNumElements() >= 2 && type.getNumElements() <= 4; +} + Type CompositeType::getElementType(unsigned index) const { switch (getKind()) { case spirv::TypeKind::Array: @@ -560,7 +566,30 @@ void RuntimeArrayType::getCapabilities( // ScalarType //===----------------------------------------------------------------------===// -bool ScalarType::classof(Type type) { return type.isIntOrFloat(); } +bool ScalarType::classof(Type type) { + if (auto floatType = type.dyn_cast()) { + return isValid(floatType); + } + if (auto intType = type.dyn_cast()) { + return isValid(intType); + } + return false; +} + +bool ScalarType::isValid(FloatType type) { return !type.isBF16(); } + +bool ScalarType::isValid(IntegerType type) { + switch (type.getWidth()) { + case 1: + case 8: + case 16: + case 32: + case 64: + return true; + default: + return false; + } +} void ScalarType::getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, Optional storage) { @@ -678,9 +707,19 @@ void ScalarType::getCapabilities( //===----------------------------------------------------------------------===// bool SPIRVType::classof(Type type) { - return type.isa() || type.isa() || - (type.getKind() >= Type::FIRST_SPIRV_TYPE && - type.getKind() <= TypeKind::LAST_SPIRV_TYPE); + // Allow SPIR-V dialect types + if (type.getKind() >= Type::FIRST_SPIRV_TYPE && + type.getKind() <= TypeKind::LAST_SPIRV_TYPE) + return true; + if (type.isa()) + return true; + if (auto vectorType = type.dyn_cast()) + return CompositeType::isValid(vectorType); + return false; +} + +bool SPIRVType::isScalarOrVector() { + return 
isIntOrFloat() || isa(); } void SPIRVType::getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, diff --git a/mlir/lib/Dialect/SPIRV/TargetAndABI.cpp b/mlir/lib/Dialect/SPIRV/TargetAndABI.cpp index 88f3037ccc1e8..491fcf9a6f21b 100644 --- a/mlir/lib/Dialect/SPIRV/TargetAndABI.cpp +++ b/mlir/lib/Dialect/SPIRV/TargetAndABI.cpp @@ -15,19 +15,85 @@ using namespace mlir; +//===----------------------------------------------------------------------===// +// TargetEnv +//===----------------------------------------------------------------------===// + +spirv::TargetEnv::TargetEnv(spirv::TargetEnvAttr targetAttr) + : targetAttr(targetAttr) { + for (spirv::Extension ext : targetAttr.getExtensions()) + givenExtensions.insert(ext); + + // Add extensions implied by the current version. + for (spirv::Extension ext : + spirv::getImpliedExtensions(targetAttr.getVersion())) + givenExtensions.insert(ext); + + for (spirv::Capability cap : targetAttr.getCapabilities()) { + givenCapabilities.insert(cap); + + // Add capabilities implied by the current capability. + for (spirv::Capability c : spirv::getRecursiveImpliedCapabilities(cap)) + givenCapabilities.insert(c); + } +} + +spirv::Version spirv::TargetEnv::getVersion() { + return targetAttr.getVersion(); +} + +bool spirv::TargetEnv::allows(spirv::Capability capability) const { + return givenCapabilities.count(capability); +} + +Optional +spirv::TargetEnv::allows(ArrayRef caps) const { + auto chosen = llvm::find_if(caps, [this](spirv::Capability cap) { + return givenCapabilities.count(cap); + }); + if (chosen != caps.end()) + return *chosen; + return llvm::None; +} + +bool spirv::TargetEnv::allows(spirv::Extension extension) const { + return givenExtensions.count(extension); +} + +Optional +spirv::TargetEnv::allows(ArrayRef exts) const { + auto chosen = llvm::find_if(exts, [this](spirv::Extension ext) { + return givenExtensions.count(ext); + }); + if (chosen != exts.end()) + return *chosen; + return llvm::None; +} + +MLIRContext *spirv::TargetEnv::getContext() const { + return targetAttr.getContext(); +} + +//===----------------------------------------------------------------------===// +// Utility functions +//===----------------------------------------------------------------------===// + StringRef spirv::getInterfaceVarABIAttrName() { return "spv.interface_var_abi"; } spirv::InterfaceVarABIAttr spirv::getInterfaceVarABIAttr(unsigned descriptorSet, unsigned binding, - spirv::StorageClass storageClass, + Optional storageClass, MLIRContext *context) { Type i32Type = IntegerType::get(32, context); + auto scAttr = + storageClass + ? IntegerAttr::get(i32Type, static_cast(*storageClass)) + : IntegerAttr(); return spirv::InterfaceVarABIAttr::get( IntegerAttr::get(i32Type, descriptorSet), - IntegerAttr::get(i32Type, binding), - IntegerAttr::get(i32Type, static_cast(storageClass)), context); + IntegerAttr::get(i32Type, binding), scAttr, context); } StringRef spirv::getEntryPointABIAttrName() { return "spv.entry_point_abi"; } diff --git a/mlir/lib/Dialect/SPIRV/Transforms/LowerABIAttributesPass.cpp b/mlir/lib/Dialect/SPIRV/Transforms/LowerABIAttributesPass.cpp index 4dbc54ecfca2e..1ca9cad977af0 100644 --- a/mlir/lib/Dialect/SPIRV/Transforms/LowerABIAttributesPass.cpp +++ b/mlir/lib/Dialect/SPIRV/Transforms/LowerABIAttributesPass.cpp @@ -21,33 +21,27 @@ using namespace mlir; -/// Checks if the `type` is a scalar or vector type. It is assumed that they are -/// valid for SPIR-V dialect already. 
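The TargetEnv wrapper above precomputes the allowed extension and capability sets (including implied ones) and, for an OR-clause, returns the first candidate it allows, which lets callers such as the VCE deduction record exactly which extension or capability satisfied the clause. A standalone analogue of that first-allowed query (plain enum and std::set, illustrative names only):

#include <algorithm>
#include <optional>
#include <set>
#include <vector>

enum class Capability { Shader, Kernel, Int64 };

static std::optional<Capability>
firstAllowed(const std::set<Capability> &given,
             const std::vector<Capability> &candidates) {
  auto chosen = std::find_if(candidates.begin(), candidates.end(),
                             [&](Capability c) { return given.count(c) != 0; });
  if (chosen != candidates.end())
    return *chosen;
  return std::nullopt;
}

int main() {
  std::set<Capability> targetEnv = {Capability::Shader};
  // Kernel is not allowed, so the first *allowed* candidate, Shader, is chosen.
  auto cap = firstAllowed(targetEnv, {Capability::Kernel, Capability::Shader});
  return (cap && *cap == Capability::Shader) ? 0 : 1;
}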
-static bool isScalarOrVectorType(Type type) { - return spirv::SPIRVDialect::isValidScalarType(type) || type.isa(); -} - /// Creates a global variable for an argument based on the ABI info. static spirv::GlobalVariableOp -createGlobalVariableForArg(spirv::FuncOp funcOp, OpBuilder &builder, - unsigned argNum, - spirv::InterfaceVarABIAttr abiInfo) { +createGlobalVarForEntryPointArgument(OpBuilder &builder, spirv::FuncOp funcOp, + unsigned argIndex, + spirv::InterfaceVarABIAttr abiInfo) { auto spirvModule = funcOp.getParentOfType(); - if (!spirvModule) { + if (!spirvModule) return nullptr; - } + OpBuilder::InsertionGuard moduleInsertionGuard(builder); builder.setInsertionPoint(funcOp.getOperation()); std::string varName = - funcOp.getName().str() + "_arg_" + std::to_string(argNum); + funcOp.getName().str() + "_arg_" + std::to_string(argIndex); // Get the type of variable. If this is a scalar/vector type and has an ABI - // info create a variable of type !spv.ptr>. If not + // info create a variable of type !spv.ptr>. If not // it must already be a !spv.ptr>. - auto varType = funcOp.getType().getInput(argNum); - auto storageClass = - static_cast(abiInfo.storage_class().getInt()); - if (isScalarOrVectorType(varType)) { + auto varType = funcOp.getType().getInput(argIndex); + if (varType.cast().isScalarOrVector()) { + auto storageClass = + static_cast(abiInfo.storage_class().getInt()); varType = spirv::PointerType::get(spirv::StructType::get(varType), storageClass); } @@ -84,9 +78,18 @@ getInterfaceVariables(spirv::FuncOp funcOp, funcOp.walk([&](spirv::AddressOfOp addressOfOp) { auto var = module.lookupSymbol(addressOfOp.variable()); - if (var.type().cast().getStorageClass() != - spirv::StorageClass::StorageBuffer) { + // TODO(antiagainst): Per SPIR-V spec: "Before version 1.4, the interface’s + // storage classes are limited to the Input and Output storage classes. + // Starting with version 1.4, the interface’s storage classes are all + // storage classes used in declaring all global variables referenced by the + // entry point’s call tree." We should consider the target environment here. + switch (var.type().cast().getStorageClass()) { + case spirv::StorageClass::Input: + case spirv::StorageClass::Output: interfaceVarSet.insert(var.getOperation()); + break; + default: + break; } }); for (auto &var : interfaceVarSet) { @@ -173,11 +176,10 @@ LogicalResult ProcessInterfaceVarABI::matchAndRewrite( // produce an error. return failure(); } - auto var = - createGlobalVariableForArg(funcOp, rewriter, argType.index(), abiInfo); - if (!var) { + spirv::GlobalVariableOp var = createGlobalVarForEntryPointArgument( + rewriter, funcOp, argType.index(), abiInfo); + if (!var) return failure(); - } OpBuilder::InsertionGuard funcInsertionGuard(rewriter); rewriter.setInsertionPointToStart(&funcOp.front()); @@ -190,7 +192,7 @@ LogicalResult ProcessInterfaceVarABI::matchAndRewrite( // at the start of the function. It is probably better to do the load just // before the use. There might be multiple loads and currently there is no // easy way to replace all uses with a sequence of operations. 
- if (isScalarOrVectorType(argType.value())) { + if (argType.value().cast().isScalarOrVector()) { auto indexType = SPIRVTypeConverter::getIndexType(funcOp.getContext()); auto zero = spirv::ConstantOp::getZero(indexType, funcOp.getLoc(), &rewriter); @@ -216,7 +218,9 @@ void LowerABIAttributesPass::runOnOperation() { spirv::ModuleOp module = getOperation(); MLIRContext *context = &getContext(); - SPIRVTypeConverter typeConverter; + spirv::TargetEnv targetEnv(spirv::lookupTargetEnv(module)); + + SPIRVTypeConverter typeConverter(targetEnv); OwningRewritePatternList patterns; patterns.insert(context, typeConverter); diff --git a/mlir/lib/Dialect/SPIRV/Transforms/UpdateVCEPass.cpp b/mlir/lib/Dialect/SPIRV/Transforms/UpdateVCEPass.cpp index fff15c1857490..201adbbd38374 100644 --- a/mlir/lib/Dialect/SPIRV/Transforms/UpdateVCEPass.cpp +++ b/mlir/lib/Dialect/SPIRV/Transforms/UpdateVCEPass.cpp @@ -34,22 +34,18 @@ class UpdateVCEPass final } // namespace /// Checks that `candidates` extension requirements are possible to be satisfied -/// with the given `allowedExtensions` and updates `deducedExtensions` if so. -/// Emits errors attaching to the given `op` on failures. +/// with the given `targetEnv` and updates `deducedExtensions` if so. Emits +/// errors attaching to the given `op` on failures. /// /// `candidates` is a vector of vector for extension requirements following /// ((Extension::A OR Extension::B) AND (Extension::C OR Extension::D)) /// convention. static LogicalResult checkAndUpdateExtensionRequirements( - Operation *op, const llvm::SmallSet &allowedExtensions, + Operation *op, const spirv::TargetEnv &targetEnv, const spirv::SPIRVType::ExtensionArrayRefVector &candidates, llvm::SetVector &deducedExtensions) { for (const auto &ors : candidates) { - auto chosen = llvm::find_if(ors, [&](spirv::Extension ext) { - return allowedExtensions.count(ext); - }); - - if (chosen != ors.end()) { + if (Optional chosen = targetEnv.allows(ors)) { deducedExtensions.insert(*chosen); } else { SmallVector extStrings; @@ -66,23 +62,18 @@ static LogicalResult checkAndUpdateExtensionRequirements( } /// Checks that `candidates`capability requirements are possible to be satisfied -/// with the given `allowedCapabilities` and updates `deducedCapabilities` if -/// so. Emits errors attaching to the given `op` on failures. +/// with the given `targetEnv` and updates `deducedCapabilities` if so. Emits +/// errors attaching to the given `op` on failures. /// /// `candidates` is a vector of vector for capability requirements following /// ((Capability::A OR Capability::B) AND (Capability::C OR Capability::D)) /// convention. 
static LogicalResult checkAndUpdateCapabilityRequirements( - Operation *op, - const llvm::SmallSet &allowedCapabilities, + Operation *op, const spirv::TargetEnv &targetEnv, const spirv::SPIRVType::CapabilityArrayRefVector &candidates, llvm::SetVector &deducedCapabilities) { for (const auto &ors : candidates) { - auto chosen = llvm::find_if(ors, [&](spirv::Capability cap) { - return allowedCapabilities.count(cap); - }); - - if (chosen != ors.end()) { + if (Optional chosen = targetEnv.allows(ors)) { deducedCapabilities.insert(*chosen); } else { SmallVector capStrings; @@ -101,32 +92,14 @@ static LogicalResult checkAndUpdateCapabilityRequirements( void UpdateVCEPass::runOnOperation() { spirv::ModuleOp module = getOperation(); - spirv::TargetEnvAttr targetEnv = spirv::lookupTargetEnv(module); - if (!targetEnv) { + spirv::TargetEnvAttr targetAttr = spirv::lookupTargetEnv(module); + if (!targetAttr) { module.emitError("missing 'spv.target_env' attribute"); return signalPassFailure(); } - spirv::Version allowedVersion = targetEnv.getVersion(); - - // Build a set for available extensions in the target environment. - llvm::SmallSet allowedExtensions; - for (spirv::Extension ext : targetEnv.getExtensions()) - allowedExtensions.insert(ext); - - // Add extensions implied by the current version. - for (spirv::Extension ext : spirv::getImpliedExtensions(allowedVersion)) - allowedExtensions.insert(ext); - - // Build a set for available capabilities in the target environment. - llvm::SmallSet allowedCapabilities; - for (spirv::Capability cap : targetEnv.getCapabilities()) { - allowedCapabilities.insert(cap); - - // Add capabilities implied by the current capability. - for (spirv::Capability c : spirv::getRecursiveImpliedCapabilities(cap)) - allowedCapabilities.insert(c); - } + spirv::TargetEnv targetEnv(targetAttr); + spirv::Version allowedVersion = targetAttr.getVersion(); spirv::Version deducedVersion = spirv::Version::V_1_0; llvm::SetVector deducedExtensions; @@ -148,15 +121,14 @@ void UpdateVCEPass::runOnOperation() { // Op extension requirements if (auto extensions = dyn_cast(op)) - if (failed(checkAndUpdateExtensionRequirements(op, allowedExtensions, - extensions.getExtensions(), - deducedExtensions))) + if (failed(checkAndUpdateExtensionRequirements( + op, targetEnv, extensions.getExtensions(), deducedExtensions))) return WalkResult::interrupt(); // Op capability requirements if (auto capabilities = dyn_cast(op)) if (failed(checkAndUpdateCapabilityRequirements( - op, allowedCapabilities, capabilities.getCapabilities(), + op, targetEnv, capabilities.getCapabilities(), deducedCapabilities))) return WalkResult::interrupt(); @@ -176,13 +148,13 @@ void UpdateVCEPass::runOnOperation() { typeExtensions.clear(); valueType.cast().getExtensions(typeExtensions); if (failed(checkAndUpdateExtensionRequirements( - op, allowedExtensions, typeExtensions, deducedExtensions))) + op, targetEnv, typeExtensions, deducedExtensions))) return WalkResult::interrupt(); typeCapabilities.clear(); valueType.cast().getCapabilities(typeCapabilities); if (failed(checkAndUpdateCapabilityRequirements( - op, allowedCapabilities, typeCapabilities, deducedCapabilities))) + op, targetEnv, typeCapabilities, deducedCapabilities))) return WalkResult::interrupt(); } diff --git a/mlir/lib/Dialect/Vector/CMakeLists.txt b/mlir/lib/Dialect/Vector/CMakeLists.txt index 3e1d8de0d3ba4..e5e1251768cd4 100644 --- a/mlir/lib/Dialect/Vector/CMakeLists.txt +++ b/mlir/lib/Dialect/Vector/CMakeLists.txt @@ -16,7 +16,7 @@ target_link_libraries(MLIRVector 
MLIREDSC MLIRIR MLIRStandardOps - MLIRAffineOps + MLIRAffine MLIRLoopOps MLIRLoopAnalysis MLIRSideEffects diff --git a/mlir/lib/Dialect/Vector/VectorOps.cpp b/mlir/lib/Dialect/Vector/VectorOps.cpp index 342ce37ad5157..816aaf9f59482 100644 --- a/mlir/lib/Dialect/Vector/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/VectorOps.cpp @@ -1483,6 +1483,10 @@ static void print(OpAsmPrinter &p, TypeCastOp op) { } static LogicalResult verify(TypeCastOp op) { + MemRefType canonicalType = canonicalizeStridedLayout(op.getMemRefType()); + if (!canonicalType.getAffineMaps().empty()) + return op.emitOpError("expects operand to be a memref with no layout"); + auto resultType = inferVectorTypeCastResultType(op.getMemRefType()); if (op.getResultMemRefType() != resultType) return op.emitOpError("expects result type to be: ") << resultType; @@ -1516,6 +1520,35 @@ static void print(OpAsmPrinter &p, TupleOp op) { static LogicalResult verify(TupleOp op) { return success(); } +//===----------------------------------------------------------------------===// +// TransposeOp +//===----------------------------------------------------------------------===// + +static LogicalResult verify(TransposeOp op) { + VectorType vectorType = op.getVectorType(); + VectorType resultType = op.getResultType(); + int64_t rank = resultType.getRank(); + if (vectorType.getRank() != rank) + return op.emitOpError("vector result rank mismatch: ") << rank; + // Verify transposition array. + auto transpAttr = op.transp().getValue(); + int64_t size = transpAttr.size(); + if (rank != size) + return op.emitOpError("transposition length mismatch: ") << size; + SmallVector seen(rank, false); + for (auto ta : llvm::enumerate(transpAttr)) { + int64_t i = ta.value().cast().getInt(); + if (i < 0 || i >= rank) + return op.emitOpError("transposition index out of range: ") << i; + if (seen[i]) + return op.emitOpError("duplicate position index: ") << i; + seen[i] = true; + if (resultType.getDimSize(ta.index()) != vectorType.getDimSize(i)) + return op.emitOpError("dimension size mismatch at: ") << i; + } + return success(); +} + //===----------------------------------------------------------------------===// // TupleGetOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Vector/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransforms.cpp index dd47e0c80dc1f..ef3484d31a3cc 100644 --- a/mlir/lib/Dialect/Vector/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/VectorTransforms.cpp @@ -12,7 +12,7 @@ #include -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/Utils/StructuredOpsUtils.h" #include "mlir/Dialect/Vector/VectorOps.h" @@ -864,6 +864,67 @@ class InsertSlicesOpLowering : public OpRewritePattern { } }; +/// Progressive lowering of TransposeOp. +/// One: +/// %x = vector.transpose %y, [1, 0] +/// is replaced by: +/// %z = constant dense<0.000000e+00> +/// %0 = vector.extract %y[0, 0] +/// %1 = vector.insert %0, %z [0, 0] +/// .. +/// %x = vector.insert .., .. [.., ..] +class TransposeOpLowering : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::TransposeOp op, + PatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + VectorType resType = op.getResultType(); + Type eltType = resType.getElementType(); + + // Set up convenience transposition table.
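The vector.transpose verifier above requires 'transp' to be a permutation of [0, rank) and each result dimension i to match input dimension transp[i]. A standalone analogue over plain shapes, with a worked example (a 2x3 vector transposed by [1, 0] gives a 3x2 vector):

#include <cstdint>
#include <vector>

static bool isValidTranspose(const std::vector<int64_t> &inputShape,
                             const std::vector<int64_t> &resultShape,
                             const std::vector<int64_t> &transp) {
  int64_t rank = static_cast<int64_t>(resultShape.size());
  if (static_cast<int64_t>(inputShape.size()) != rank ||
      static_cast<int64_t>(transp.size()) != rank)
    return false; // rank or transposition length mismatch
  std::vector<bool> seen(rank, false);
  for (int64_t i = 0; i < rank; ++i) {
    int64_t p = transp[i];
    if (p < 0 || p >= rank || seen[p])
      return false; // index out of range or duplicated
    seen[p] = true;
    if (resultShape[i] != inputShape[p])
      return false; // dimension size mismatch
  }
  return true;
}

int main() {
  bool ok = isValidTranspose({2, 3}, {3, 2}, {1, 0});
  bool bad = isValidTranspose({2, 3}, {3, 2}, {0, 0}); // duplicate index
  return ok && !bad ? 0 : 1;
}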
+ SmallVector transp; + for (auto attr : op.transp()) + transp.push_back(attr.cast().getInt()); + + // Generate fully unrolled extract/insert ops. + Value zero = rewriter.create(loc, eltType, + rewriter.getZeroAttr(eltType)); + Value result = rewriter.create(loc, resType, zero); + SmallVector lhs(transp.size(), 0); + SmallVector rhs(transp.size(), 0); + rewriter.replaceOp(op, expandIndices(loc, resType, 0, transp, lhs, rhs, + op.vector(), result, rewriter)); + return success(); + } + +private: + // Builds the indices arrays for the lhs and rhs. Generates the extract/insert + // operation when all ranks are exhausted. + Value expandIndices(Location loc, VectorType resType, int64_t pos, + SmallVector &transp, + SmallVector &lhs, + SmallVector &rhs, Value input, Value result, + PatternRewriter &rewriter) const { + if (pos >= resType.getRank()) { + auto ridx = rewriter.getI64ArrayAttr(rhs); + auto lidx = rewriter.getI64ArrayAttr(lhs); + Type eltType = resType.getElementType(); + Value e = rewriter.create(loc, eltType, input, ridx); + return rewriter.create(loc, resType, e, result, lidx); + } + for (int64_t d = 0, e = resType.getDimSize(pos); d < e; ++d) { + lhs[pos] = d; + rhs[transp[pos]] = d; + result = expandIndices(loc, resType, pos + 1, transp, lhs, rhs, input, + result, rewriter); + } + return result; + } +}; + /// Progressive lowering of OuterProductOp. /// One: /// %x = vector.outerproduct %lhs, %rhs, %acc @@ -1353,7 +1414,7 @@ void mlir::vector::populateVectorContractLoweringPatterns( OwningRewritePatternList &patterns, MLIRContext *context, VectorTransformsOptions parameters) { patterns.insert( - context); + ShapeCastOp2DUpCastRewritePattern, TransposeOpLowering, + OuterProductOpLowering>(context); patterns.insert(parameters, context); } diff --git a/mlir/lib/Dialect/Vector/VectorUtils.cpp b/mlir/lib/Dialect/Vector/VectorUtils.cpp index 1cace25b9835e..f929dddd6d8d6 100644 --- a/mlir/lib/Dialect/Vector/VectorUtils.cpp +++ b/mlir/lib/Dialect/Vector/VectorUtils.cpp @@ -12,7 +12,7 @@ #include "mlir/Dialect/Vector/VectorUtils.h" #include "mlir/Analysis/LoopAnalysis.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/Vector/VectorOps.h" #include "mlir/IR/Builders.h" diff --git a/mlir/lib/Dialect/VectorOps/VectorTransforms.cpp b/mlir/lib/Dialect/VectorOps/VectorTransforms.cpp new file mode 100644 index 0000000000000..e853c76d0dba3 --- /dev/null +++ b/mlir/lib/Dialect/VectorOps/VectorTransforms.cpp @@ -0,0 +1,1349 @@ +//===- VectorTransforms.cpp - Conversion within the Vector dialect --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements target-independent rewrites as 1->N patterns.
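expandIndices above recursively enumerates every position of the result vector; once all ranks are exhausted it extracts the source element at 'rhs' and inserts it at 'lhs'. A standalone version that only collects the (source, destination) index pairs produced for a given transposition:

#include <cstdint>
#include <utility>
#include <vector>

using Index = std::vector<int64_t>;

static void expand(const std::vector<int64_t> &resultShape,
                   const std::vector<int64_t> &transp, int64_t pos, Index &lhs,
                   Index &rhs, std::vector<std::pair<Index, Index>> &pairs) {
  if (pos >= static_cast<int64_t>(resultShape.size())) {
    pairs.emplace_back(rhs, lhs); // extract at rhs, insert at lhs
    return;
  }
  for (int64_t d = 0; d < resultShape[pos]; ++d) {
    lhs[pos] = d;
    rhs[transp[pos]] = d;
    expand(resultShape, transp, pos + 1, lhs, rhs, pairs);
  }
}

int main() {
  // Transposing a 2x3 vector with [1, 0]: each destination index [r, c] of the
  // 3x2 result is read from source index [c, r], six pairs in total.
  std::vector<int64_t> resultShape = {3, 2}, transp = {1, 0};
  Index lhs(2, 0), rhs(2, 0);
  std::vector<std::pair<Index, Index>> pairs;
  expand(resultShape, transp, 0, lhs, rhs, pairs);
  return pairs.size() == 6 ? 0 : 1;
}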
+// +//===----------------------------------------------------------------------===// + +#include + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Dialect/Utils/StructuredOpsUtils.h" +#include "mlir/Dialect/VectorOps/VectorOps.h" +#include "mlir/Dialect/VectorOps/VectorTransforms.h" +#include "mlir/Dialect/VectorOps/VectorUtils.h" +#include "mlir/IR/AffineExpr.h" +#include "mlir/IR/AffineMap.h" +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/Function.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/Matchers.h" +#include "mlir/IR/Module.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Types.h" +#include "mlir/Support/Functional.h" +#include "mlir/Support/STLExtras.h" + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#define DEBUG_TYPE "vector-to-vector" + +using namespace mlir; +using llvm::dbgs; +using mlir::functional::zipMap; + +static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); + +static llvm::cl::opt lowerToLLVMMatrixIntrinsics( + "vector-lower-matrix-intrinsics", + llvm::cl::desc("Lower vector.contract to llvm.intr.matrix.multiply"), + llvm::cl::init(false), llvm::cl::cat(clOptionsCategory)); + +/// Given a shape with sizes greater than 0 along all dimensions, +/// returns the distance, in number of elements, between a slice in a dimension +/// and the next slice in the same dimension. +/// e.g. shape[3, 4, 5] -> linearization_basis[20, 5, 1] +static SmallVector computeStrides(ArrayRef shape) { + if (shape.empty()) + return {}; + SmallVector tmp; + tmp.reserve(shape.size()); + int64_t running = 1; + for (auto size : llvm::reverse(shape)) { + assert(size > 0 && "size must be nonnegative"); + tmp.push_back(running); + running *= size; + } + return SmallVector(tmp.rbegin(), tmp.rend()); +} + +static int64_t computeMaxLinearIndex(ArrayRef basis) { + if (basis.empty()) + return 0; + int64_t res = 1; + for (auto b : basis) + res *= b; + return res; +} + +/// Computes and returns the linearized index of 'offsets' w.r.t. 'basis'. +static int64_t linearize(ArrayRef offsets, ArrayRef basis) { + assert(offsets.size() == basis.size()); + int64_t linearIndex = 0; + for (unsigned idx = 0, e = basis.size(); idx < e; ++idx) + linearIndex += offsets[idx] * basis[idx]; + return linearIndex; +} + +// Clones `op` into a new operations that takes `operands` and returns +// `resultTypes`. +static Operation *cloneOpWithOperandsAndTypes(PatternRewriter &builder, + Location loc, Operation *op, + ArrayRef operands, + ArrayRef resultTypes) { + OperationState res(loc, op->getName().getStringRef(), operands, resultTypes, + op->getAttrs()); + return builder.createOperation(res); +} + +// Populates 'resultElements[indexMap[i]]' with elements from 'inputElements[i]' +// for each index 'i' in inputElements with a valid mapping in 'indexMap'. +static void getMappedElements(const DenseMap &indexMap, + ArrayRef inputElements, + SmallVectorImpl &resultElements) { + assert(indexMap.size() == resultElements.size()); + assert(inputElements.size() >= resultElements.size()); + for (unsigned i = 0, e = inputElements.size(); i < e; ++i) { + auto it = indexMap.find(i); + if (it != indexMap.end()) + resultElements[it->second] = inputElements[i]; + } +} + +// Returns a tuple type with vector element types for each resulting slice +// of 'vectorType' unrolled by 'sizes' and 'strides'. 
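computeStrides and linearize above implement plain row-major linearization. A self-contained version with the worked example from the comment (shape [3, 4, 5] gives strides [20, 5, 1], so offsets [1, 2, 3] linearize to 1*20 + 2*5 + 3*1 = 33):

#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<int64_t> computeStrides(const std::vector<int64_t> &shape) {
  std::vector<int64_t> strides(shape.size());
  int64_t running = 1;
  for (int64_t i = static_cast<int64_t>(shape.size()) - 1; i >= 0; --i) {
    strides[i] = running; // innermost dimension has stride 1
    running *= shape[i];
  }
  return strides;
}

static int64_t linearize(const std::vector<int64_t> &offsets,
                         const std::vector<int64_t> &strides) {
  assert(offsets.size() == strides.size());
  int64_t linear = 0;
  for (size_t i = 0; i < strides.size(); ++i)
    linear += offsets[i] * strides[i];
  return linear;
}

int main() {
  auto strides = computeStrides({3, 4, 5}); // [20, 5, 1]
  return linearize({1, 2, 3}, strides) == 33 ? 0 : 1;
}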
+// TODO(andydavis) Move this to a utility function and share it with +// Extract/InsertSlicesOp verification. +static TupleType generateExtractSlicesOpResultType(VectorType vectorType, + ArrayRef sizes, + ArrayRef strides, + PatternRewriter &builder) { + assert(llvm::all_of(strides, [](int64_t s) { return s == 1; })); + assert(static_cast(sizes.size()) == vectorType.getRank()); + assert(static_cast(strides.size()) == vectorType.getRank()); + + // Compute shape ratio of 'shape' and 'sizes'. + auto shape = vectorType.getShape(); + auto maybeDimSliceCounts = shapeRatio(shape, sizes); + assert(maybeDimSliceCounts.hasValue()); + auto sliceDimCounts = *maybeDimSliceCounts; + + // Compute strides w.r.t number of slices in each dimension. + auto sliceStrides = computeStrides(sliceDimCounts); + int64_t sliceCount = computeMaxLinearIndex(sliceDimCounts); + SmallVector vectorTypes(sliceCount); + for (unsigned i = 0; i < sliceCount; ++i) { + auto vectorOffsets = delinearize(sliceStrides, i); + auto elementOffsets = + computeElementOffsetsFromVectorSliceOffsets(sizes, vectorOffsets); + auto sliceSizes = computeSliceSizes(shape, sizes, elementOffsets); + // Create Vector type and add to 'vectorTypes[i]'. + vectorTypes[i] = VectorType::get(sliceSizes, vectorType.getElementType()); + } + return TupleType::get(vectorTypes, builder.getContext()); +} + +// UnrolledVectorState aggregates per-operand/result vector state required for +// unrolling. +struct UnrolledVectorState { + SmallVector unrolledShape; + SmallVector unrollFactors; + SmallVector basis; + int64_t numInstances; + Value slicesTuple; +}; + +// Populates 'state' with unrolled shape, unroll factors, basis and +// num unrolled instances for 'vectorType'. +static void initUnrolledVectorState(VectorType vectorType, Value initValue, + const DenseMap &indexMap, + ArrayRef targetShape, + UnrolledVectorState &state, + PatternRewriter &builder) { + // Compute unrolled shape of 'vectorType'. + state.unrolledShape.resize(vectorType.getRank()); + getMappedElements(indexMap, targetShape, state.unrolledShape); + // Compute unroll factors for unrolled shape. + auto maybeUnrollFactors = + shapeRatio(vectorType.getShape(), state.unrolledShape); + assert(maybeUnrollFactors.hasValue()); + state.unrollFactors = *maybeUnrollFactors; + // Compute 'basis' and 'numInstances' based on 'state.unrollFactors'. + state.basis = computeStrides(state.unrollFactors); + state.numInstances = computeMaxLinearIndex(state.unrollFactors); + state.slicesTuple = nullptr; + if (initValue != nullptr) { + // Create ExtractSlicesOp. + SmallVector sizes(state.unrolledShape); + SmallVector strides(state.unrollFactors.size(), 1); + auto tupleType = + generateExtractSlicesOpResultType(vectorType, sizes, strides, builder); + state.slicesTuple = builder.create( + initValue.getLoc(), tupleType, initValue, sizes, strides); + } +} + +// Computes and returns the linear index of the unrolled vector at +// 'vectorOffsets' within the vector represented by 'state'. +static int64_t +getUnrolledVectorLinearIndex(UnrolledVectorState &state, + ArrayRef vectorOffsets, + DenseMap &indexMap) { + // Compute vector offsets. + SmallVector sliceOffsets(state.unrolledShape.size()); + getMappedElements(indexMap, vectorOffsets, sliceOffsets); + // Compute and return linear index of 'sliceOffsets' w.r.t 'state.basis'. + return linearize(sliceOffsets, state.basis); +} + +// Returns an unrolled vector at 'vectorOffsets' within the vector +// represented by 'state'. 
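The slice bookkeeping above divides the vector shape by the per-slice sizes to get the number of slices along each dimension, and scales a slice's multi-dimensional index by those sizes to get its element offsets. A small standalone sketch of that arithmetic (assuming the sizes divide the shape evenly, as the asserts above require):

#include <cstdint>
#include <vector>

// Number of slices per dimension, e.g. shape [4, 6] with sizes [2, 3] -> [2, 2].
static std::vector<int64_t> sliceCounts(const std::vector<int64_t> &shape,
                                        const std::vector<int64_t> &sizes) {
  std::vector<int64_t> counts(shape.size());
  for (size_t i = 0; i < shape.size(); ++i)
    counts[i] = shape[i] / sizes[i];
  return counts;
}

// Element offsets of the slice at multi-index 'vectorOffsets',
// e.g. vectorOffsets [1, 0] with sizes [2, 3] -> element offsets [2, 0].
static std::vector<int64_t>
elementOffsets(const std::vector<int64_t> &vectorOffsets,
               const std::vector<int64_t> &sizes) {
  std::vector<int64_t> offsets(sizes.size());
  for (size_t i = 0; i < sizes.size(); ++i)
    offsets[i] = vectorOffsets[i] * sizes[i];
  return offsets;
}

int main() {
  // A 4x6 vector unrolled into 2x3 slices yields 2*2 = 4 slices; the slice at
  // index [1, 0] starts at element [2, 0].
  auto counts = sliceCounts({4, 6}, {2, 3});
  auto offsets = elementOffsets({1, 0}, {2, 3});
  return (counts[0] * counts[1] == 4 && offsets[0] == 2 && offsets[1] == 0) ? 0 : 1;
}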
The vector is created from a slice of 'initValue' +// if not present in 'cache'. +static Value getOrCreateUnrolledVectorSlice( + Location loc, UnrolledVectorState &state, ArrayRef vectorOffsets, + ArrayRef offsets, DenseMap &indexMap, + Value initValue, SmallVectorImpl &cache, PatternRewriter &builder) { + // Compute slice offsets. + SmallVector sliceOffsets(state.unrolledShape.size()); + getMappedElements(indexMap, offsets, sliceOffsets); + // TODO(b/144845578) Support non-1 strides. + SmallVector sliceStrides(state.unrolledShape.size(), 1); + // Compute linear index of 'sliceOffsets' w.r.t 'state.basis'. + int64_t sliceLinearIndex = + getUnrolledVectorLinearIndex(state, vectorOffsets, indexMap); + assert(sliceLinearIndex < static_cast(cache.size())); + auto valueSlice = cache[sliceLinearIndex]; + if (valueSlice == nullptr) { + // Return tuple element at 'sliceLinearIndex'. + auto tupleIndex = builder.getI64IntegerAttr(sliceLinearIndex); + auto initValueType = initValue.getType().cast(); + auto vectorType = + VectorType::get(state.unrolledShape, initValueType.getElementType()); + // Initialize 'cache' with slice from 'initValue'. + valueSlice = builder.create( + loc, vectorType, state.slicesTuple, tupleIndex); + // Store value back to 'cache'. + cache[sliceLinearIndex] = valueSlice; + } + return valueSlice; +} + +// VectorState aggregates per-operand/result vector state required for +// creating slices of vector operands, and clones of the operation being +// unrolled. +struct VectorState { + // The type of this vector. + VectorType type; + // Map from iteration space index to vector dimension index. + DenseMap indexMap; + // Index of this value in operation's operand list (-1 if not an operand). + int64_t operandIndex = -1; + // Accumulator iterator flag. + bool isAcc = false; +}; + +// +// unrollSingleResultStructuredOp +// +// Returns a value representing the result of structured operation 'op' +// with iteration bounds 'iterationBounds' unrolled to 'targetShape'. +// A list of VectorState objects must be specified in 'vectors', where +// each VectorState in the list represents a vector operand or vector result +// (if the operation does not have an accumulator operand). +// The VectorState at index 'resultIndex' in the list must be the state +// associated with the operations single result (i.e. either its accumulator +// operand or vector result value). +// +// Example: +// +// // Before unrolling +// +// operand0 operand1 operand2 +// \ | / +// -------------------- opA -------------------- +// +// // After unrolling by 2 +// +// operand0 operand1 operand2 +// / \ / \ / \ +// slice00 slice01 slice10 slice11 slice20 slice21 +// \ | | | / | +// -------------------- opA0 -------------------- | +// | | | | +// \ | | / +// -------------------- opA1 ------------------- +// | | +// \ / +// insertslice +// | + +// TODO(andydavis) Add the following canonicalization/simplifcation patterns: +// *) Add pattern which matches InsertStridedSlice -> StridedSlice and forwards +// InsertStridedSlice operand to StridedSlice. +// *) Add pattern which matches SourceOp -> StridedSlice -> UserOp which checks +// if there are duplicate identical StridedSlice ops from SourceOp, and +// rewrites itself to use the first duplicate. This transformation should +// cause users of identifical StridedSlice ops to reuse the same StridedSlice +// operation, and leave the duplicate StridedSlice ops with no users +// (removable with DCE). 
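For readers unfamiliar with the index bookkeeping used throughout the unrolling described above, here is a minimal standalone sketch of the same arithmetic (plain C++ with only the standard library, using made-up shape values; it is not part of the patch and deliberately avoids the ArrayRef/SmallVector helpers the patch relies on): compute strides over the slice grid, delinearize a linear slice index into per-dimension slice offsets, and scale those by the target shape to obtain element offsets.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Strides over the slice grid: with 2 x 3 slices this yields [3, 1].
static std::vector<int64_t> computeStrides(const std::vector<int64_t> &counts) {
  std::vector<int64_t> strides(counts.size(), 1);
  for (int64_t r = static_cast<int64_t>(counts.size()) - 2; r >= 0; --r)
    strides[r] = strides[r + 1] * counts[r + 1];
  return strides;
}

// Inverse of linearization: recover per-dimension slice offsets from a
// linear slice index.
static std::vector<int64_t> delinearize(const std::vector<int64_t> &strides,
                                        int64_t index) {
  std::vector<int64_t> offsets(strides.size());
  for (size_t r = 0; r < strides.size(); ++r) {
    assert(strides[r] > 0);
    offsets[r] = index / strides[r];
    index %= strides[r];
  }
  return offsets;
}

int main() {
  // Unroll a 4x6 iteration space to 2x2 tiles: 2 slices along dim 0,
  // 3 slices along dim 1, 6 slices total.
  std::vector<int64_t> targetShape = {2, 2};
  std::vector<int64_t> sliceCounts = {2, 3};
  auto sliceStrides = computeStrides(sliceCounts); // [3, 1]
  for (int64_t i = 0; i < 6; ++i) {
    auto vectorOffsets = delinearize(sliceStrides, i);
    // Element offsets are the slice offsets scaled by the target shape.
    std::cout << "slice " << i << " -> vector offsets (" << vectorOffsets[0]
              << ", " << vectorOffsets[1] << "), element offsets ("
              << vectorOffsets[0] * targetShape[0] << ", "
              << vectorOffsets[1] * targetShape[1] << ")\n";
  }
  return 0;
}

This is the scalar analogue of the delinearize / computeElementOffsetsFromVectorSliceOffsets pair used by the unrolling code; the patch's versions are rank-generic and live in VectorUtils.cpp later in this diff.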
+ +// TODO(andydavis) Generalize this to support structured ops beyond +// vector ContractionOp, and merge it with 'unrollSingleResultOpMatchingType' +static Value unrollSingleResultStructuredOp(Operation *op, + ArrayRef iterationBounds, + std::vector &vectors, + unsigned resultIndex, + ArrayRef targetShape, + PatternRewriter &builder) { + auto shapedType = op->getResult(0).getType().dyn_cast_or_null(); + if (!shapedType || !shapedType.hasStaticShape()) + assert(false && "Expected a statically shaped result type"); + + // Compute unroll factors for 'iterationBounds' based on 'targetShape' + auto maybeUnrollFactors = shapeRatio(iterationBounds, targetShape); + if (!maybeUnrollFactors.hasValue()) + assert(false && "Failed to compute unroll factors for target shape"); + auto unrollFactors = *maybeUnrollFactors; + + // Compute unrolled vector state for each vector in 'vectors'. + unsigned numVectors = vectors.size(); + SmallVector unrolledVectorState(numVectors); + for (unsigned i = 0; i < numVectors; ++i) { + int64_t operandIndex = vectors[i].operandIndex; + auto operand = operandIndex >= 0 ? op->getOperand(operandIndex) : nullptr; + initUnrolledVectorState(vectors[i].type, operand, vectors[i].indexMap, + targetShape, unrolledVectorState[i], builder); + } + // Compute number of total unrolled instances. + auto numUnrolledInstances = computeMaxLinearIndex(unrollFactors); + auto sliceStrides = computeStrides(unrollFactors); + + auto &resultValueState = unrolledVectorState[resultIndex]; + auto unrolledResultType = VectorType::get(resultValueState.unrolledShape, + shapedType.getElementType()); + + // Initialize caches for intermediate vector results. + std::vector> caches(numVectors); + for (unsigned i = 0; i < numVectors; ++i) + caches[i].resize(unrolledVectorState[i].numInstances); + + // Unroll 'numUnrolledInstances' of 'op', storing results in 'caches'. + for (unsigned i = 0; i < numUnrolledInstances; ++i) { + auto vectorOffsets = delinearize(sliceStrides, i); + auto elementOffsets = + computeElementOffsetsFromVectorSliceOffsets(targetShape, vectorOffsets); + // Get cached slice (or create slice) for each operand at 'offsets'. + SmallVector operands; + operands.resize(op->getNumOperands()); + for (unsigned i = 0; i < numVectors; ++i) { + int64_t operandIndex = vectors[i].operandIndex; + if (operandIndex < 0) + continue; // Output + auto operand = op->getOperand(operandIndex); + operands[operandIndex] = getOrCreateUnrolledVectorSlice( + op->getLoc(), unrolledVectorState[i], vectorOffsets, elementOffsets, + vectors[i].indexMap, operand, caches[i], builder); + } + // Create op on sliced vector arguments. + auto resultVector = + cloneOpWithOperandsAndTypes(builder, op->getLoc(), op, operands, + unrolledResultType) + ->getResult(0); + + // Compute linear result index. + int64_t linearIndex = getUnrolledVectorLinearIndex( + resultValueState, vectorOffsets, vectors[resultIndex].indexMap); + // Update result cache at 'linearIndex'. + caches[resultIndex][linearIndex] = resultVector; + } + + // Create TupleOp of unrolled result vectors. 
+ SmallVector vectorTupleTypes(resultValueState.numInstances); + SmallVector vectorTupleValues(resultValueState.numInstances); + for (unsigned i = 0; i < resultValueState.numInstances; ++i) { + vectorTupleTypes[i] = caches[resultIndex][i].getType().cast(); + vectorTupleValues[i] = caches[resultIndex][i]; + } + TupleType tupleType = builder.getTupleType(vectorTupleTypes); + Value tupleOp = builder.create(op->getLoc(), tupleType, + vectorTupleValues); + + // Create InsertSlicesOp(Tuple(result_vectors)). + auto resultVectorType = op->getResult(0).getType().cast(); + SmallVector sizes(resultValueState.unrolledShape); + SmallVector strides(resultValueState.unrollFactors.size(), 1); + + Value insertSlicesOp = builder.create( + op->getLoc(), resultVectorType, tupleOp, builder.getI64ArrayAttr(sizes), + builder.getI64ArrayAttr(strides)); + return insertSlicesOp; +} + +static void getVectorContractionOpUnrollState( + vector::ContractionOp contractionOp, ArrayRef targetShape, + SmallVectorImpl &iterationBounds, + std::vector &vectors, unsigned &resultIndex) { + // Get contraction op iteration bounds. + contractionOp.getIterationBounds(iterationBounds); + assert(iterationBounds.size() == targetShape.size()); + // Get map from iteration space index to lhs/rhs/result shape index. + std::vector> iterationIndexMapList; + contractionOp.getIterationIndexMap(iterationIndexMapList); + unsigned numIterators = iterationIndexMapList.size(); + vectors.resize(numIterators); + unsigned accOperandIndex = vector::ContractionOp::getAccOperandIndex(); + for (unsigned i = 0; i < numIterators; ++i) { + vectors[i].type = contractionOp.getOperand(i).getType().cast(); + vectors[i].indexMap = iterationIndexMapList[i]; + vectors[i].operandIndex = i; + vectors[i].isAcc = i == accOperandIndex ? true : false; + } + + if (llvm::size(contractionOp.masks()) == 2) { + // Add vectors for lhs/rhs vector mask arguments. Masks have the + // same vector shape lhs/rhs args, so copy their index maps. + vectors.push_back({contractionOp.getLHSVectorMaskType(), + vectors[0].indexMap, accOperandIndex + 1, false}); + vectors.push_back({contractionOp.getRHSVectorMaskType(), + vectors[1].indexMap, accOperandIndex + 2, false}); + } + // Unroll 'op' 'iterationBounds' to 'targetShape'. + // TODO(andydavis) Use linalg style 'args_in'/'args_out' to partition + // 'vectors' instead of 'resultIndex'. + resultIndex = accOperandIndex; +} + +static void +getVectorElementwiseOpUnrollState(Operation *op, ArrayRef targetShape, + SmallVectorImpl &iterationBounds, + std::vector &vectors, + unsigned &resultIndex) { + // Verify that operation and operands all have the same vector shape. + auto resultType = op->getResult(0).getType().dyn_cast_or_null(); + assert(resultType && "Expected op with vector result type"); + auto resultShape = resultType.getShape(); + // Verify that all operands have the same vector type as result. + assert(llvm::all_of(op->getOperandTypes(), + [=](Type type) { return type == resultType; })); + // Populate 'iterationBounds' with 'resultShape' for elementwise operations. + iterationBounds.assign(resultShape.begin(), resultShape.end()); + + // Create trivial elementwise identity index map based on 'resultShape'. + DenseMap indexMap; + indexMap.reserve(resultShape.size()); + for (unsigned i = 0; i < resultShape.size(); ++i) + indexMap[i] = i; + + // Create VectorState each operand and single result. 
+ unsigned numVectors = op->getNumOperands() + op->getNumResults(); + vectors.resize(numVectors); + for (unsigned i = 0; i < op->getNumOperands(); ++i) + vectors[i] = {resultType, indexMap, i, false}; + vectors[numVectors - 1] = {resultType, indexMap, -1, false}; + resultIndex = numVectors - 1; +} + +// Entry point for unrolling declarative pattern rewrites. +SmallVector mlir::vector::unrollSingleResultOpMatchingType( + PatternRewriter &builder, Operation *op, ArrayRef targetShape) { + assert(op->getNumResults() == 1 && "Expected single result operation"); + + // Populate 'iterationBounds', 'vectors' and 'resultIndex' to unroll 'op'. + SmallVector iterationBounds; + std::vector vectors; + unsigned resultIndex; + + if (auto contractionOp = dyn_cast(op)) { + // Populate state for vector ContractionOp. + getVectorContractionOpUnrollState(contractionOp, targetShape, + iterationBounds, vectors, resultIndex); + } else { + // Populate state for vector elementwise op. + getVectorElementwiseOpUnrollState(op, targetShape, iterationBounds, vectors, + resultIndex); + } + + // Unroll 'op' with 'iterationBounds' to 'targetShape'. + return SmallVector{unrollSingleResultStructuredOp( + op, iterationBounds, vectors, resultIndex, targetShape, builder)}; +} + +/// Generates slices of 'vectorType' according to 'sizes' and 'strides, and +/// calls 'fn' with linear index and indices for each slice. +static void +generateTransferOpSlices(Type memrefElementType, VectorType vectorType, + TupleType tupleType, ArrayRef sizes, + ArrayRef strides, ArrayRef indices, + PatternRewriter &rewriter, + function_ref)> fn) { + // Compute strides w.r.t. to slice counts in each dimension. + auto maybeDimSliceCounts = shapeRatio(vectorType.getShape(), sizes); + assert(maybeDimSliceCounts.hasValue()); + auto sliceDimCounts = *maybeDimSliceCounts; + auto sliceStrides = computeStrides(sliceDimCounts); + + int64_t numSlices = tupleType.size(); + unsigned numSliceIndices = indices.size(); + // Compute 'indexOffset' at which to update 'indices', which is equal + // to the memref rank (indices.size) minus the effective 'vectorRank'. + // The effective 'vectorRank', is equal to the rank of the vector type + // minus the rank of the memref vector element type (if it has one). + // + // For example: + // + // Given memref type 'memref<6x2x1xvector<2x4xf32>>' and vector + // transfer_read/write ops which read/write vectors of type + // 'vector<2x1x2x4xf32>'. The memref rank is 3, and the effective + // vector rank is 4 - 2 = 2, and so 'indexOffset' = 3 - 2 = 1. + // + unsigned vectorRank = vectorType.getRank(); + if (auto memrefVectorElementType = memrefElementType.dyn_cast()) { + assert(vectorRank >= memrefVectorElementType.getRank()); + vectorRank -= memrefVectorElementType.getRank(); + } + unsigned indexOffset = numSliceIndices - vectorRank; + + auto *ctx = rewriter.getContext(); + for (unsigned i = 0; i < numSlices; ++i) { + auto vectorOffsets = delinearize(sliceStrides, i); + auto elementOffsets = + computeElementOffsetsFromVectorSliceOffsets(sizes, vectorOffsets); + // Compute 'sliceIndices' by adding 'sliceOffsets[i]' to 'indices[i]'. 
+ SmallVector sliceIndices(numSliceIndices); + for (unsigned j = 0; j < numSliceIndices; ++j) { + if (j < indexOffset) { + sliceIndices[j] = indices[j]; + } else { + auto expr = getAffineDimExpr(0, ctx) + + getAffineConstantExpr(elementOffsets[j - indexOffset], ctx); + auto map = AffineMap::get(/*dimCount=*/1, /*symbolCount=*/0, expr); + sliceIndices[j] = rewriter.create( + indices[j].getLoc(), map, ArrayRef(indices[j])); + } + } + // Call 'fn' to generate slice 'i' at 'sliceIndices'. + fn(i, sliceIndices); + } +} + +/// Returns true if 'map' is a suffix of an identity affine map, false +/// otherwise. Example: affine_map<(d0, d1, d2, d3) -> (d2, d3)> +static bool isIdentitySuffix(AffineMap map) { + if (map.getNumDims() < map.getNumResults()) + return false; + ArrayRef results = map.getResults(); + Optional lastPos; + for (unsigned i = 0, e = map.getNumResults(); i < e; ++i) { + auto expr = results[i].dyn_cast(); + if (!expr) + return false; + int currPos = static_cast(expr.getPosition()); + if (lastPos.hasValue() && currPos != lastPos.getValue() + 1) + return false; + lastPos = currPos; + } + return true; +} + +namespace { + +// Splits vector TransferReadOp into smaller TransferReadOps based on slicing +// scheme of its unique ExtractSlicesOp user. +struct SplitTransferReadOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(vector::TransferReadOp xferReadOp, + PatternRewriter &rewriter) const override { + // TODO(andydavis, ntv) Support splitting TransferReadOp with non-identity + // permutation maps. Repurpose code from MaterializeVectors transformation. + if (!isIdentitySuffix(xferReadOp.permutation_map())) + return matchFailure(); + // Return unless the unique 'xferReadOp' user is an ExtractSlicesOp. + Value xferReadResult = xferReadOp.getResult(); + auto extractSlicesOp = + dyn_cast(*xferReadResult.getUsers().begin()); + if (!xferReadResult.hasOneUse() || !extractSlicesOp) + return matchFailure(); + + // Get 'sizes' and 'strides' parameters from ExtractSlicesOp user. + auto sourceVectorType = extractSlicesOp.getSourceVectorType(); + auto resultTupleType = extractSlicesOp.getResultTupleType(); + SmallVector sizes; + extractSlicesOp.getSizes(sizes); + SmallVector strides; + extractSlicesOp.getStrides(strides); + assert(llvm::all_of(strides, [](int64_t s) { return s == 1; })); + + Location loc = xferReadOp.getLoc(); + auto memrefElementType = + xferReadOp.memref().getType().cast().getElementType(); + int64_t numSlices = resultTupleType.size(); + SmallVector vectorTupleValues(numSlices); + SmallVector indices(xferReadOp.indices().begin(), + xferReadOp.indices().end()); + auto createSlice = [&](unsigned index, ArrayRef sliceIndices) { + // Get VectorType for slice 'i'. + auto sliceVectorType = resultTupleType.getType(index); + // Create split TransferReadOp for 'sliceUser'. + vectorTupleValues[index] = rewriter.create( + loc, sliceVectorType, xferReadOp.memref(), sliceIndices, + xferReadOp.permutation_map(), xferReadOp.padding()); + }; + generateTransferOpSlices(memrefElementType, sourceVectorType, + resultTupleType, sizes, strides, indices, rewriter, + createSlice); + + // Create tuple of splice xfer read operations. + Value tupleOp = rewriter.create(loc, resultTupleType, + vectorTupleValues); + // Replace 'xferReadOp' with result 'insertSlicesResult'. 
+ rewriter.replaceOpWithNewOp( + xferReadOp, sourceVectorType, tupleOp, extractSlicesOp.sizes(), + extractSlicesOp.strides()); + return matchSuccess(); + } +}; + +// Splits vector TransferWriteOp into smaller TransferWriteOps for each source. +struct SplitTransferWriteOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(vector::TransferWriteOp xferWriteOp, + PatternRewriter &rewriter) const override { + // TODO(andydavis, ntv) Support splitting TransferWriteOp with non-identity + // permutation maps. Repurpose code from MaterializeVectors transformation. + if (!isIdentitySuffix(xferWriteOp.permutation_map())) + return matchFailure(); + // Return unless the 'xferWriteOp' 'vector' operand is an 'InsertSlicesOp'. + auto *vectorDefOp = xferWriteOp.vector().getDefiningOp(); + auto insertSlicesOp = dyn_cast_or_null(vectorDefOp); + if (!insertSlicesOp) + return matchFailure(); + + // Get TupleOp operand of 'insertSlicesOp'. + auto tupleOp = dyn_cast_or_null( + insertSlicesOp.vectors().getDefiningOp()); + if (!tupleOp) + return matchFailure(); + + // Get 'sizes' and 'strides' parameters from InsertSlicesOp user. + auto sourceTupleType = insertSlicesOp.getSourceTupleType(); + auto resultVectorType = insertSlicesOp.getResultVectorType(); + SmallVector sizes; + insertSlicesOp.getSizes(sizes); + SmallVector strides; + insertSlicesOp.getStrides(strides); + + Location loc = xferWriteOp.getLoc(); + auto memrefElementType = + xferWriteOp.memref().getType().cast().getElementType(); + SmallVector indices(xferWriteOp.indices().begin(), + xferWriteOp.indices().end()); + auto createSlice = [&](unsigned index, ArrayRef sliceIndices) { + // Create split TransferWriteOp for source vector 'tupleOp.operand[i]'. + rewriter.create( + loc, tupleOp.getOperand(index), xferWriteOp.memref(), sliceIndices, + xferWriteOp.permutation_map()); + }; + generateTransferOpSlices(memrefElementType, resultVectorType, + sourceTupleType, sizes, strides, indices, rewriter, + createSlice); + + // Erase old 'xferWriteOp'. + rewriter.eraseOp(xferWriteOp); + return matchSuccess(); + } +}; + +/// Decomposes ShapeCastOp on tuple-of-vectors to multiple ShapeCastOps, each +/// on vector types. +struct ShapeCastOpDecomposer : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(vector::ShapeCastOp shapeCastOp, + PatternRewriter &rewriter) const override { + // Check if 'shapeCastOp' has tuple source/result type. + auto sourceTupleType = + shapeCastOp.source().getType().dyn_cast_or_null(); + auto resultTupleType = + shapeCastOp.result().getType().dyn_cast_or_null(); + if (!sourceTupleType || !resultTupleType) + return matchFailure(); + assert(sourceTupleType.size() == resultTupleType.size()); + + // Create single-vector ShapeCastOp for each source tuple element. + Location loc = shapeCastOp.getLoc(); + SmallVector resultElements; + resultElements.reserve(resultTupleType.size()); + for (unsigned i = 0, e = sourceTupleType.size(); i < e; ++i) { + auto sourceElement = rewriter.create( + loc, sourceTupleType.getType(i), shapeCastOp.source(), + rewriter.getI64IntegerAttr(i)); + resultElements.push_back(rewriter.create( + loc, resultTupleType.getType(i), sourceElement)); + } + + // Replace 'shapeCastOp' with tuple of 'resultElements'. + rewriter.replaceOpWithNewOp(shapeCastOp, resultTupleType, + resultElements); + return matchSuccess(); + } +}; + +/// ShapeCastOpFolder folds cancelling ShapeCastOps away. 
+// +// Example: +// +// The following MLIR with cancelling ShapeCastOps: +// +// %0 = source : vector<5x4x2xf32> +// %1 = shape_cast %0 : vector<5x4x2xf32> to vector<20x2xf32> +// %2 = shape_cast %1 : vector<20x2xf32> to vector<5x4x2xf32> +// %3 = user %2 : vector<5x4x2xf32> +// +// Should canonicalize to the following: +// +// %0 = source : vector<5x4x2xf32> +// %1 = user %0 : vector<5x4x2xf32> +// +struct ShapeCastOpFolder : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(vector::ShapeCastOp shapeCastOp, + PatternRewriter &rewriter) const override { + // Check if 'shapeCastOp' has vector source/result type. + auto sourceVectorType = + shapeCastOp.source().getType().dyn_cast_or_null(); + auto resultVectorType = + shapeCastOp.result().getType().dyn_cast_or_null(); + if (!sourceVectorType || !resultVectorType) + return matchFailure(); + + // Check if shape cast op source operand is also a shape cast op. + auto sourceShapeCastOp = dyn_cast_or_null( + shapeCastOp.source().getDefiningOp()); + if (!sourceShapeCastOp) + return matchFailure(); + auto operandSourceVectorType = + sourceShapeCastOp.source().getType().cast(); + auto operandResultVectorType = + sourceShapeCastOp.result().getType().cast(); + + // Check if shape cast operations invert each other. + if (operandSourceVectorType != resultVectorType || + operandResultVectorType != sourceVectorType) + return matchFailure(); + + rewriter.replaceOp(shapeCastOp, sourceShapeCastOp.source()); + return matchSuccess(); + } +}; + +// Patter rewrite which forward tuple elements to their users. +// User(TupleGetOp(ExtractSlicesOp(InsertSlicesOp(TupleOp(Producer))))) +// -> User(Producer) +struct TupleGetFolderOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(vector::TupleGetOp tupleGetOp, + PatternRewriter &rewriter) const override { + // Return if 'tupleGetOp.vectors' arg was not defined by ExtractSlicesOp. + auto extractSlicesOp = dyn_cast_or_null( + tupleGetOp.vectors().getDefiningOp()); + if (!extractSlicesOp) + return matchFailure(); + + // Return if 'extractSlicesOp.vector' arg was not defined by InsertSlicesOp. + auto insertSlicesOp = dyn_cast_or_null( + extractSlicesOp.vector().getDefiningOp()); + if (!insertSlicesOp) + return matchFailure(); + + // Return if 'insertSlicesOp.vectors' arg was not defined by TupleOp. + auto tupleOp = dyn_cast_or_null( + insertSlicesOp.vectors().getDefiningOp()); + if (!tupleOp) + return matchFailure(); + + // Forward Value from 'tupleOp' at 'tupleGetOp.index'. + Value tupleValue = tupleOp.getOperand(tupleGetOp.getIndex()); + rewriter.replaceOp(tupleGetOp, tupleValue); + return matchSuccess(); + } +}; + +/// Progressive lowering of ExtractSlicesOp to tuple of StridedSliceOp. +/// One: +/// %x = vector.extract_slices %0 +/// is replaced by: +/// %a = vector.strided_slice %0 +/// %b = vector.strided_slice %0 +/// .. +/// %x = vector.tuple %a, %b, .. +class ExtractSlicesOpLowering + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(vector::ExtractSlicesOp op, + PatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + VectorType vectorType = op.getSourceVectorType(); + auto shape = vectorType.getShape(); + + SmallVector sizes; + op.getSizes(sizes); + SmallVector strides; + op.getStrides(strides); // all-ones at the moment + + // For each element in the tuple, generate the proper strided slice. 
+ TupleType tupleType = op.getResultTupleType(); + int64_t tupleSize = tupleType.size(); + SmallVector tupleValues(tupleSize); + auto sliceStrides = computeStrides(shape, sizes); + for (int64_t i = 0; i < tupleSize; ++i) { + auto vectorOffsets = delinearize(sliceStrides, i); + auto elementOffsets = + computeElementOffsetsFromVectorSliceOffsets(sizes, vectorOffsets); + auto sliceSizes = computeSliceSizes(shape, sizes, elementOffsets); + // Insert in tuple. + tupleValues[i] = rewriter.create( + loc, op.vector(), elementOffsets, sliceSizes, strides); + } + + rewriter.replaceOpWithNewOp(op, tupleType, tupleValues); + return matchSuccess(); + } +}; + +/// Progressive lowering of InsertSlicesOp to series of InsertStridedSliceOp. +/// One: +/// %x = vector.insert_slices %0 +/// is replaced by: +/// %r0 = vector.splat 0 +// %t1 = vector.tuple_get %0, 0 +/// %r1 = vector.insert_strided_slice %r0, %t1 +// %t2 = vector.tuple_get %0, 1 +/// %r2 = vector.insert_strided_slice %r1, %t2 +/// .. +/// %x = .. +class InsertSlicesOpLowering : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(vector::InsertSlicesOp op, + PatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + VectorType vectorType = op.getResultVectorType(); + auto shape = vectorType.getShape(); + + SmallVector sizes; + op.getSizes(sizes); + SmallVector strides; + op.getStrides(strides); // all-ones at the moment + + // Prepare result. + auto elemType = vectorType.getElementType(); + Value zero = rewriter.create(loc, elemType, + rewriter.getZeroAttr(elemType)); + Value result = rewriter.create(loc, vectorType, zero); + + // For each element in the tuple, extract the proper strided slice. + TupleType tupleType = op.getSourceTupleType(); + int64_t tupleSize = tupleType.size(); + auto sliceStrides = computeStrides(shape, sizes); + for (int64_t i = 0; i < tupleSize; ++i) { + auto vectorOffsets = delinearize(sliceStrides, i); + auto elementOffsets = + computeElementOffsetsFromVectorSliceOffsets(sizes, vectorOffsets); + // Extract from tuple into the result. + auto index = rewriter.getI64IntegerAttr(i); + auto tupleGet = rewriter.create( + loc, tupleType.getType(i), op.getOperand(), index); + result = rewriter.create( + loc, tupleGet, result, elementOffsets, strides); + } + + rewriter.replaceOp(op, result); + return matchSuccess(); + } +}; + +/// Progressive lowering of OuterProductOp. +/// One: +/// %x = vector.outerproduct %lhs, %rhs, %acc +/// is replaced by: +/// %z = zero-result +/// %0 = vector.extract %lhs[0] +/// %1 = vector.broadcast %0 +/// %2 = vector.extract %acc[0] +/// %3 = vector.fma %1, %arg1, %2 +/// %4 = vector.insert %3, %z[0] +/// .. +/// %x = vector.insert %.., %..[N-1] +/// +class OuterProductOpLowering : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(vector::OuterProductOp op, + PatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + + VectorType rhsType = op.getOperandVectorTypeRHS(); + VectorType resType = op.getVectorType(); + Type eltType = resType.getElementType(); + Value acc = (op.acc().empty()) ? 
nullptr : op.acc()[0]; + + Value zero = rewriter.create(loc, eltType, + rewriter.getZeroAttr(eltType)); + Value result = rewriter.create(loc, resType, zero); + for (int64_t d = 0, e = resType.getDimSize(0); d < e; ++d) { + auto pos = rewriter.getI64ArrayAttr(d); + Value x = rewriter.create(loc, eltType, op.lhs(), pos); + Value b = rewriter.create(loc, rhsType, x); + Value m; + if (acc) { + Value z = rewriter.create(loc, rhsType, acc, pos); + m = rewriter.create(loc, b, op.rhs(), z); + } else { + m = rewriter.create(loc, b, op.rhs()); + } + result = rewriter.create(loc, resType, m, result, pos); + } + rewriter.replaceOp(op, result); + return matchSuccess(); + } +}; + +/// Progressive lowering of ContractionOp. +/// One: +/// %x = vector.contract with at least one free/batch dimension +/// is replaced by: +/// %a = vector.contract with one less free/batch dimension +/// %b = vector.contract with one less free/batch dimension +/// .. +/// %x = combine %a %b .. +/// until a pure contraction is reached (no free/batch dimensions), +/// which is replaced by a fma/reduction op. +/// +/// TODO(ajcbik): break down into transpose/reshape/cast ops +/// when they become available to avoid code dup +/// TODO(ajcbik): investigate lowering order impact on performance +class ContractionOpLowering : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(vector::ContractionOp op, + PatternRewriter &rewriter) const override { + // TODO(ajcbik): implement masks + if (llvm::size(op.masks()) != 0) + return matchFailure(); + + // TODO(ntv, ajcbik): implement benefits, cost models, separate this out in + // a new pattern. + // TODO(ntv, fhahn): once row-major mode is available in LLVM's matrix + // intrinsics, use that. + if (lowerToLLVMMatrixIntrinsics && + isColumnMajorMatmul(op.indexing_maps())) { + VectorType lhsType = op.getLhsType(); + VectorType rhsType = op.getRhsType(); + Type flattenedLHSType = + VectorType::get(lhsType.getNumElements(), lhsType.getElementType()); + Type flattenedRHSType = + VectorType::get(rhsType.getNumElements(), rhsType.getElementType()); + auto lhs = rewriter.create( + op.getLoc(), flattenedLHSType, op.lhs()); + auto rhs = rewriter.create( + op.getLoc(), flattenedRHSType, op.rhs()); + + unsigned lhsRows = op.getLhsType().getShape()[0]; + unsigned lhsColumns = op.getLhsType().getShape()[1]; + unsigned rhsColumns = op.getRhsType().getShape()[1]; + Value mul = rewriter.create( + op.getLoc(), lhs, rhs, lhsRows, lhsColumns, rhsColumns); + mul = rewriter.create(op.getLoc(), + op.acc().getType(), mul); + Type elementType = op.getLhsType().getElementType(); + assert(elementType.isIntOrFloat()); + if (elementType.isa()) + rewriter.replaceOpWithNewOp(op, op.acc(), mul); + else + rewriter.replaceOpWithNewOp(op, op.acc(), mul); + return matchSuccess(); + } + + // Find first batch dimension in LHS/RHS, and lower when found. + std::vector> batchDimMap = op.getBatchDimMap(); + if (!batchDimMap.empty()) { + int64_t lhsIndex = batchDimMap[0].first; + int64_t rhsIndex = batchDimMap[0].second; + rewriter.replaceOp(op, lowerParallel(op, lhsIndex, rhsIndex, rewriter)); + return matchSuccess(); + } + + // Collect contracting dimensions. 
+ std::vector> contractingDimMap = + op.getContractingDimMap(); + DenseSet lhsContractingDimSet; + DenseSet rhsContractingDimSet; + for (auto &dimPair : contractingDimMap) { + lhsContractingDimSet.insert(dimPair.first); + rhsContractingDimSet.insert(dimPair.second); + } + + // Find first free dimension in LHS, and lower when found. + VectorType lhsType = op.getLhsType(); + for (int64_t lhsIndex = 0, e = lhsType.getRank(); lhsIndex < e; + ++lhsIndex) { + if (lhsContractingDimSet.count(lhsIndex) == 0) { + rewriter.replaceOp( + op, lowerParallel(op, lhsIndex, /*rhsIndex=*/-1, rewriter)); + return matchSuccess(); + } + } + + // Find first free dimension in RHS, and lower when found. + VectorType rhsType = op.getRhsType(); + for (int64_t rhsIndex = 0, e = rhsType.getRank(); rhsIndex < e; + ++rhsIndex) { + if (rhsContractingDimSet.count(rhsIndex) == 0) { + rewriter.replaceOp( + op, lowerParallel(op, /*lhsIndex=*/-1, rhsIndex, rewriter)); + return matchSuccess(); + } + } + + // Lower the first remaining reduction dimension. + if (!contractingDimMap.empty()) { + rewriter.replaceOp(op, lowerReduction(op, rewriter)); + return matchSuccess(); + } + + return matchFailure(); + } + +private: + // Lower one parallel dimension. + // TODO(ajcbik): consider reusing existing contract unrolling + Value lowerParallel(vector::ContractionOp op, int64_t lhsIndex, + int64_t rhsIndex, PatternRewriter &rewriter) const { + VectorType lhsType = op.getLhsType(); + VectorType rhsType = op.getRhsType(); + VectorType resType = op.getResultType().cast(); + // Find the iterator type index and result index. + SmallVector iMap = op.getIndexingMaps(); + int64_t iterIndex = -1; + int64_t dimSize = -1; + if (lhsIndex >= 0) { + iterIndex = + iMap[0].getResult(lhsIndex).cast().getPosition(); + assert((rhsIndex < 0 || iterIndex == iMap[1] + .getResult(rhsIndex) + .cast() + .getPosition()) && + "parallel index should be free in LHS or batch in LHS/RHS"); + dimSize = lhsType.getDimSize(lhsIndex); + } else { + assert(rhsIndex >= 0 && "missing parallel index"); + iterIndex = + iMap[1].getResult(rhsIndex).cast().getPosition(); + dimSize = rhsType.getDimSize(rhsIndex); + } + assert(iterIndex >= 0 && "parallel index not listed in operand mapping"); + Optional lookup = getResultIndex(iMap[2], iterIndex); + assert(lookup.hasValue() && "parallel index not listed in reduction"); + int64_t resIndex = lookup.getValue(); + // Construct new iterator types and affine map array attribute. + SmallVector lowIndexingMaps; + lowIndexingMaps.push_back(adjustMap(iMap[0], iterIndex, rewriter)); + lowIndexingMaps.push_back(adjustMap(iMap[1], iterIndex, rewriter)); + lowIndexingMaps.push_back(adjustMap(iMap[2], iterIndex, rewriter)); + auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps); + auto lowIter = + rewriter.getArrayAttr(adjustIter(op.iterator_types(), iterIndex)); + // Unroll into a series of lower dimensional vector.contract ops. + Location loc = op.getLoc(); + Value result = zeroVector(loc, resType, rewriter); + for (int64_t d = 0; d < dimSize; ++d) { + auto lhs = reshapeLoad(loc, op.lhs(), lhsType, lhsIndex, d, rewriter); + auto rhs = reshapeLoad(loc, op.rhs(), rhsType, rhsIndex, d, rewriter); + auto acc = reshapeLoad(loc, op.acc(), resType, resIndex, d, rewriter); + Value lowContract = rewriter.create( + loc, lhs, rhs, acc, lowAffine, lowIter); + result = reshapeStore(loc, lowContract, result, resType, resIndex, d, + rewriter); + } + return result; + } + + // Lower one reduction dimension. 
+ Value lowerReduction(vector::ContractionOp op, + PatternRewriter &rewriter) const { + auto loc = op.getLoc(); + VectorType lhsType = op.getLhsType(); + VectorType rhsType = op.getRhsType(); + Type resType = op.getResultType(); + assert(!resType.isa()); + // Use iterator index 0. + int64_t iterIndex = 0; + SmallVector iMap = op.getIndexingMaps(); + Optional lookupLhs = getResultIndex(iMap[0], iterIndex); + Optional lookupRhs = getResultIndex(iMap[1], iterIndex); + assert(lookupLhs.hasValue() && "missing LHS parallel index"); + assert(lookupRhs.hasValue() && "missing RHS parallel index"); + int64_t lhsIndex = lookupLhs.getValue(); + int64_t rhsIndex = lookupRhs.getValue(); + int64_t dimSize = lhsType.getDimSize(lhsIndex); + assert(dimSize == rhsType.getDimSize(rhsIndex) && "corrupt shape"); + // Base case. + if (lhsType.getRank() == 1) { + assert(rhsType.getRank() == 1 && "corrupt contraction"); + Value zero = zeroVector(loc, lhsType, rewriter); + Value fma = rewriter.create(loc, op.lhs(), op.rhs(), zero); + StringAttr kind = rewriter.getStringAttr("add"); + return rewriter.create(loc, resType, kind, fma, + op.acc()); + } + // Construct new iterator types and affine map array attribute. + SmallVector lowIndexingMaps; + lowIndexingMaps.push_back(adjustMap(iMap[0], iterIndex, rewriter)); + lowIndexingMaps.push_back(adjustMap(iMap[1], iterIndex, rewriter)); + lowIndexingMaps.push_back(adjustMap(iMap[2], iterIndex, rewriter)); + auto lowAffine = rewriter.getAffineMapArrayAttr(lowIndexingMaps); + auto lowIter = + rewriter.getArrayAttr(adjustIter(op.iterator_types(), iterIndex)); + // Unroll into a series of lower dimensional vector.contract ops. + // By feeding the initial accumulator into the first contraction, + // and the result of each contraction into the next, eventually + // the sum of all reductions is computed. + Value result = op.acc(); + for (int64_t d = 0; d < dimSize; ++d) { + auto lhs = reshapeLoad(loc, op.lhs(), lhsType, lhsIndex, d, rewriter); + auto rhs = reshapeLoad(loc, op.rhs(), rhsType, rhsIndex, d, rewriter); + result = rewriter.create(loc, lhs, rhs, result, + lowAffine, lowIter); + } + return result; + } + + // Helper method to construct a zero vector. + static Value zeroVector(Location loc, VectorType vType, + PatternRewriter &rewriter) { + Type eltType = vType.getElementType(); + Value zero = rewriter.create(loc, eltType, + rewriter.getZeroAttr(eltType)); + return rewriter.create(loc, vType, zero); + } + + // Helper to find an index in an affine map. + static Optional getResultIndex(AffineMap map, int64_t index) { + for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) { + int64_t idx = map.getResult(i).cast().getPosition(); + if (idx == index) + return i; + } + return None; + } + + // Helper to construct iterator types with one index removed. + static SmallVector adjustIter(ArrayAttr iteratorTypes, + int64_t index) { + SmallVector results; + for (auto it : llvm::enumerate(iteratorTypes)) { + int64_t idx = it.index(); + if (idx == index) + continue; + results.push_back(it.value()); + } + return results; + } + + // Helper to construct an affine map with one index removed. + static AffineMap adjustMap(AffineMap map, int64_t index, + PatternRewriter &rewriter) { + auto *ctx = rewriter.getContext(); + SmallVector results; + for (int64_t i = 0, e = map.getNumResults(); i < e; ++i) { + int64_t idx = map.getResult(i).cast().getPosition(); + if (idx == index) + continue; + // Re-insert remaining indices, but renamed when occurring + // after the removed index. 
+ auto targetExpr = getAffineDimExpr(idx < index ? idx : idx - 1, ctx); + results.push_back(targetExpr); + } + // The (...) -> () affine map has its own factory method. + return results.empty() ? AffineMap::get(map.getNumDims() - 1, 0, ctx) + : AffineMap::get(map.getNumDims() - 1, 0, results); + } + + // Helper to drop dimension from vector type. + static Type adjustType(VectorType tp, int64_t index) { + int64_t rank = tp.getRank(); + Type eltType = tp.getElementType(); + if (rank == 1) { + assert(index == 0 && "index for scalar result out of bounds"); + return eltType; + } + SmallVector adjustedShape; + for (int64_t i = 0; i < rank; ++i) { + // Omit dimension at the given index. + if (i == index) + continue; + // Otherwise, add dimension back. + adjustedShape.push_back(tp.getDimSize(i)); + } + return VectorType::get(adjustedShape, eltType); + } + + // Helper method to possibly drop a dimension in a load. + // TODO(ajcbik): use a reshaping vector load (and share lowering code) + static Value reshapeLoad(Location loc, Value val, VectorType type, + int64_t index, int64_t pos, + PatternRewriter &rewriter) { + if (index == -1) + return val; + Type lowType = adjustType(type, 0); + // At extraction dimension? + if (index == 0) { + auto posAttr = rewriter.getI64ArrayAttr(pos); + return rewriter.create(loc, lowType, val, posAttr); + } + // Unroll leading dimensions. + VectorType vType = lowType.cast(); + VectorType resType = adjustType(type, index).cast(); + Value result = zeroVector(loc, resType, rewriter); + for (int64_t d = 0, e = resType.getDimSize(0); d < e; d++) { + auto posAttr = rewriter.getI64ArrayAttr(d); + Value ext = rewriter.create(loc, vType, val, posAttr); + Value load = reshapeLoad(loc, ext, vType, index - 1, pos, rewriter); + result = rewriter.create(loc, resType, load, result, + posAttr); + } + return result; + } + + // Helper method to possibly drop a dimension in a store. + // TODO(ajcbik): use a reshaping vector store (and share lowering code) + static Value reshapeStore(Location loc, Value val, Value result, + VectorType type, int64_t index, int64_t pos, + PatternRewriter &rewriter) { + // Unmodified? + if (index == -1) + return val; + // At insertion dimension? + if (index == 0) { + auto posAttr = rewriter.getI64ArrayAttr(pos); + return rewriter.create(loc, type, val, result, posAttr); + } + // Unroll leading dimensions. + Type lowType = adjustType(type, 0); + VectorType vType = lowType.cast(); + Type insType = adjustType(vType, 0); + for (int64_t d = 0, e = type.getDimSize(0); d < e; d++) { + auto posAttr = rewriter.getI64ArrayAttr(d); + Value ext = + rewriter.create(loc, vType, result, posAttr); + Value ins = + rewriter.create(loc, insType, val, posAttr); + Value sto = reshapeStore(loc, ins, ext, vType, index - 1, pos, rewriter); + result = + rewriter.create(loc, type, sto, result, posAttr); + } + return result; + } +}; + +/// ShapeOp 2D -> 1D downcast serves the purpose of flattening 2-D to 1-D +/// vectors progressively on the way to target llvm.matrix intrinsics. 
+/// This iterates over the most major dimension of the 2-D vector and performs +/// rewrites into: +/// vector.extract from 2-D + vector.insert_strided_slice offset into 1-D +class ShapeCastOp2DDownCastRewritePattern + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(vector::ShapeCastOp op, + PatternRewriter &rewriter) const override { + auto sourceVectorType = op.getSourceVectorType(); + auto resultVectorType = op.getResultVectorType(); + if (sourceVectorType.getRank() != 2 || resultVectorType.getRank() != 1) + return matchFailure(); + + auto loc = op.getLoc(); + auto elemType = sourceVectorType.getElementType(); + Value zero = rewriter.create(loc, elemType, + rewriter.getZeroAttr(elemType)); + Value desc = rewriter.create(loc, resultVectorType, zero); + unsigned mostMinorVectorSize = sourceVectorType.getShape()[1]; + for (int64_t i = 0, e = sourceVectorType.getShape().front(); i != e; ++i) { + Value vec = rewriter.create(loc, op.source(), i); + desc = rewriter.create( + loc, vec, desc, + /*offsets=*/i * mostMinorVectorSize, /*strides=*/1); + } + rewriter.replaceOp(op, desc); + return matchSuccess(); + } +}; + +/// ShapeOp 1D -> 2D upcast serves the purpose of unflattening 2-D from 1-D +/// vectors progressively on the way from targeting llvm.matrix intrinsics. +/// This iterates over the most major dimension of the 2-D vector and performs +/// rewrites into: +/// vector.strided_slice from 1-D + vector.insert into 2-D +class ShapeCastOp2DUpCastRewritePattern + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(vector::ShapeCastOp op, + PatternRewriter &rewriter) const override { + auto sourceVectorType = op.getSourceVectorType(); + auto resultVectorType = op.getResultVectorType(); + if (sourceVectorType.getRank() != 1 || resultVectorType.getRank() != 2) + return matchFailure(); + + auto loc = op.getLoc(); + auto elemType = sourceVectorType.getElementType(); + Value zero = rewriter.create(loc, elemType, + rewriter.getZeroAttr(elemType)); + Value desc = rewriter.create(loc, resultVectorType, zero); + unsigned mostMinorVectorSize = resultVectorType.getShape()[1]; + for (int64_t i = 0, e = resultVectorType.getShape().front(); i != e; ++i) { + Value vec = rewriter.create( + loc, op.source(), /*offsets=*/i * mostMinorVectorSize, + /*sizes=*/mostMinorVectorSize, + /*strides=*/1); + desc = rewriter.create(loc, vec, desc, i); + } + rewriter.replaceOp(op, desc); + return matchSuccess(); + } +}; + +} // namespace + +// TODO(andydavis) Add pattern to rewrite ExtractSlices(ConstantMaskOp). +// TODO(andydavis) Add this as DRR pattern. 
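As a side note on the two 2-D <-> 1-D shape_cast patterns above, the offsets they feed to the strided-slice ops are plain row-major flattening: row i of the 2-D vector occupies offsets [i * innerSize, (i + 1) * innerSize) of the 1-D vector. Below is a minimal scalar sketch of that layout with made-up sizes, independent of MLIR and not part of the patch; the real patterns express the same indexing with vector.extract / insert_strided_slice (down-cast) and strided_slice / vector.insert (up-cast).

#include <cstdint>
#include <iostream>
#include <vector>

// Scalar model of the 2-D <-> 1-D shape_cast rewrites: row i of a
// vector<3x4xf32> lands at offset i * 4 in the flattened vector<12xf32>,
// which is exactly the offset used by the strided-slice ops.
int main() {
  const int64_t rows = 3, cols = 4;
  std::vector<float> flat(rows * cols, 0.0f);

  // "Down-cast": write each row at offset i * cols of the flat buffer.
  for (int64_t i = 0; i < rows; ++i)
    for (int64_t j = 0; j < cols; ++j)
      flat[i * cols + j] = static_cast<float>(i * cols + j);

  // "Up-cast": read cols elements starting at offset i * cols to rebuild
  // row i of the 2-D view.
  for (int64_t i = 0; i < rows; ++i) {
    std::cout << "row " << i << ":";
    for (int64_t j = 0; j < cols; ++j)
      std::cout << " " << flat[i * cols + j];
    std::cout << "\n";
  }
  return 0;
}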
+void mlir::vector::populateVectorToVectorTransformationPatterns( + OwningRewritePatternList &patterns, MLIRContext *context) { + patterns.insert(context); +} + +void mlir::vector::populateVectorSlicesLoweringPatterns( + OwningRewritePatternList &patterns, MLIRContext *context) { + patterns.insert(context); +} + +void mlir::vector::populateVectorContractLoweringPatterns( + OwningRewritePatternList &patterns, MLIRContext *context) { + patterns.insert( + context); +} diff --git a/mlir/lib/Dialect/VectorOps/VectorUtils.cpp b/mlir/lib/Dialect/VectorOps/VectorUtils.cpp new file mode 100644 index 0000000000000..46a990080a4f2 --- /dev/null +++ b/mlir/lib/Dialect/VectorOps/VectorUtils.cpp @@ -0,0 +1,278 @@ +//===- VectorUtils.cpp - MLIR Utilities for VectorOps ------------------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements utility methods for working with the VectorOps dialect. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/VectorOps/VectorUtils.h" +#include "mlir/Analysis/LoopAnalysis.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/Dialect/VectorOps/VectorOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/IntegerSet.h" +#include "mlir/IR/Operation.h" +#include "mlir/Support/Functional.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/MathExtras.h" +#include "mlir/Support/STLExtras.h" + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetVector.h" + +using llvm::SetVector; + +namespace mlir { + +SmallVector computeStrides(ArrayRef shape, + ArrayRef sizes) { + int64_t rank = shape.size(); + // Compute the count for each dimension. + SmallVector sliceDimCounts(rank); + for (int64_t r = 0; r < rank; ++r) + sliceDimCounts[r] = ceilDiv(shape[r], sizes[r]); + // Use that to compute the slice stride for each dimension. + SmallVector sliceStrides(rank); + sliceStrides[rank - 1] = 1; + for (int64_t r = rank - 2; r >= 0; --r) + sliceStrides[r] = sliceStrides[r + 1] * sliceDimCounts[r + 1]; + return sliceStrides; +} + +SmallVector delinearize(ArrayRef sliceStrides, + int64_t index) { + int64_t rank = sliceStrides.size(); + SmallVector vectorOffsets(rank); + for (int64_t r = 0; r < rank; ++r) { + assert(sliceStrides[r] > 0); + vectorOffsets[r] = index / sliceStrides[r]; + index %= sliceStrides[r]; + } + return vectorOffsets; +} + +SmallVector +computeElementOffsetsFromVectorSliceOffsets(ArrayRef sizes, + ArrayRef vectorOffsets) { + return functional::zipMap([](int64_t v1, int64_t v2) { return v1 * v2; }, + vectorOffsets, sizes); +} + +SmallVector computeSliceSizes(ArrayRef shape, + ArrayRef sizes, + ArrayRef elementOffsets) { + int64_t rank = shape.size(); + SmallVector sliceSizes(rank); + for (unsigned r = 0; r < rank; ++r) + sliceSizes[r] = std::min(sizes[r], shape[r] - elementOffsets[r]); + return sliceSizes; +} + +Optional> shapeRatio(ArrayRef superShape, + ArrayRef subShape) { + if (superShape.size() < subShape.size()) { + return Optional>(); + } + + // Starting from the end, compute the integer divisors. + // Set the boolean `divides` if integral division is not possible. 
+ std::vector result; + result.reserve(superShape.size()); + bool divides = true; + auto divide = [÷s, &result](int superSize, int subSize) { + assert(superSize > 0 && "superSize must be > 0"); + assert(subSize > 0 && "subSize must be > 0"); + divides &= (superSize % subSize == 0); + result.push_back(superSize / subSize); + }; + functional::zipApply( + divide, SmallVector{superShape.rbegin(), superShape.rend()}, + SmallVector{subShape.rbegin(), subShape.rend()}); + + // If integral division does not occur, return and let the caller decide. + if (!divides) { + return None; + } + + // At this point we computed the ratio (in reverse) for the common + // size. Fill with the remaining entries from the super-vector shape (still in + // reverse). + int commonSize = subShape.size(); + std::copy(superShape.rbegin() + commonSize, superShape.rend(), + std::back_inserter(result)); + + assert(result.size() == superShape.size() && + "super to sub shape ratio is not of the same size as the super rank"); + + // Reverse again to get it back in the proper order and return. + return SmallVector{result.rbegin(), result.rend()}; +} + +Optional> shapeRatio(VectorType superVectorType, + VectorType subVectorType) { + assert(superVectorType.getElementType() == subVectorType.getElementType() && + "vector types must be of the same elemental type"); + return shapeRatio(superVectorType.getShape(), subVectorType.getShape()); +} + +/// Constructs a permutation map from memref indices to vector dimension. +/// +/// The implementation uses the knowledge of the mapping of enclosing loop to +/// vector dimension. `enclosingLoopToVectorDim` carries this information as a +/// map with: +/// - keys representing "vectorized enclosing loops"; +/// - values representing the corresponding vector dimension. +/// The algorithm traverses "vectorized enclosing loops" and extracts the +/// at-most-one MemRef index that is invariant along said loop. This index is +/// guaranteed to be at most one by construction: otherwise the MemRef is not +/// vectorizable. +/// If this invariant index is found, it is added to the permutation_map at the +/// proper vector dimension. +/// If no index is found to be invariant, 0 is added to the permutation_map and +/// corresponds to a vector broadcast along that dimension. +/// +/// Returns an empty AffineMap if `enclosingLoopToVectorDim` is empty, +/// signalling that no permutation map can be constructed given +/// `enclosingLoopToVectorDim`. +/// +/// Examples can be found in the documentation of `makePermutationMap`, in the +/// header file. 
+static AffineMap makePermutationMap( + ArrayRef indices, + const DenseMap &enclosingLoopToVectorDim) { + if (enclosingLoopToVectorDim.empty()) + return AffineMap(); + MLIRContext *context = + enclosingLoopToVectorDim.begin()->getFirst()->getContext(); + using functional::makePtrDynCaster; + using functional::map; + SmallVector perm(enclosingLoopToVectorDim.size(), + getAffineConstantExpr(0, context)); + + for (auto kvp : enclosingLoopToVectorDim) { + assert(kvp.second < perm.size()); + auto invariants = getInvariantAccesses( + cast(kvp.first).getInductionVar(), indices); + unsigned numIndices = indices.size(); + unsigned countInvariantIndices = 0; + for (unsigned dim = 0; dim < numIndices; ++dim) { + if (!invariants.count(indices[dim])) { + assert(perm[kvp.second] == getAffineConstantExpr(0, context) && + "permutationMap already has an entry along dim"); + perm[kvp.second] = getAffineDimExpr(dim, context); + } else { + ++countInvariantIndices; + } + } + assert((countInvariantIndices == numIndices || + countInvariantIndices == numIndices - 1) && + "Vectorization prerequisite violated: at most 1 index may be " + "invariant wrt a vectorized loop"); + } + return AffineMap::get(indices.size(), 0, perm); +} + +/// Implementation detail that walks up the parents and records the ones with +/// the specified type. +/// TODO(ntv): could also be implemented as a collect parents followed by a +/// filter and made available outside this file. +template +static SetVector getParentsOfType(Operation *op) { + SetVector res; + auto *current = op; + while (auto *parent = current->getParentOp()) { + if (auto typedParent = dyn_cast(parent)) { + assert(res.count(parent) == 0 && "Already inserted"); + res.insert(parent); + } + current = parent; + } + return res; +} + +/// Returns the enclosing AffineForOp, from closest to farthest. +static SetVector getEnclosingforOps(Operation *op) { + return getParentsOfType(op); +} + +AffineMap +makePermutationMap(Operation *op, ArrayRef indices, + const DenseMap &loopToVectorDim) { + DenseMap enclosingLoopToVectorDim; + auto enclosingLoops = getEnclosingforOps(op); + for (auto *forInst : enclosingLoops) { + auto it = loopToVectorDim.find(forInst); + if (it != loopToVectorDim.end()) { + enclosingLoopToVectorDim.insert(*it); + } + } + return makePermutationMap(indices, enclosingLoopToVectorDim); +} + +bool matcher::operatesOnSuperVectorsOf(Operation &op, + VectorType subVectorType) { + // First, extract the vector type and distinguish between: + // a. ops that *must* lower a super-vector (i.e. vector.transfer_read, + // vector.transfer_write); and + // b. ops that *may* lower a super-vector (all other ops). + // The ops that *may* lower a super-vector only do so if the super-vector to + // sub-vector ratio exists. The ops that *must* lower a super-vector are + // explicitly checked for this property. + /// TODO(ntv): there should be a single function for all ops to do this so we + /// do not have to special case. Maybe a trait, or just a method, unclear atm. 
+ bool mustDivide = false; + (void)mustDivide; + VectorType superVectorType; + if (auto read = dyn_cast(op)) { + superVectorType = read.getVectorType(); + mustDivide = true; + } else if (auto write = dyn_cast(op)) { + superVectorType = write.getVectorType(); + mustDivide = true; + } else if (op.getNumResults() == 0) { + if (!isa(op)) { + op.emitError("NYI: assuming only return operations can have 0 " + " results at this point"); + } + return false; + } else if (op.getNumResults() == 1) { + if (auto v = op.getResult(0).getType().dyn_cast()) { + superVectorType = v; + } else { + // Not a vector type. + return false; + } + } else { + // Not a vector.transfer and has more than 1 result, fail hard for now to + // wake us up when something changes. + op.emitError("NYI: operation has more than 1 result"); + return false; + } + + // Get the ratio. + auto ratio = shapeRatio(superVectorType, subVectorType); + + // Sanity check. + assert((ratio.hasValue() || !mustDivide) && + "vector.transfer operation in which super-vector size is not an" + " integer multiple of sub-vector size"); + + // This catches cases that are not strictly necessary to have multiplicity but + // still aren't divisible by the sub-vector shape. + // This could be useful information if we wanted to reshape at the level of + // the vector type (but we would have to look at the compute and distinguish + // between parallel, reduction and possibly other cases. + if (!ratio.hasValue()) { + return false; + } + + return true; +} + +} // namespace mlir diff --git a/mlir/lib/IR/AffineMap.cpp b/mlir/lib/IR/AffineMap.cpp index 94b7d4d05eb1a..00feb08db89ad 100644 --- a/mlir/lib/IR/AffineMap.cpp +++ b/mlir/lib/IR/AffineMap.cpp @@ -257,7 +257,8 @@ AffineMap AffineMap::replaceDimsAndSymbols(ArrayRef dimReplacements, results.push_back( expr.replaceDimsAndSymbols(dimReplacements, symReplacements)); - return get(numResultDims, numResultSyms, results); + return results.empty() ? get(numResultDims, 0, getContext()) + : get(numResultDims, numResultSyms, results); } AffineMap AffineMap::compose(AffineMap map) { diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index bbb2462d176d3..bf7905a147819 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -765,10 +765,10 @@ void SSANameState::setValueName(Value value, StringRef name) { static bool isPunct(char c) { return c == '$' || c == '.' || c == '_' || c == '-'; } - + StringRef SSANameState::uniqueValueName(StringRef name) { assert(!name.empty() && "Shouldn't have an empty name here"); - + // Check to see if this name is valid. If it starts with a digit, then it // could conflict with the autogenerated numeric ID's (we unique them in a // different map), so add an underscore prefix to avoid problems. @@ -777,13 +777,13 @@ StringRef SSANameState::uniqueValueName(StringRef name) { tmpName += name; return uniqueValueName(tmpName); } - + // Check to see if the name consists of all-valid identifiers. If not, we // need to escape them. for (char ch : name) { if (isalpha(ch) || isPunct(ch) || isdigit(ch)) continue; - + SmallString<16> tmpName; for (char ch : name) { if (isalpha(ch) || isPunct(ch) || isdigit(ch)) @@ -796,7 +796,7 @@ StringRef SSANameState::uniqueValueName(StringRef name) { } return uniqueValueName(tmpName); } - + // Check to see if this name is already unique. 
if (!usedNames.count(name)) { name = name.copy(usedNameAllocator); @@ -1963,7 +1963,8 @@ class OperationPrinter : public ModulePrinter, private OpAsmPrinter { bool printBlockTerminator = true); /// Print the ID of the given value, optionally with its result number. - void printValueID(Value value, bool printResultNo = true) const; + void printValueID(Value value, bool printResultNo = true, + raw_ostream *streamOverride = nullptr) const; //===--------------------------------------------------------------------===// // OpAsmPrinter methods @@ -1988,6 +1989,9 @@ class OperationPrinter : public ModulePrinter, private OpAsmPrinter { /// Print the ID for the given value. void printOperand(Value value) override { printValueID(value); } + void printOperand(Value value, raw_ostream &os) override { + printValueID(value, /*printResultNo=*/true, &os); + } /// Print an optional attribute dictionary with a given set of elided values. void printOptionalAttrDict(ArrayRef attrs, @@ -2195,8 +2199,10 @@ void OperationPrinter::print(Block *block, bool printBlockArgs, currentIndent -= indentWidth; } -void OperationPrinter::printValueID(Value value, bool printResultNo) const { - state->getSSANameState().printValueID(value, printResultNo, os); +void OperationPrinter::printValueID(Value value, bool printResultNo, + raw_ostream *streamOverride) const { + state->getSSANameState().printValueID(value, printResultNo, + streamOverride ? *streamOverride : os); } void OperationPrinter::printSuccessor(Block *successor) { diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp index 82d2efb5255cf..23536651f9749 100644 --- a/mlir/lib/IR/Builders.cpp +++ b/mlir/lib/IR/Builders.cpp @@ -109,6 +109,20 @@ DenseIntElementsAttr Builder::getI64VectorAttr(ArrayRef values) { values); } +DenseIntElementsAttr Builder::getI32TensorAttr(ArrayRef values) { + return DenseIntElementsAttr::get( + RankedTensorType::get(static_cast(values.size()), + getIntegerType(32)), + values); +} + +DenseIntElementsAttr Builder::getI64TensorAttr(ArrayRef values) { + return DenseIntElementsAttr::get( + RankedTensorType::get(static_cast(values.size()), + getIntegerType(64)), + values); +} + IntegerAttr Builder::getI32IntegerAttr(int32_t value) { return IntegerAttr::get(getIntegerType(32), APInt(32, value)); } diff --git a/mlir/lib/IR/Dialect.cpp b/mlir/lib/IR/Dialect.cpp index 4ce461e2d7d99..e48e7f64010da 100644 --- a/mlir/lib/IR/Dialect.cpp +++ b/mlir/lib/IR/Dialect.cpp @@ -13,6 +13,7 @@ #include "mlir/IR/DialectInterface.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Operation.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/Regex.h" @@ -26,39 +27,40 @@ DialectAsmParser::~DialectAsmParser() {} // Dialect Registration //===----------------------------------------------------------------------===// -// Registry for all dialect allocation functions. -static llvm::ManagedStatic> +/// Registry for all dialect allocation functions. +static llvm::ManagedStatic< + llvm::MapVector> dialectRegistry; -// Registry for functions that set dialect hooks. -static llvm::ManagedStatic> +/// Registry for functions that set dialect hooks. +static llvm::ManagedStatic> dialectHooksRegistry; -/// Registers a specific dialect creation function with the system, typically -/// used through the DialectRegistration template. 
-void mlir::registerDialectAllocator(const DialectAllocatorFunction &function) { +void Dialect::registerDialectAllocator( + const ClassID *classId, const DialectAllocatorFunction &function) { assert(function && "Attempting to register an empty dialect initialize function"); - dialectRegistry->push_back(function); + dialectRegistry->insert({classId, function}); } /// Registers a function to set specific hooks for a specific dialect, typically /// used through the DialectHooksRegistration template. -void mlir::registerDialectHooksSetter(const DialectHooksSetter &function) { +void DialectHooks::registerDialectHooksSetter( + const ClassID *classId, const DialectHooksSetter &function) { assert( function && "Attempting to register an empty dialect hooks initialization function"); - dialectHooksRegistry->push_back(function); + dialectHooksRegistry->insert({classId, function}); } -/// Registers all dialects and their const folding hooks with the specified -/// MLIRContext. +/// Registers all dialects and hooks from the global registries with the +/// specified MLIRContext. void mlir::registerAllDialects(MLIRContext *context) { - for (const auto &fn : *dialectRegistry) - fn(context); - for (const auto &fn : *dialectHooksRegistry) { - fn(context); + for (const auto &it : *dialectRegistry) + it.second(context); + for (const auto &it : *dialectHooksRegistry) { + it.second(context); } } diff --git a/mlir/lib/IR/StandardTypes.cpp b/mlir/lib/IR/StandardTypes.cpp index 488601cdb16b6..1e7d9f38a2cee 100644 --- a/mlir/lib/IR/StandardTypes.cpp +++ b/mlir/lib/IR/StandardTypes.cpp @@ -86,6 +86,8 @@ bool Type::isSignlessIntOrFloat() { bool Type::isIntOrFloat() { return isa() || isa(); } +bool Type::isIntOrIndexOrFloat() { return isIntOrFloat() || isIndex(); } + //===----------------------------------------------------------------------===// // Integer Type //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Interfaces/CMakeLists.txt b/mlir/lib/Interfaces/CMakeLists.txt index 853a03d3cbada..a6e053d7674e0 100644 --- a/mlir/lib/Interfaces/CMakeLists.txt +++ b/mlir/lib/Interfaces/CMakeLists.txt @@ -7,7 +7,7 @@ set(LLVM_OPTIONAL_SOURCES SideEffects.cpp ) -add_llvm_library(MLIRCallInterfaces +add_mlir_library(MLIRCallInterfaces CallInterfaces.cpp ADDITIONAL_HEADER_DIRS @@ -21,7 +21,7 @@ target_link_libraries(MLIRCallInterfaces MLIRIR ) -add_llvm_library(MLIRControlFlowInterfaces +add_mlir_library(MLIRControlFlowInterfaces ControlFlowInterfaces.cpp ADDITIONAL_HEADER_DIRS @@ -35,7 +35,7 @@ target_link_libraries(MLIRControlFlowInterfaces MLIRIR ) -add_llvm_library(MLIRDerivedAttributeOpInterface +add_mlir_library(MLIRDerivedAttributeOpInterface DerivedAttributeOpInterface.cpp ADDITIONAL_HEADER_DIRS @@ -49,7 +49,7 @@ target_link_libraries(MLIRDerivedAttributeOpInterface MLIRIR ) -add_llvm_library(MLIRInferTypeOpInterface +add_mlir_library(MLIRInferTypeOpInterface InferTypeOpInterface.cpp ADDITIONAL_HEADER_DIRS @@ -63,7 +63,7 @@ target_link_libraries(MLIRInferTypeOpInterface MLIRIR ) -add_llvm_library(MLIRLoopLikeInterface +add_mlir_library(MLIRLoopLikeInterface LoopLikeInterface.cpp ADDITIONAL_HEADER_DIRS @@ -77,7 +77,7 @@ target_link_libraries(MLIRLoopLikeInterface MLIRIR ) -add_llvm_library(MLIRSideEffects +add_mlir_library(MLIRSideEffects SideEffects.cpp ADDITIONAL_HEADER_DIRS diff --git a/mlir/lib/Parser/Lexer.cpp b/mlir/lib/Parser/Lexer.cpp index 697af7392fb20..9a3418eaf8327 100644 --- a/mlir/lib/Parser/Lexer.cpp +++ b/mlir/lib/Parser/Lexer.cpp @@ -83,8 +83,8 @@ 
Token Lexer::lexToken() { // marker that llvm::MemoryBuffer guarantees will be there. if (curPtr - 1 == curBuffer.end()) return formToken(Token::eof, tokStart); + continue; - LLVM_FALLTHROUGH; case ':': return formToken(Token::colon, tokStart); case ',': diff --git a/mlir/lib/Parser/Parser.cpp b/mlir/lib/Parser/Parser.cpp index d987fd0d8addb..a29b34b570d0d 100644 --- a/mlir/lib/Parser/Parser.cpp +++ b/mlir/lib/Parser/Parser.cpp @@ -3322,8 +3322,13 @@ class OperationParser : public Parser { Operation *parseGenericOperation(Block *insertBlock, Block::iterator insertPt); + /// This is the structure of a result specifier in the assembly syntax, + /// including the name, number of results, and location. + typedef std::tuple ResultRecord; + /// Parse an operation instance that is in the op-defined custom form. - Operation *parseCustomOperation(); + /// resultInfo specifies information about the "%name =" specifiers. + Operation *parseCustomOperation(ArrayRef resultInfo); //===--------------------------------------------------------------------===// // Region Parsing @@ -3728,7 +3733,7 @@ Value OperationParser::createForwardRefPlaceholder(SMLoc loc, Type type) { /// ParseResult OperationParser::parseOperation() { auto loc = getToken().getLoc(); - SmallVector, 1> resultIDs; + SmallVector resultIDs; size_t numExpectedResults = 0; if (getToken().is(Token::percent_identifier)) { // Parse the group of result ids. @@ -3769,7 +3774,7 @@ ParseResult OperationParser::parseOperation() { Operation *op; if (getToken().is(Token::bare_identifier) || getToken().isKeyword()) - op = parseCustomOperation(); + op = parseCustomOperation(resultIDs); else if (getToken().is(Token::string)) op = parseGenericOperation(); else @@ -3790,7 +3795,7 @@ ParseResult OperationParser::parseOperation() { // Add definitions for each of the result groups. unsigned opResI = 0; - for (std::tuple &resIt : resultIDs) { + for (ResultRecord &resIt : resultIDs) { for (unsigned subRes : llvm::seq(0, std::get<1>(resIt))) { if (addDefinition({std::get<0>(resIt), subRes, std::get<2>(resIt)}, op->getResult(opResI++))) @@ -3955,9 +3960,12 @@ Operation *OperationParser::parseGenericOperation(Block *insertBlock, namespace { class CustomOpAsmParser : public OpAsmParser { public: - CustomOpAsmParser(SMLoc nameLoc, const AbstractOperation *opDefinition, + CustomOpAsmParser(SMLoc nameLoc, + ArrayRef resultIDs, + const AbstractOperation *opDefinition, OperationParser &parser) - : nameLoc(nameLoc), opDefinition(opDefinition), parser(parser) {} + : nameLoc(nameLoc), resultIDs(resultIDs), opDefinition(opDefinition), + parser(parser) {} /// Parse an instance of the operation described by 'opDefinition' into the /// provided operation state. @@ -3992,6 +4000,41 @@ class CustomOpAsmParser : public OpAsmParser { Builder &getBuilder() const override { return parser.builder; } + /// Return the name of the specified result in the specified syntax, as well + /// as the subelement in the name. For example, in this operation: + /// + /// %x, %y:2, %z = foo.op + /// + /// getResultName(0) == {"x", 0 } + /// getResultName(1) == {"y", 0 } + /// getResultName(2) == {"y", 1 } + /// getResultName(3) == {"z", 0 } + std::pair + getResultName(unsigned resultNo) const override { + // Scan for the resultID that contains this result number. + for (unsigned nameID = 0, e = resultIDs.size(); nameID != e; ++nameID) { + const auto &entry = resultIDs[nameID]; + if (resultNo < std::get<1>(entry)) { + // Don't pass on the leading %. 
+ StringRef name = std::get<0>(entry).drop_front(); + return {name, resultNo}; + } + resultNo -= std::get<1>(entry); + } + + // Invalid result number. + return {"", ~0U}; + } + + /// Return the number of declared SSA results. This returns 4 for the foo.op + /// example in the comment for getResultName. + size_t getNumResults() const override { + size_t count = 0; + for (auto &entry : resultIDs) + count += std::get<1>(entry); + return count; + } + llvm::SMLoc getNameLoc() const override { return nameLoc; } //===--------------------------------------------------------------------===// @@ -4500,6 +4543,9 @@ class CustomOpAsmParser : public OpAsmParser { /// The source location of the operation name. SMLoc nameLoc; + /// Information about the result name specifiers. + ArrayRef resultIDs; + /// The abstract information of the operation. const AbstractOperation *opDefinition; @@ -4511,7 +4557,8 @@ class CustomOpAsmParser : public OpAsmParser { }; } // end anonymous namespace. -Operation *OperationParser::parseCustomOperation() { +Operation * +OperationParser::parseCustomOperation(ArrayRef resultIDs) { auto opLoc = getToken().getLoc(); auto opName = getTokenSpelling(); @@ -4544,7 +4591,7 @@ Operation *OperationParser::parseCustomOperation() { // Have the op implementation take a crack and parsing this. OperationState opState(srcLocation, opDefinition->name); CleanupOpStateRegions guard{opState}; - CustomOpAsmParser opAsmParser(opLoc, opDefinition, *this); + CustomOpAsmParser opAsmParser(opLoc, resultIDs, opDefinition, *this); if (opAsmParser.parseOperation(opState)) return nullptr; diff --git a/mlir/lib/Parser/Token.cpp b/mlir/lib/Parser/Token.cpp index 8fe16b05fde63..b619af08c4331 100644 --- a/mlir/lib/Parser/Token.cpp +++ b/mlir/lib/Parser/Token.cpp @@ -145,9 +145,6 @@ StringRef Token::getTokenSpelling(Kind kind) { #define TOK_PUNCTUATION(NAME, SPELLING) \ case NAME: \ return SPELLING; -#define TOK_OPERATOR(NAME, SPELLING) \ - case NAME: \ - return SPELLING; #define TOK_KEYWORD(SPELLING) \ case kw_##SPELLING: \ return #SPELLING; diff --git a/mlir/lib/Parser/Token.h b/mlir/lib/Parser/Token.h index e6fa6c70853f7..7952aca4546bb 100644 --- a/mlir/lib/Parser/Token.h +++ b/mlir/lib/Parser/Token.h @@ -23,7 +23,6 @@ class Token { #define TOK_IDENTIFIER(NAME) NAME, #define TOK_LITERAL(NAME) NAME, #define TOK_PUNCTUATION(NAME, SPELLING) NAME, -#define TOK_OPERATOR(NAME, SPELLING) NAME, #define TOK_KEYWORD(SPELLING) kw_##SPELLING, #include "TokenKinds.def" }; @@ -50,7 +49,8 @@ class Token { bool isNot(Kind k) const { return kind != k; } /// Return true if this token isn't one of the specified kinds. - template bool isNot(Kind k1, Kind k2, T... others) const { + template + bool isNot(Kind k1, Kind k2, T... others) const { return !isAny(k1, k2, others...); } diff --git a/mlir/lib/Parser/TokenKinds.def b/mlir/lib/Parser/TokenKinds.def index 47c43f6522fbd..0ec0c0ebf7bf4 100644 --- a/mlir/lib/Parser/TokenKinds.def +++ b/mlir/lib/Parser/TokenKinds.def @@ -11,9 +11,10 @@ // //===----------------------------------------------------------------------===// -#if !defined(TOK_MARKER) && !defined(TOK_IDENTIFIER) && !defined(TOK_LITERAL)&&\ - !defined(TOK_PUNCTUATION) && !defined(TOK_OPERATOR) && !defined(TOK_KEYWORD) -# error Must define one of the TOK_ macros. +#if !defined(TOK_MARKER) && !defined(TOK_IDENTIFIER) && \ + !defined(TOK_LITERAL) && !defined(TOK_PUNCTUATION) && \ + !defined(TOK_KEYWORD) +#error Must define one of the TOK_ macros. 
#endif #ifndef TOK_MARKER @@ -28,14 +29,10 @@ #ifndef TOK_PUNCTUATION #define TOK_PUNCTUATION(NAME, SPELLING) #endif -#ifndef TOK_OPERATOR -#define TOK_OPERATOR(NAME, SPELLING) -#endif #ifndef TOK_KEYWORD #define TOK_KEYWORD(SPELLING) #endif - // Markers TOK_MARKER(eof) TOK_MARKER(error) @@ -49,34 +46,30 @@ TOK_IDENTIFIER(caret_identifier) // ^foo TOK_IDENTIFIER(exclamation_identifier) // !foo // Literals -TOK_LITERAL(floatliteral) // 2.0 -TOK_LITERAL(integer) // 42 -TOK_LITERAL(string) // "foo" -TOK_LITERAL(inttype) // i4, si8, ui16 +TOK_LITERAL(floatliteral) // 2.0 +TOK_LITERAL(integer) // 42 +TOK_LITERAL(string) // "foo" +TOK_LITERAL(inttype) // i4, si8, ui16 // Punctuation. -TOK_PUNCTUATION(arrow, "->") -TOK_PUNCTUATION(at, "@") -TOK_PUNCTUATION(colon, ":") -TOK_PUNCTUATION(comma, ",") -TOK_PUNCTUATION(question, "?") -TOK_PUNCTUATION(l_paren, "(") -TOK_PUNCTUATION(r_paren, ")") -TOK_PUNCTUATION(l_brace, "{") -TOK_PUNCTUATION(r_brace, "}") -TOK_PUNCTUATION(l_square, "[") -TOK_PUNCTUATION(r_square, "]") -TOK_PUNCTUATION(less, "<") -TOK_PUNCTUATION(greater, ">") -TOK_PUNCTUATION(equal, "=") -TOK_PUNCTUATION(ellipsis, "...") -// TODO: More punctuation. - -// Operators. -TOK_OPERATOR(plus, "+") -TOK_OPERATOR(minus, "-") -TOK_OPERATOR(star, "*") -// TODO: More operator tokens +TOK_PUNCTUATION(arrow, "->") +TOK_PUNCTUATION(at, "@") +TOK_PUNCTUATION(colon, ":") +TOK_PUNCTUATION(comma, ",") +TOK_PUNCTUATION(ellipsis, "...") +TOK_PUNCTUATION(equal, "=") +TOK_PUNCTUATION(greater, ">") +TOK_PUNCTUATION(l_brace, "{") +TOK_PUNCTUATION(l_paren, "(") +TOK_PUNCTUATION(l_square, "[") +TOK_PUNCTUATION(less, "<") +TOK_PUNCTUATION(minus, "-") +TOK_PUNCTUATION(plus, "+") +TOK_PUNCTUATION(question, "?") +TOK_PUNCTUATION(r_brace, "}") +TOK_PUNCTUATION(r_paren, ")") +TOK_PUNCTUATION(r_square, "]") +TOK_PUNCTUATION(star, "*") // Keywords. These turn "foo" into Token::kw_foo enums. 
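For reference, a hedged sketch of how a consumer expands this .def file (mirroring the Token.h hunk above; illustrative only):

// Each includer defines only the TOK_ macros it needs; the #ifndef blocks
// above turn the rest into no-ops, and the trailing #undefs in the .def file
// reset them afterwards.
#define TOK_MARKER(NAME) NAME,
#define TOK_IDENTIFIER(NAME) NAME,
#define TOK_LITERAL(NAME) NAME,
#define TOK_PUNCTUATION(NAME, SPELLING) NAME,
#define TOK_KEYWORD(SPELLING) kw_##SPELLING,
enum Kind {
#include "TokenKinds.def"
};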
@@ -122,5 +115,4 @@ TOK_KEYWORD(vector) #undef TOK_IDENTIFIER #undef TOK_LITERAL #undef TOK_PUNCTUATION -#undef TOK_OPERATOR #undef TOK_KEYWORD diff --git a/mlir/lib/Support/MlirOptMain.cpp b/mlir/lib/Support/MlirOptMain.cpp index e2044c90ffd80..5db824d21ad20 100644 --- a/mlir/lib/Support/MlirOptMain.cpp +++ b/mlir/lib/Support/MlirOptMain.cpp @@ -12,7 +12,6 @@ //===----------------------------------------------------------------------===// #include "mlir/Support/MlirOptMain.h" -#include "mlir/Analysis/Passes.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Diagnostics.h" #include "mlir/IR/Location.h" diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp index 6492b772e4b7f..ff84456d54910 100644 --- a/mlir/lib/TableGen/Operator.cpp +++ b/mlir/lib/TableGen/Operator.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "mlir/TableGen/Operator.h" +#include "mlir/ADT/TypeSwitch.h" #include "mlir/TableGen/OpTrait.h" #include "mlir/TableGen/Predicate.h" #include "mlir/TableGen/Type.h" @@ -411,6 +412,17 @@ StringRef tblgen::Operator::getSummary() const { return def.getValueAsString("summary"); } +bool tblgen::Operator::hasAssemblyFormat() const { + auto *valueInit = def.getValueInit("assemblyFormat"); + return isa(valueInit) || isa(valueInit); +} + +StringRef tblgen::Operator::getAssemblyFormat() const { + return TypeSwitch(def.getValueInit("assemblyFormat")) + .Case( + [&](auto *init) { return init->getValue(); }); +} + void tblgen::Operator::print(llvm::raw_ostream &os) const { os << "op '" << getOperationName() << "'\n"; for (Argument arg : arguments) { diff --git a/mlir/lib/Target/CMakeLists.txt b/mlir/lib/Target/CMakeLists.txt index b68bfa8d3cf20..9bc37cab1093e 100644 --- a/mlir/lib/Target/CMakeLists.txt +++ b/mlir/lib/Target/CMakeLists.txt @@ -18,6 +18,22 @@ target_link_libraries(MLIRTargetLLVMIRModuleTranslation MLIRTranslation ) +add_mlir_library(MLIRTargetAVX512 + LLVMIR/LLVMAVX512Intr.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Target/LLVMIR + DEPENDS + MLIRLLVMAVX512ConversionsIncGen + ) +target_link_libraries(MLIRTargetAVX512 + PUBLIC + MLIRIR + MLIRLLVMAVX512 + MLIRLLVMIR + MLIRTargetLLVMIRModuleTranslation + ) + add_mlir_library(MLIRTargetLLVMIR LLVMIR/ConvertFromLLVMIR.cpp LLVMIR/ConvertToLLVMIR.cpp diff --git a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp index 86351bd689ad9..8a71762e956db 100644 --- a/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp +++ b/mlir/lib/Target/LLVMIR/ConvertFromLLVMIR.cpp @@ -60,6 +60,8 @@ class Importer { GlobalOp processGlobal(llvm::GlobalVariable *GV); private: + /// Returns personality of `f` as a FlatSymbolRefAttr. + FlatSymbolRefAttr getPersonalityAsAttr(llvm::Function *f); /// Imports `bb` into `block`, which must be initially empty. LogicalResult processBasicBlock(llvm::BasicBlock *bb, Block *block); /// Imports `inst` and populates instMap[inst] with the imported Value. 
@@ -471,7 +473,7 @@ static const DenseMap opcMap = { // FIXME: switch // FIXME: indirectbr // FIXME: invoke - // FIXME: resume + INST(Resume, Resume), // FIXME: unreachable // FIXME: cleanupret // FIXME: catchret @@ -604,6 +606,7 @@ LogicalResult Importer::processInstruction(llvm::Instruction *inst) { case llvm::Instruction::Load: case llvm::Instruction::Store: case llvm::Instruction::Ret: + case llvm::Instruction::Resume: case llvm::Instruction::Trunc: case llvm::Instruction::ZExt: case llvm::Instruction::SExt: @@ -726,8 +729,11 @@ LogicalResult Importer::processInstruction(llvm::Instruction *inst) { for (unsigned i = 0, ie = lpi->getNumClauses(); i < ie; i++) ops.push_back(processConstant(lpi->getClause(i))); - b.create(loc, processType(lpi->getType()), lpi->isCleanup(), - ops); + Type ty = processType(lpi->getType()); + if (!ty) + return failure(); + + v = b.create(loc, ty, lpi->isCleanup(), ops); return success(); } case llvm::Instruction::Invoke: { @@ -798,6 +804,28 @@ LogicalResult Importer::processInstruction(llvm::Instruction *inst) { } } +FlatSymbolRefAttr Importer::getPersonalityAsAttr(llvm::Function *f) { + if (!f->hasPersonalityFn()) + return nullptr; + + llvm::Constant *pf = f->getPersonalityFn(); + + // If it directly has a name, we can use it. + if (pf->hasName()) + return b.getSymbolRefAttr(pf->getName()); + + // If it doesn't have a name, currently, only function pointers that are + // bitcast to i8* are parsed. + if (auto ce = dyn_cast(pf)) { + if (ce->getOpcode() == llvm::Instruction::BitCast && + ce->getType() == llvm::Type::getInt8PtrTy(dialect->getLLVMContext())) { + if (auto func = dyn_cast(ce->getOperand(0))) + return b.getSymbolRefAttr(func->getName()); + } + } + return FlatSymbolRefAttr(); +} + LogicalResult Importer::processFunction(llvm::Function *f) { blocks.clear(); instMap.clear(); @@ -810,6 +838,13 @@ LogicalResult Importer::processFunction(llvm::Function *f) { b.setInsertionPoint(module.getBody(), getFuncInsertPt()); LLVMFuncOp fop = b.create(UnknownLoc::get(context), f->getName(), functionType); + + if (FlatSymbolRefAttr personality = getPersonalityAsAttr(f)) + fop.setAttr(b.getIdentifier("personality"), personality); + else if (f->hasPersonalityFn()) + emitWarning(UnknownLoc::get(context), + "could not deduce personality, skipping it"); + if (f->isDeclaration()) return success(); diff --git a/mlir/lib/Target/LLVMIR/LLVMAVX512Intr.cpp b/mlir/lib/Target/LLVMIR/LLVMAVX512Intr.cpp new file mode 100644 index 0000000000000..216ae862d4b2c --- /dev/null +++ b/mlir/lib/Target/LLVMIR/LLVMAVX512Intr.cpp @@ -0,0 +1,51 @@ +//===- AVX512Intr.cpp - Convert MLIR LLVM dialect to LLVM intrinsics ------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a translation between the MLIR LLVM and AVX512 dialects +// and LLVM IR with AVX intrinsics. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/LLVMIR/LLVMAVX512Dialect.h" +#include "mlir/Target/LLVMIR/ModuleTranslation.h" +#include "mlir/Translation.h" +#include "llvm/IR/IntrinsicsX86.h" + +using namespace mlir; + +namespace { +class LLVMAVX512ModuleTranslation : public LLVM::ModuleTranslation { + friend LLVM::ModuleTranslation; + +public: + using LLVM::ModuleTranslation::ModuleTranslation; + +protected: + LogicalResult convertOperation(Operation &opInst, + llvm::IRBuilder<> &builder) override { +#include "mlir/Dialect/LLVMIR/LLVMAVX512Conversions.inc" + + return LLVM::ModuleTranslation::convertOperation(opInst, builder); + } +}; + +std::unique_ptr translateLLVMAVX512ModuleToLLVMIR(Operation *m) { + return LLVM::ModuleTranslation::translateModule( + m); +} +} // end namespace + +static TranslateFromMLIRRegistration + reg("avx512-mlir-to-llvmir", [](ModuleOp module, raw_ostream &output) { + auto llvmModule = translateLLVMAVX512ModuleToLLVMIR(module); + if (!llvmModule) + return failure(); + + llvmModule->print(output, nullptr); + return success(); + }); diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 2c3a68fa10816..8bc76870e8f70 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -99,7 +99,8 @@ llvm::Constant *ModuleTranslation::getLLVMConstant(llvm::Type *llvmType, if (auto floatAttr = attr.dyn_cast()) return llvm::ConstantFP::get(llvmType, floatAttr.getValue()); if (auto funcAttr = attr.dyn_cast()) - return functionMapping.lookup(funcAttr.getValue()); + return llvm::ConstantExpr::getBitCast( + functionMapping.lookup(funcAttr.getValue()), llvmType); if (auto splatAttr = attr.dyn_cast()) { auto *sequentialType = cast(llvmType); auto elementType = sequentialType->getElementType(); @@ -353,6 +354,7 @@ LogicalResult ModuleTranslation::convertOperation(Operation &opInst, if (auto constOperand = dyn_cast(operand)) lpi->addClause(constOperand); } + valueMapping[lpOp.getResult()] = lpi; return success(); } @@ -470,7 +472,8 @@ LogicalResult ModuleTranslation::convertGlobals() { auto linkage = convertLinkageToLLVM(op.linkage()); bool anyExternalLinkage = - (linkage == llvm::GlobalVariable::ExternalLinkage || + ((linkage == llvm::GlobalVariable::ExternalLinkage && + isa(cst)) || linkage == llvm::GlobalVariable::ExternalWeakLinkage); auto addrSpace = op.addr_space().getLimitedValue(); auto *var = new llvm::GlobalVariable( @@ -584,6 +587,14 @@ LogicalResult ModuleTranslation::convertOneFunction(LLVMFuncOp func) { argIdx++; } + // Check the personality and set it. + if (func.personality().hasValue()) { + llvm::Type *ty = llvm::Type::getInt8PtrTy(llvmFunc->getContext()); + if (llvm::Constant *pfunc = + getLLVMConstant(ty, func.personalityAttr(), func.getLoc())) + llvmFunc->setPersonalityFn(pfunc); + } + // First, create all blocks so we can jump to them. 
llvm::LLVMContext &llvmContext = llvmFunc->getContext(); for (auto &bb : func) { @@ -645,8 +656,10 @@ SmallVector ModuleTranslation::lookupValues(ValueRange values) { SmallVector remapped; remapped.reserve(values.size()); - for (Value v : values) + for (Value v : values) { + assert(valueMapping.count(v) && "referencing undefined value"); remapped.push_back(valueMapping.lookup(v)); + } return remapped; } diff --git a/mlir/lib/Transforms/CMakeLists.txt b/mlir/lib/Transforms/CMakeLists.txt index 6d1d7b41d568e..8d3c5f46f8c4b 100644 --- a/mlir/lib/Transforms/CMakeLists.txt +++ b/mlir/lib/Transforms/CMakeLists.txt @@ -1,8 +1,6 @@ add_subdirectory(Utils) add_mlir_library(MLIRTransforms - AffineDataCopyGeneration.cpp - AffineLoopInvariantCodeMotion.cpp Canonicalizer.cpp CSE.cpp DialectConversion.cpp @@ -11,16 +9,11 @@ add_mlir_library(MLIRTransforms LoopCoalescing.cpp LoopFusion.cpp LoopInvariantCodeMotion.cpp - LoopTiling.cpp - LoopUnrollAndJam.cpp - LoopUnroll.cpp MemRefDataFlowOpt.cpp OpStats.cpp PipelineDataTransfer.cpp - SimplifyAffineStructures.cpp StripDebugInfo.cpp SymbolDCE.cpp - Vectorize.cpp ViewOpGraph.cpp ViewRegionGraph.cpp @@ -33,7 +26,7 @@ add_mlir_library(MLIRTransforms target_link_libraries(MLIRTransforms PUBLIC - MLIRAffineOps + MLIRAffine MLIRAnalysis MLIRLoopLikeInterface MLIRLoopOps diff --git a/mlir/lib/Transforms/LoopFusion.cpp b/mlir/lib/Transforms/LoopFusion.cpp index 378e91a214bd5..2f08f95261f27 100644 --- a/mlir/lib/Transforms/LoopFusion.cpp +++ b/mlir/lib/Transforms/LoopFusion.cpp @@ -14,7 +14,7 @@ #include "mlir/Analysis/AffineStructures.h" #include "mlir/Analysis/LoopAnalysis.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" diff --git a/mlir/lib/Transforms/LoopUnrollAndJam.cpp b/mlir/lib/Transforms/LoopUnrollAndJam.cpp deleted file mode 100644 index 4629e2c94bd4e..0000000000000 --- a/mlir/lib/Transforms/LoopUnrollAndJam.cpp +++ /dev/null @@ -1,235 +0,0 @@ -//===- LoopUnrollAndJam.cpp - Code to perform loop unroll and jam ---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements loop unroll and jam. Unroll and jam is a transformation -// that improves locality, in particular, register reuse, while also improving -// operation level parallelism. The example below shows what it does in nearly -// the general case. Loop unroll and jam currently works if the bounds of the -// loops inner to the loop being unroll-jammed do not depend on the latter. -// -// Before After unroll and jam of i by factor 2: -// -// for i, step = 2 -// for i S1(i); -// S1; S2(i); -// S2; S1(i+1); -// for j S2(i+1); -// S3; for j -// S4; S3(i, j); -// S5; S4(i, j); -// S6; S3(i+1, j) -// S4(i+1, j) -// S5(i); -// S6(i); -// S5(i+1); -// S6(i+1); -// -// Note: 'if/else' blocks are not jammed. So, if there are loops inside if -// op's, bodies of those loops will not be jammed. 
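A hedged usage sketch of the entry points that replace this pass-local implementation (they are re-homed in LoopUtils.cpp later in this patch; `forOp` is assumed to be a valid affine loop):

// Not part of the patch: unroll-and-jam an affine.for by a factor of 2. This
// fails gracefully when the constant trip count is smaller than the factor or
// the lower bound is a multi-result map.
static void unrollJamExample(mlir::AffineForOp forOp) {
  if (mlir::failed(mlir::loopUnrollJamByFactor(forOp, /*unrollJamFactor=*/2)))
    return;
}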
-//===----------------------------------------------------------------------===// -#include "mlir/Transforms/Passes.h" - -#include "mlir/Analysis/LoopAnalysis.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" -#include "mlir/IR/AffineExpr.h" -#include "mlir/IR/AffineMap.h" -#include "mlir/IR/BlockAndValueMapping.h" -#include "mlir/IR/Builders.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Transforms/LoopUtils.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/Support/CommandLine.h" - -using namespace mlir; - -#define DEBUG_TYPE "affine-loop-unroll-jam" - -static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); - -// Loop unroll and jam factor. -static llvm::cl::opt - clUnrollJamFactor("unroll-jam-factor", llvm::cl::Hidden, - llvm::cl::desc("Use this unroll jam factor for all loops" - " (default 4)"), - llvm::cl::cat(clOptionsCategory)); - -namespace { -/// Loop unroll jam pass. Currently, this just unroll jams the first -/// outer loop in a Function. -struct LoopUnrollAndJam : public FunctionPass { - Optional unrollJamFactor; - static const unsigned kDefaultUnrollJamFactor = 4; - - explicit LoopUnrollAndJam(Optional unrollJamFactor = None) - : unrollJamFactor(unrollJamFactor) {} - - void runOnFunction() override; - LogicalResult runOnAffineForOp(AffineForOp forOp); -}; -} // end anonymous namespace - -std::unique_ptr> -mlir::createLoopUnrollAndJamPass(int unrollJamFactor) { - return std::make_unique( - unrollJamFactor == -1 ? None : Optional(unrollJamFactor)); -} - -void LoopUnrollAndJam::runOnFunction() { - // Currently, just the outermost loop from the first loop nest is - // unroll-and-jammed by this pass. However, runOnAffineForOp can be called on - // any for operation. - auto &entryBlock = getFunction().front(); - if (auto forOp = dyn_cast(entryBlock.front())) - runOnAffineForOp(forOp); -} - -/// Unroll and jam a 'affine.for' op. Default unroll jam factor is -/// kDefaultUnrollJamFactor. Return failure if nothing was done. -LogicalResult LoopUnrollAndJam::runOnAffineForOp(AffineForOp forOp) { - // Unroll and jam by the factor that was passed if any. - if (unrollJamFactor.hasValue()) - return loopUnrollJamByFactor(forOp, unrollJamFactor.getValue()); - // Otherwise, unroll jam by the command-line factor if one was specified. - if (clUnrollJamFactor.getNumOccurrences() > 0) - return loopUnrollJamByFactor(forOp, clUnrollJamFactor); - - // Unroll and jam by four otherwise. - return loopUnrollJamByFactor(forOp, kDefaultUnrollJamFactor); -} - -LogicalResult mlir::loopUnrollJamUpToFactor(AffineForOp forOp, - uint64_t unrollJamFactor) { - Optional mayBeConstantTripCount = getConstantTripCount(forOp); - - if (mayBeConstantTripCount.hasValue() && - mayBeConstantTripCount.getValue() < unrollJamFactor) - return loopUnrollJamByFactor(forOp, mayBeConstantTripCount.getValue()); - return loopUnrollJamByFactor(forOp, unrollJamFactor); -} - -/// Unrolls and jams this loop by the specified factor. -LogicalResult mlir::loopUnrollJamByFactor(AffineForOp forOp, - uint64_t unrollJamFactor) { - // Gathers all maximal sub-blocks of operations that do not themselves - // include a for op (a operation could have a descendant for op though - // in its tree). Ignore the block terminators. - struct JamBlockGatherer { - // Store iterators to the first and last op of each sub-block found. - std::vector> subBlocks; - - // This is a linear time walk. 
- void walk(Operation *op) { - for (auto ®ion : op->getRegions()) - for (auto &block : region) - walk(block); - } - void walk(Block &block) { - for (auto it = block.begin(), e = std::prev(block.end()); it != e;) { - auto subBlockStart = it; - while (it != e && !isa(&*it)) - ++it; - if (it != subBlockStart) - subBlocks.push_back({subBlockStart, std::prev(it)}); - // Process all for insts that appear next. - while (it != e && isa(&*it)) - walk(&*it++); - } - } - }; - - assert(unrollJamFactor >= 1 && "unroll jam factor should be >= 1"); - - if (unrollJamFactor == 1) - return promoteIfSingleIteration(forOp); - - if (forOp.getBody()->empty() || - forOp.getBody()->begin() == std::prev(forOp.getBody()->end())) - return failure(); - - // Loops where both lower and upper bounds are multi-result maps won't be - // unrolled (since the trip can't be expressed as an affine function in - // general). - // TODO(mlir-team): this may not be common, but we could support the case - // where the lower bound is a multi-result map and the ub is a single result - // one. - if (forOp.getLowerBoundMap().getNumResults() != 1) - return failure(); - - Optional mayBeConstantTripCount = getConstantTripCount(forOp); - // If the trip count is lower than the unroll jam factor, no unroll jam. - if (mayBeConstantTripCount.hasValue() && - mayBeConstantTripCount.getValue() < unrollJamFactor) - return failure(); - - auto *forInst = forOp.getOperation(); - - // Gather all sub-blocks to jam upon the loop being unrolled. - JamBlockGatherer jbg; - jbg.walk(forInst); - auto &subBlocks = jbg.subBlocks; - - // Generate the cleanup loop if trip count isn't a multiple of - // unrollJamFactor. - if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) { - // Insert the cleanup loop right after 'forOp'. - OpBuilder builder(forInst->getBlock(), std::next(Block::iterator(forInst))); - auto cleanupAffineForOp = cast(builder.clone(*forInst)); - // Adjust the lower bound of the cleanup loop; its upper bound is the same - // as the original loop's upper bound. - AffineMap cleanupMap; - SmallVector cleanupOperands; - getCleanupLoopLowerBound(forOp, unrollJamFactor, &cleanupMap, - &cleanupOperands, builder); - cleanupAffineForOp.setLowerBound(cleanupOperands, cleanupMap); - - // Promote the cleanup loop if it has turned into a single iteration loop. - promoteIfSingleIteration(cleanupAffineForOp); - - // Adjust the upper bound of the original loop - it will be the same as the - // cleanup loop's lower bound. Its lower bound remains unchanged. - forOp.setUpperBound(cleanupOperands, cleanupMap); - } - - // Scale the step of loop being unroll-jammed by the unroll-jam factor. - int64_t step = forOp.getStep(); - forOp.setStep(step * unrollJamFactor); - - auto forOpIV = forOp.getInductionVar(); - // Unroll and jam (appends unrollJamFactor - 1 additional copies). - for (unsigned i = unrollJamFactor - 1; i >= 1; --i) { - // Operand map persists across all sub-blocks. - BlockAndValueMapping operandMapping; - for (auto &subBlock : subBlocks) { - // Builder to insert unroll-jammed bodies. Insert right at the end of - // sub-block. - OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second)); - - // If the induction variable is used, create a remapping to the value for - // this unrolled instance. - if (!forOpIV.use_empty()) { - // iv' = iv + i, i = 1 to unrollJamFactor-1. 
- auto d0 = builder.getAffineDimExpr(0); - auto bumpMap = AffineMap::get(1, 0, {d0 + i * step}); - auto ivUnroll = - builder.create(forInst->getLoc(), bumpMap, forOpIV); - operandMapping.map(forOpIV, ivUnroll); - } - // Clone the sub-block being unroll-jammed. - for (auto it = subBlock.first; it != std::next(subBlock.second); ++it) { - builder.clone(*it, operandMapping); - } - } - } - - // Promote the loop body up if this has turned into a single iteration loop. - promoteIfSingleIteration(forOp); - return success(); -} - -static PassRegistration pass("affine-loop-unroll-jam", - "Unroll and jam loops"); diff --git a/mlir/lib/Transforms/MemRefDataFlowOpt.cpp b/mlir/lib/Transforms/MemRefDataFlowOpt.cpp index c1128c949baff..eaf5c744723ce 100644 --- a/mlir/lib/Transforms/MemRefDataFlowOpt.cpp +++ b/mlir/lib/Transforms/MemRefDataFlowOpt.cpp @@ -16,7 +16,7 @@ #include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/Dominance.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/Passes.h" diff --git a/mlir/lib/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Transforms/PipelineDataTransfer.cpp index 39874b1bc44a0..df7bafc4b90b0 100644 --- a/mlir/lib/Transforms/PipelineDataTransfer.cpp +++ b/mlir/lib/Transforms/PipelineDataTransfer.cpp @@ -15,7 +15,7 @@ #include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/LoopAnalysis.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopUtils.h" diff --git a/mlir/lib/Transforms/Utils/CMakeLists.txt b/mlir/lib/Transforms/Utils/CMakeLists.txt index 1e0442179bf41..e28a97c182313 100644 --- a/mlir/lib/Transforms/Utils/CMakeLists.txt +++ b/mlir/lib/Transforms/Utils/CMakeLists.txt @@ -16,7 +16,7 @@ add_mlir_library(MLIRTransformUtils target_link_libraries(MLIRTransformUtils PUBLIC - MLIRAffineOps + MLIRAffine MLIRAnalysis MLIRLoopAnalysis MLIRLoopOps diff --git a/mlir/lib/Transforms/Utils/FoldUtils.cpp b/mlir/lib/Transforms/Utils/FoldUtils.cpp index 66535ec678972..f2099bca75ea1 100644 --- a/mlir/lib/Transforms/Utils/FoldUtils.cpp +++ b/mlir/lib/Transforms/Utils/FoldUtils.cpp @@ -24,8 +24,8 @@ using namespace mlir; /// inserted into. static Region *getInsertionRegion( DialectInterfaceCollection &interfaces, - Operation *op) { - while (Region *region = op->getParentRegion()) { + Block *insertionBlock) { + while (Region *region = insertionBlock->getParent()) { // Insert in this region for any of the following scenarios: // * The parent is unregistered, or is known to be isolated from above. // * The parent is a top-level operation. @@ -40,7 +40,7 @@ static Region *getInsertionRegion( return region; // Traverse up the parent looking for an insertion region. - op = parentOp; + insertionBlock = parentOp->getBlock(); } llvm_unreachable("expected valid insertion region"); } @@ -82,20 +82,21 @@ LogicalResult OperationFolder::tryToFold( // Try to fold the operation. SmallVector results; - if (failed(tryToFold(op, results, processGeneratedConstants))) + OpBuilder builder(op); + if (failed(tryToFold(builder, op, results, processGeneratedConstants))) return failure(); - // Constant folding succeeded. We will start replacing this op's uses and - // eventually erase this op. 
Invoke the callback provided by the caller to - // perform any pre-replacement action. - if (preReplaceAction) - preReplaceAction(op); - // Check to see if the operation was just updated in place. if (results.empty()) return success(); - // Otherwise, replace all of the result values and erase the operation. + // Constant folding succeeded. We will start replacing this op's uses and + // erase this op. Invoke the callback provided by the caller to perform any + // pre-replacement action. + if (preReplaceAction) + preReplaceAction(op); + + // Replace all of the result values and erase the operation. for (unsigned i = 0, e = results.size(); i != e; ++i) op->getResult(i).replaceAllUsesWith(results[i]); op->erase(); @@ -117,7 +118,8 @@ void OperationFolder::notifyRemoval(Operation *op) { assert(constValue); // Get the constant map that this operation was uniqued in. - auto &uniquedConstants = foldScopes[getInsertionRegion(interfaces, op)]; + auto &uniquedConstants = + foldScopes[getInsertionRegion(interfaces, op->getBlock())]; // Erase all of the references to this operation. auto type = op->getResult(0).getType(); @@ -135,7 +137,7 @@ void OperationFolder::clear() { /// Tries to perform folding on the given `op`. If successful, populates /// `results` with the results of the folding. LogicalResult OperationFolder::tryToFold( - Operation *op, SmallVectorImpl &results, + OpBuilder &builder, Operation *op, SmallVectorImpl &results, function_ref processGeneratedConstants) { SmallVector operandConstants; SmallVector foldResults; @@ -164,9 +166,11 @@ LogicalResult OperationFolder::tryToFold( // Create a builder to insert new operations into the entry block of the // insertion region. - auto *insertRegion = getInsertionRegion(interfaces, op); + auto *insertRegion = + getInsertionRegion(interfaces, builder.getInsertionBlock()); auto &entry = insertRegion->front(); - OpBuilder builder(&entry, entry.begin()); + OpBuilder::InsertionGuard foldGuard(builder); + builder.setInsertionPoint(&entry, entry.begin()); // Get the constant map for the insertion region of this operation. 
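A hedged usage sketch of the folder API after this change (the constructor and the defaulted callback parameters are assumed from the header, which is not shown in this patch):

// Not part of the patch: fold a single operation. The folder now materializes
// constants through an OpBuilder whose insertion block determines the scope,
// as computed by getInsertionRegion above.
static void foldOne(mlir::Operation *op) {
  mlir::OperationFolder folder(op->getContext());
  if (mlir::succeeded(folder.tryToFold(op))) {
    // op was folded: either updated in place, or replaced by constants and
    // erased.
  }
}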
auto &uniquedConstants = foldScopes[insertRegion]; diff --git a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp b/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp index 47a3fcf3d519e..9ed4283101af5 100644 --- a/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopFusionUtils.cpp @@ -16,7 +16,7 @@ #include "mlir/Analysis/AffineStructures.h" #include "mlir/Analysis/LoopAnalysis.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/BlockAndValueMapping.h" diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp index 96b4e72eff488..a1c888b35af8c 100644 --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -16,7 +16,7 @@ #include "mlir/Analysis/LoopAnalysis.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/LoopOps/LoopOps.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/BlockAndValueMapping.h" @@ -486,6 +486,135 @@ LogicalResult mlir::loopUnrollByFactor(AffineForOp forOp, return success(); } +LogicalResult mlir::loopUnrollJamUpToFactor(AffineForOp forOp, + uint64_t unrollJamFactor) { + Optional mayBeConstantTripCount = getConstantTripCount(forOp); + + if (mayBeConstantTripCount.hasValue() && + mayBeConstantTripCount.getValue() < unrollJamFactor) + return loopUnrollJamByFactor(forOp, mayBeConstantTripCount.getValue()); + return loopUnrollJamByFactor(forOp, unrollJamFactor); +} + +/// Unrolls and jams this loop by the specified factor. +LogicalResult mlir::loopUnrollJamByFactor(AffineForOp forOp, + uint64_t unrollJamFactor) { + // Gathers all maximal sub-blocks of operations that do not themselves + // include a for op (a operation could have a descendant for op though + // in its tree). Ignore the block terminators. + struct JamBlockGatherer { + // Store iterators to the first and last op of each sub-block found. + std::vector> subBlocks; + + // This is a linear time walk. + void walk(Operation *op) { + for (auto ®ion : op->getRegions()) + for (auto &block : region) + walk(block); + } + void walk(Block &block) { + for (auto it = block.begin(), e = std::prev(block.end()); it != e;) { + auto subBlockStart = it; + while (it != e && !isa(&*it)) + ++it; + if (it != subBlockStart) + subBlocks.push_back({subBlockStart, std::prev(it)}); + // Process all for insts that appear next. + while (it != e && isa(&*it)) + walk(&*it++); + } + } + }; + + assert(unrollJamFactor >= 1 && "unroll jam factor should be >= 1"); + + if (unrollJamFactor == 1) + return promoteIfSingleIteration(forOp); + + if (forOp.getBody()->empty() || + forOp.getBody()->begin() == std::prev(forOp.getBody()->end())) + return failure(); + + // Loops where both lower and upper bounds are multi-result maps won't be + // unrolled (since the trip can't be expressed as an affine function in + // general). + // TODO(mlir-team): this may not be common, but we could support the case + // where the lower bound is a multi-result map and the ub is a single result + // one. + if (forOp.getLowerBoundMap().getNumResults() != 1) + return failure(); + + Optional mayBeConstantTripCount = getConstantTripCount(forOp); + // If the trip count is lower than the unroll jam factor, no unroll jam. 
+ if (mayBeConstantTripCount.hasValue() && + mayBeConstantTripCount.getValue() < unrollJamFactor) + return failure(); + + auto *forInst = forOp.getOperation(); + + // Gather all sub-blocks to jam upon the loop being unrolled. + JamBlockGatherer jbg; + jbg.walk(forInst); + auto &subBlocks = jbg.subBlocks; + + // Generate the cleanup loop if trip count isn't a multiple of + // unrollJamFactor. + if (getLargestDivisorOfTripCount(forOp) % unrollJamFactor != 0) { + // Insert the cleanup loop right after 'forOp'. + OpBuilder builder(forInst->getBlock(), std::next(Block::iterator(forInst))); + auto cleanupAffineForOp = cast(builder.clone(*forInst)); + // Adjust the lower bound of the cleanup loop; its upper bound is the same + // as the original loop's upper bound. + AffineMap cleanupMap; + SmallVector cleanupOperands; + getCleanupLoopLowerBound(forOp, unrollJamFactor, &cleanupMap, + &cleanupOperands, builder); + cleanupAffineForOp.setLowerBound(cleanupOperands, cleanupMap); + + // Promote the cleanup loop if it has turned into a single iteration loop. + promoteIfSingleIteration(cleanupAffineForOp); + + // Adjust the upper bound of the original loop - it will be the same as the + // cleanup loop's lower bound. Its lower bound remains unchanged. + forOp.setUpperBound(cleanupOperands, cleanupMap); + } + + // Scale the step of loop being unroll-jammed by the unroll-jam factor. + int64_t step = forOp.getStep(); + forOp.setStep(step * unrollJamFactor); + + auto forOpIV = forOp.getInductionVar(); + // Unroll and jam (appends unrollJamFactor - 1 additional copies). + for (unsigned i = unrollJamFactor - 1; i >= 1; --i) { + // Operand map persists across all sub-blocks. + BlockAndValueMapping operandMapping; + for (auto &subBlock : subBlocks) { + // Builder to insert unroll-jammed bodies. Insert right at the end of + // sub-block. + OpBuilder builder(subBlock.first->getBlock(), std::next(subBlock.second)); + + // If the induction variable is used, create a remapping to the value for + // this unrolled instance. + if (!forOpIV.use_empty()) { + // iv' = iv + i, i = 1 to unrollJamFactor-1. + auto d0 = builder.getAffineDimExpr(0); + auto bumpMap = AffineMap::get(1, 0, {d0 + i * step}); + auto ivUnroll = + builder.create(forInst->getLoc(), bumpMap, forOpIV); + operandMapping.map(forOpIV, ivUnroll); + } + // Clone the sub-block being unroll-jammed. + for (auto it = subBlock.first; it != std::next(subBlock.second); ++it) { + builder.clone(*it, operandMapping); + } + } + } + + // Promote the loop body up if this has turned into a single iteration loop. + promoteIfSingleIteration(forOp); + return success(); +} + /// Performs loop interchange on 'forOpA' and 'forOpB', where 'forOpB' is /// nested within 'forOpA' as the only non-terminator operation in its block. 
void mlir::interchangeLoops(AffineForOp forOpA, AffineForOp forOpB) { diff --git a/mlir/lib/Transforms/Utils/Utils.cpp b/mlir/lib/Transforms/Utils/Utils.cpp index 844befecea366..4ab773eb38df0 100644 --- a/mlir/lib/Transforms/Utils/Utils.cpp +++ b/mlir/lib/Transforms/Utils/Utils.cpp @@ -18,7 +18,7 @@ #include "mlir/Analysis/AffineStructures.h" #include "mlir/Analysis/Dominance.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Function.h" #include "mlir/IR/Module.h" diff --git a/mlir/test/Conversion/AVX512ToLLVM/convert-to-llvm.mlir b/mlir/test/Conversion/AVX512ToLLVM/convert-to-llvm.mlir new file mode 100644 index 0000000000000..936819e27eb90 --- /dev/null +++ b/mlir/test/Conversion/AVX512ToLLVM/convert-to-llvm.mlir @@ -0,0 +1,17 @@ +// RUN: mlir-opt %s -convert-avx512-to-llvm | mlir-opt | FileCheck %s + +func @avx512_mask_rndscale(%a: vector<16xf32>, %b: vector<8xf64>, %i32: i32, %i16: i16, %i8: i8) + -> (vector<16xf32>, vector<8xf64>) +{ + // CHECK: llvm_avx512.mask.rndscale.ps.512 + %0 = avx512.mask.rndscale %a, %i32, %a, %i16, %i32: vector<16xf32> + // CHECK: llvm_avx512.mask.rndscale.pd.512 + %1 = avx512.mask.rndscale %b, %i32, %b, %i8, %i32: vector<8xf64> + + // CHECK: llvm_avx512.mask.scalef.ps.512 + %a0 = avx512.mask.scalef %a, %a, %a, %i16, %i32: vector<16xf32> + // CHECK: llvm_avx512.mask.scalef.pd.512 + %a1 = avx512.mask.scalef %b, %b, %b, %i8, %i32: vector<8xf64> + + return %a0, %a1: vector<16xf32>, vector<8xf64> +} diff --git a/mlir/test/Conversion/GPUToSPIRV/if.mlir b/mlir/test/Conversion/GPUToSPIRV/if.mlir index 1585c53116c5d..8a8aa1c888133 100644 --- a/mlir/test/Conversion/GPUToSPIRV/if.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/if.mlir @@ -1,6 +1,12 @@ // RUN: mlir-opt -convert-gpu-to-spirv %s -o - | FileCheck %s -module attributes {gpu.container_module} { +module attributes { + gpu.container_module, + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { func @main(%arg0 : memref<10xf32>, %arg1 : i1) { %c0 = constant 1 : index "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0, %arg0, %arg1) { kernel = "kernel_simple_selection", kernel_module = @kernels} : (index, index, index, index, index, index, memref<10xf32>, i1) -> () diff --git a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir index 6588de8700575..05c9d90c498c2 100644 --- a/mlir/test/Conversion/GPUToSPIRV/load-store.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/load-store.mlir @@ -1,6 +1,12 @@ // RUN: mlir-opt -convert-gpu-to-spirv %s -o - | FileCheck %s -module attributes {gpu.container_module} { +module attributes { + gpu.container_module, + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { func @load_store(%arg0: memref<12x4xf32>, %arg1: memref<12x4xf32>, %arg2: memref<12x4xf32>) { %c0 = constant 0 : index %c12 = constant 12 : index @@ -21,9 +27,9 @@ module attributes {gpu.container_module} { // CHECK-DAG: spv.globalVariable [[LOCALINVOCATIONIDVAR:@.*]] built_in("LocalInvocationId") : !spv.ptr, Input> // CHECK-DAG: spv.globalVariable [[WORKGROUPIDVAR:@.*]] built_in("WorkgroupId") : !spv.ptr, Input> // CHECK-LABEL: spv.func @load_store_kernel - // CHECK-SAME: [[ARG0:%.*]]: !spv.ptr [0]>, StorageBuffer> 
{spv.interface_var_abi = {binding = 0 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} - // CHECK-SAME: [[ARG1:%.*]]: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 1 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} - // CHECK-SAME: [[ARG2:%.*]]: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 2 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} + // CHECK-SAME: [[ARG0:%.*]]: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 0 : i32, descriptor_set = 0 : i32{{[}][}]}} + // CHECK-SAME: [[ARG1:%.*]]: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 1 : i32, descriptor_set = 0 : i32{{[}][}]}} + // CHECK-SAME: [[ARG2:%.*]]: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 2 : i32, descriptor_set = 0 : i32{{[}][}]}} // CHECK-SAME: [[ARG3:%.*]]: i32 {spv.interface_var_abi = {binding = 3 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} // CHECK-SAME: [[ARG4:%.*]]: i32 {spv.interface_var_abi = {binding = 4 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} // CHECK-SAME: [[ARG5:%.*]]: i32 {spv.interface_var_abi = {binding = 5 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} diff --git a/mlir/test/Conversion/GPUToSPIRV/loop.mlir b/mlir/test/Conversion/GPUToSPIRV/loop.mlir index 7044d5474d3c5..8adc5e355f087 100644 --- a/mlir/test/Conversion/GPUToSPIRV/loop.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/loop.mlir @@ -1,6 +1,12 @@ // RUN: mlir-opt -convert-gpu-to-spirv %s -o - | FileCheck %s -module attributes {gpu.container_module} { +module attributes { + gpu.container_module, + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { func @loop(%arg0 : memref<10xf32>, %arg1 : memref<10xf32>) { %c0 = constant 1 : index "gpu.launch_func"(%c0, %c0, %c0, %c0, %c0, %c0, %arg0, %arg1) { kernel = "loop_kernel", kernel_module = @kernels} : (index, index, index, index, index, index, memref<10xf32>, memref<10xf32>) -> () diff --git a/mlir/test/Conversion/GPUToSPIRV/simple.mlir b/mlir/test/Conversion/GPUToSPIRV/simple.mlir index d9b32a6e571b7..3076cd04b9fe1 100644 --- a/mlir/test/Conversion/GPUToSPIRV/simple.mlir +++ b/mlir/test/Conversion/GPUToSPIRV/simple.mlir @@ -5,7 +5,7 @@ module attributes {gpu.container_module} { // CHECK: spv.module Logical GLSL450 { // CHECK-LABEL: spv.func @basic_module_structure // CHECK-SAME: {{%.*}}: f32 {spv.interface_var_abi = {binding = 0 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} - // CHECK-SAME: {{%.*}}: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 1 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32{{[}][}]}} + // CHECK-SAME: {{%.*}}: !spv.ptr [0]>, StorageBuffer> {spv.interface_var_abi = {binding = 1 : i32, descriptor_set = 0 : i32{{[}][}]}} // CHECK-SAME: spv.entry_point_abi = {local_size = dense<[32, 4, 1]> : vector<3xi32>} gpu.func @basic_module_structure(%arg0 : f32, %arg1 : memref<12xf32>) attributes {gpu.kernel, spv.entry_point_abi = {local_size = dense<[32, 4, 1]>: vector<3xi32>}} { diff --git a/mlir/test/Conversion/LoopsToGPU/imperfect_2D.mlir b/mlir/test/Conversion/LoopsToGPU/imperfect_2D.mlir index 59d18a1d7654e..49562a7f7840f 100644 --- a/mlir/test/Conversion/LoopsToGPU/imperfect_2D.mlir +++ b/mlir/test/Conversion/LoopsToGPU/imperfect_2D.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-loop-op-to-gpu 
-gpu-num-workgroups=2,2 -gpu-workgroup-size=32,4 %s | FileCheck %s +// RUN: mlir-opt -convert-loop-op-to-gpu="gpu-num-workgroups=2,2 gpu-workgroup-size=32,4" %s | FileCheck %s module { // arg2 = arg0 * transpose(arg1) ; with intermediate buffer and tile size passed as argument diff --git a/mlir/test/Conversion/LoopsToGPU/imperfect_3D.mlir b/mlir/test/Conversion/LoopsToGPU/imperfect_3D.mlir index 73f0ab7d71bae..f6cc5e2398b5b 100644 --- a/mlir/test/Conversion/LoopsToGPU/imperfect_3D.mlir +++ b/mlir/test/Conversion/LoopsToGPU/imperfect_3D.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-loop-op-to-gpu -gpu-num-workgroups=4,2,2 -gpu-workgroup-size=32,2,2 %s | FileCheck %s +// RUN: mlir-opt -convert-loop-op-to-gpu="gpu-num-workgroups=4,2,2 gpu-workgroup-size=32,2,2" %s | FileCheck %s module { func @imperfect_3D(%arg0 : memref, %arg1 : memref, %arg2 : memref, %arg3 : memref, %t1 : index, %t2 : index, %t3 : index, %step1 : index, %step2 : index, %step3 : index) { @@ -80,4 +80,4 @@ module { } return } -} \ No newline at end of file +} diff --git a/mlir/test/Conversion/LoopsToGPU/imperfect_4D.mlir b/mlir/test/Conversion/LoopsToGPU/imperfect_4D.mlir index 2c5dd5c0fb2fa..8858a3e5e6315 100644 --- a/mlir/test/Conversion/LoopsToGPU/imperfect_4D.mlir +++ b/mlir/test/Conversion/LoopsToGPU/imperfect_4D.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-loop-op-to-gpu -gpu-num-workgroups=4,2,2 -gpu-workgroup-size=32,2,2 %s | FileCheck %s +// RUN: mlir-opt -convert-loop-op-to-gpu="gpu-num-workgroups=4,2,2 gpu-workgroup-size=32,2,2" %s | FileCheck %s module { func @imperfect_3D(%arg0 : memref, %arg1 : memref, %arg2 : memref, %arg3 : memref, %t1 : index, %t2 : index, %t3 : index, %t4 : index, %step1 : index, %step2 : index, %step3 : index, %step4 : index) { @@ -83,4 +83,4 @@ module { } return } -} \ No newline at end of file +} diff --git a/mlir/test/Conversion/LoopsToGPU/imperfect_linalg.mlir b/mlir/test/Conversion/LoopsToGPU/imperfect_linalg.mlir index abf8da6b562ea..4ffb8906d4d6a 100644 --- a/mlir/test/Conversion/LoopsToGPU/imperfect_linalg.mlir +++ b/mlir/test/Conversion/LoopsToGPU/imperfect_linalg.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-loop-op-to-gpu -gpu-num-workgroups=2,16 -gpu-workgroup-size=32,4 | FileCheck %s +// RUN: mlir-opt %s -convert-loop-op-to-gpu="gpu-num-workgroups=2,16 gpu-workgroup-size=32,4" | FileCheck %s module { func @fmul(%arg0: memref, %arg1: memref, %arg2: memref) { diff --git a/mlir/test/Conversion/LoopsToGPU/no_blocks_no_threads.mlir b/mlir/test/Conversion/LoopsToGPU/no_blocks_no_threads.mlir index 51cedeb63cf46..6100a10e704f8 100644 --- a/mlir/test/Conversion/LoopsToGPU/no_blocks_no_threads.mlir +++ b/mlir/test/Conversion/LoopsToGPU/no_blocks_no_threads.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt -convert-loops-to-gpu -gpu-block-dims=0 -gpu-thread-dims=1 %s | FileCheck --check-prefix=CHECK-THREADS %s --dump-input-on-failure -// RUN: mlir-opt -convert-loops-to-gpu -gpu-block-dims=1 -gpu-thread-dims=0 %s | FileCheck --check-prefix=CHECK-BLOCKS %s --dump-input-on-failure +// RUN: mlir-opt -convert-loops-to-gpu="gpu-block-dims=0 gpu-thread-dims=1" %s | FileCheck --check-prefix=CHECK-THREADS %s --dump-input-on-failure +// RUN: mlir-opt -convert-loops-to-gpu="gpu-block-dims=1 gpu-thread-dims=0" %s | FileCheck --check-prefix=CHECK-BLOCKS %s --dump-input-on-failure // CHECK-THREADS-LABEL: @one_d_loop // CHECK-BLOCKS-LABEL: @one_d_loop diff --git a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir index 4bc97da954ff9..ab195936d83af 
100644 --- a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir +++ b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir @@ -3,7 +3,7 @@ // 2-d parallel loop mapped to block.y and block.x func @parallel_loop_bidy_bidx(%arg0 : index, %arg1 : index, %arg2 : index, - %arg3 : index, %arg4 : index, + %arg3 : index, %arg4 : index, %buf : memref, %res : memref) { %step = constant 2 : index @@ -334,7 +334,7 @@ func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : ind // expected-error@+1 {{failed to legalize operation 'loop.parallel'}} loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) step (%four, %four) { - // expected-error@+1 {{cannot derive loop-invariant upper bound}} + // expected-error@+1 {{cannot derive loop-invariant upper bound}} loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1) step (%one, %one) { %idx0 = addi %i0, %si0 : index diff --git a/mlir/test/Conversion/LoopsToGPU/perfect_1D_setlaunch.mlir b/mlir/test/Conversion/LoopsToGPU/perfect_1D_setlaunch.mlir index bf437a348b64f..2861b33c9e7bd 100644 --- a/mlir/test/Conversion/LoopsToGPU/perfect_1D_setlaunch.mlir +++ b/mlir/test/Conversion/LoopsToGPU/perfect_1D_setlaunch.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-loop-op-to-gpu -gpu-num-workgroups=2 -gpu-workgroup-size=32 %s | FileCheck %s +// RUN: mlir-opt -convert-loop-op-to-gpu="gpu-num-workgroups=2 gpu-workgroup-size=32" %s | FileCheck %s module { func @foo(%arg0: memref, %arg1 : memref, %arg2 : memref) { @@ -23,4 +23,4 @@ module { } return } -} \ No newline at end of file +} diff --git a/mlir/test/Conversion/LoopsToGPU/step_one.mlir b/mlir/test/Conversion/LoopsToGPU/step_one.mlir index e0cdbd456209e..a088880e5821c 100644 --- a/mlir/test/Conversion/LoopsToGPU/step_one.mlir +++ b/mlir/test/Conversion/LoopsToGPU/step_one.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt -convert-loops-to-gpu -gpu-block-dims=1 -gpu-thread-dims=1 %s | FileCheck --check-prefix=CHECK-11 %s -// RUN: mlir-opt -convert-loops-to-gpu -gpu-block-dims=2 -gpu-thread-dims=2 %s | FileCheck --check-prefix=CHECK-22 %s +// RUN: mlir-opt -convert-loops-to-gpu="gpu-block-dims=1 gpu-thread-dims=1" %s | FileCheck --check-prefix=CHECK-11 %s +// RUN: mlir-opt -convert-loops-to-gpu="gpu-block-dims=2 gpu-thread-dims=2" %s | FileCheck --check-prefix=CHECK-22 %s // CHECK-11-LABEL: @step_1 // CHECK-22-LABEL: @step_1 diff --git a/mlir/test/Conversion/LoopsToGPU/step_positive.mlir b/mlir/test/Conversion/LoopsToGPU/step_positive.mlir index 6bedc92abca62..9037eace6584b 100644 --- a/mlir/test/Conversion/LoopsToGPU/step_positive.mlir +++ b/mlir/test/Conversion/LoopsToGPU/step_positive.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-loops-to-gpu -gpu-block-dims=1 -gpu-thread-dims=1 %s | FileCheck %s +// RUN: mlir-opt -convert-loops-to-gpu="gpu-block-dims=1 gpu-thread-dims=1" %s | FileCheck %s // CHECK-LABEL: @step_var func @step_var(%A : memref, %B : memref) { diff --git a/mlir/test/Conversion/StandardToSPIRV/std-to-spirv.mlir b/mlir/test/Conversion/StandardToSPIRV/std-to-spirv.mlir index 341df27460a0b..91219acc0bd54 100644 --- a/mlir/test/Conversion/StandardToSPIRV/std-to-spirv.mlir +++ b/mlir/test/Conversion/StandardToSPIRV/std-to-spirv.mlir @@ -1,105 +1,166 @@ -// RUN: mlir-opt -convert-std-to-spirv %s -o - | FileCheck %s +// RUN: mlir-opt -split-input-file -convert-std-to-spirv %s -o - | FileCheck %s //===----------------------------------------------------------------------===// -// std binary arithmetic ops +// std arithmetic ops 
//===----------------------------------------------------------------------===// -// CHECK-LABEL: @add_sub -func @add_sub(%arg0 : i32, %arg1 : i32) { - // CHECK: spv.IAdd - %0 = addi %arg0, %arg1 : i32 - // CHECK: spv.ISub - %1 = subi %arg0, %arg1 : i32 +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// Check integer operation conversions. +// CHECK-LABEL: @int32_scalar +func @int32_scalar(%lhs: i32, %rhs: i32) { + // CHECK: spv.IAdd %{{.*}}, %{{.*}}: i32 + %0 = addi %lhs, %rhs: i32 + // CHECK: spv.ISub %{{.*}}, %{{.*}}: i32 + %1 = subi %lhs, %rhs: i32 + // CHECK: spv.IMul %{{.*}}, %{{.*}}: i32 + %2 = muli %lhs, %rhs: i32 + // CHECK: spv.SDiv %{{.*}}, %{{.*}}: i32 + %3 = divi_signed %lhs, %rhs: i32 + // CHECK: spv.SRem %{{.*}}, %{{.*}}: i32 + %4 = remi_signed %lhs, %rhs: i32 + // CHECK: spv.UDiv %{{.*}}, %{{.*}}: i32 + %5 = divi_unsigned %lhs, %rhs: i32 + // CHECK: spv.UMod %{{.*}}, %{{.*}}: i32 + %6 = remi_unsigned %lhs, %rhs: i32 return } -// CHECK-LABEL: @fadd_scalar -func @fadd_scalar(%arg: f32) { - // CHECK: spv.FAdd - %0 = addf %arg, %arg : f32 +// Check float unary operation conversions. +// CHECK-LABEL: @float32_unary_scalar +func @float32_unary_scalar(%arg0: f32) { + // CHECK: spv.GLSL.FAbs %{{.*}}: f32 + %0 = absf %arg0 : f32 + // CHECK: spv.GLSL.Ceil %{{.*}}: f32 + %1 = ceilf %arg0 : f32 + // CHECK: spv.GLSL.Cos %{{.*}}: f32 + %2 = cos %arg0 : f32 + // CHECK: spv.GLSL.Exp %{{.*}}: f32 + %3 = exp %arg0 : f32 + // CHECK: spv.GLSL.Log %{{.*}}: f32 + %4 = log %arg0 : f32 + // CHECK: spv.FNegate %{{.*}}: f32 + %5 = negf %arg0 : f32 + // CHECK: spv.GLSL.InverseSqrt %{{.*}}: f32 + %6 = rsqrt %arg0 : f32 + // CHECK: spv.GLSL.Sqrt %{{.*}}: f32 + %7 = sqrt %arg0 : f32 + // CHECK: spv.GLSL.Tanh %{{.*}}: f32 + %8 = tanh %arg0 : f32 return } -// CHECK-LABEL: @fdiv_scalar -func @fdiv_scalar(%arg: f32) { - // CHECK: spv.FDiv - %0 = divf %arg, %arg : f32 +// Check float binary operation conversions. +// CHECK-LABEL: @float32_binary_scalar +func @float32_binary_scalar(%lhs: f32, %rhs: f32) { + // CHECK: spv.FAdd %{{.*}}, %{{.*}}: f32 + %0 = addf %lhs, %rhs: f32 + // CHECK: spv.FSub %{{.*}}, %{{.*}}: f32 + %1 = subf %lhs, %rhs: f32 + // CHECK: spv.FMul %{{.*}}, %{{.*}}: f32 + %2 = mulf %lhs, %rhs: f32 + // CHECK: spv.FDiv %{{.*}}, %{{.*}}: f32 + %3 = divf %lhs, %rhs: f32 + // CHECK: spv.FRem %{{.*}}, %{{.*}}: f32 + %4 = remf %lhs, %rhs: f32 return } -// CHECK-LABEL: @fmul_scalar -func @fmul_scalar(%arg: f32) { - // CHECK: spv.FMul - %0 = mulf %arg, %arg : f32 +// Check int vector types. +// CHECK-LABEL: @int_vector234 +func @int_vector234(%arg0: vector<2xi8>, %arg1: vector<3xi16>, %arg2: vector<4xi64>) { + // CHECK: spv.SDiv %{{.*}}, %{{.*}}: vector<2xi8> + %0 = divi_signed %arg0, %arg0: vector<2xi8> + // CHECK: spv.SRem %{{.*}}, %{{.*}}: vector<3xi16> + %1 = remi_signed %arg1, %arg1: vector<3xi16> + // CHECK: spv.UDiv %{{.*}}, %{{.*}}: vector<4xi64> + %2 = divi_unsigned %arg2, %arg2: vector<4xi64> return } -// CHECK-LABEL: @fmul_vector2 -func @fmul_vector2(%arg: vector<2xf32>) { - // CHECK: spv.FMul - %0 = mulf %arg, %arg : vector<2xf32> +// Check float vector types. 
+// CHECK-LABEL: @float_vector234 +func @float_vector234(%arg0: vector<2xf16>, %arg1: vector<3xf64>) { + // CHECK: spv.FAdd %{{.*}}, %{{.*}}: vector<2xf16> + %0 = addf %arg0, %arg0: vector<2xf16> + // CHECK: spv.FMul %{{.*}}, %{{.*}}: vector<3xf64> + %1 = mulf %arg1, %arg1: vector<3xf64> return } -// CHECK-LABEL: @fmul_vector3 -func @fmul_vector3(%arg: vector<3xf32>) { - // CHECK: spv.FMul - %0 = mulf %arg, %arg : vector<3xf32> +// CHECK-LABEL: @unsupported_1elem_vector +func @unsupported_1elem_vector(%arg0: vector<1xi32>) { + // CHECK: addi + %0 = addi %arg0, %arg0: vector<1xi32> return } -// CHECK-LABEL: @fmul_vector4 -func @fmul_vector4(%arg: vector<4xf32>) { - // CHECK: spv.FMul - %0 = mulf %arg, %arg : vector<4xf32> +// CHECK-LABEL: @unsupported_5elem_vector +func @unsupported_5elem_vector(%arg0: vector<5xi32>) { + // CHECK: subi + %1 = subi %arg0, %arg0: vector<5xi32> return } -// CHECK-LABEL: @fmul_vector5 -func @fmul_vector5(%arg: vector<5xf32>) { - // Vector length of only 2, 3, and 4 is valid for SPIR-V. - // CHECK: mulf - %0 = mulf %arg, %arg : vector<5xf32> +// CHECK-LABEL: @unsupported_2x2elem_vector +func @unsupported_2x2elem_vector(%arg0: vector<2x2xi32>) { + // CHECK: muli + %2 = muli %arg0, %arg0: vector<2x2xi32> return } -// TODO(antiagainst): enable this once we support converting binary ops -// needing type conversion. -// XXXXX-LABEL: @fmul_tensor -//func @fmul_tensor(%arg: tensor<4xf32>) { - // For tensors mulf cannot be lowered directly to spv.FMul. - // XXXXX: mulf - //%0 = mulf %arg, %arg : tensor<4xf32> - //return -//} - -// CHECK-LABEL: @frem_scalar -func @frem_scalar(%arg: f32) { - // CHECK: spv.FRem - %0 = remf %arg, %arg : f32 +} // end module + +// ----- + +// Check that types are converted to 32-bit when no special capabilities. 
+module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: @int_vector234 +func @int_vector234(%arg0: vector<2xi8>, %arg1: vector<3xi16>, %arg2: vector<4xi64>) { + // CHECK: spv.SDiv %{{.*}}, %{{.*}}: vector<2xi32> + %0 = divi_signed %arg0, %arg0: vector<2xi8> + // CHECK: spv.SRem %{{.*}}, %{{.*}}: vector<3xi32> + %1 = remi_signed %arg1, %arg1: vector<3xi16> + // CHECK: spv.UDiv %{{.*}}, %{{.*}}: vector<4xi32> + %2 = divi_unsigned %arg2, %arg2: vector<4xi64> return } -// CHECK-LABEL: @fsub_scalar -func @fsub_scalar(%arg: f32) { - // CHECK: spv.FSub - %0 = subf %arg, %arg : f32 +// CHECK-LABEL: @float_scalar +func @float_scalar(%arg0: f16, %arg1: f64) { + // CHECK: spv.FAdd %{{.*}}, %{{.*}}: f32 + %0 = addf %arg0, %arg0: f16 + // CHECK: spv.FMul %{{.*}}, %{{.*}}: f32 + %1 = mulf %arg1, %arg1: f64 return } -// CHECK-LABEL: @div_rem -func @div_rem(%arg0 : i32, %arg1 : i32) { - // CHECK: spv.SDiv - %0 = divi_signed %arg0, %arg1 : i32 - // CHECK: spv.SMod - %1 = remi_signed %arg0, %arg1 : i32 - return -} +} // end module + +// ----- //===----------------------------------------------------------------------===// // std bit ops //===----------------------------------------------------------------------===// +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + // CHECK-LABEL: @bitwise_scalar func @bitwise_scalar(%arg0 : i32, %arg1 : i32) { // CHECK: spv.BitwiseAnd @@ -122,6 +183,24 @@ func @bitwise_vector(%arg0 : vector<4xi32>, %arg1 : vector<4xi32>) { return } +// CHECK-LABEL: @logical_scalar +func @logical_scalar(%arg0 : i1, %arg1 : i1) { + // CHECK: spv.LogicalAnd + %0 = and %arg0, %arg1 : i1 + // CHECK: spv.LogicalOr + %1 = or %arg0, %arg1 : i1 + return +} + +// CHECK-LABEL: @logical_vector +func @logical_vector(%arg0 : vector<4xi1>, %arg1 : vector<4xi1>) { + // CHECK: spv.LogicalAnd + %0 = and %arg0, %arg1 : vector<4xi1> + // CHECK: spv.LogicalOr + %1 = or %arg0, %arg1 : vector<4xi1> + return +} + // CHECK-LABEL: @shift_scalar func @shift_scalar(%arg0 : i32, %arg1 : i32) { // CHECK: spv.ShiftLeftLogical @@ -206,17 +285,28 @@ func @cmpi(%arg0 : i32, %arg1 : i32) { return } +} // end module + +// ----- + //===----------------------------------------------------------------------===// // std.constant //===----------------------------------------------------------------------===// +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + // CHECK-LABEL: @constant func @constant() { // CHECK: spv.constant true %0 = constant true - // CHECK: spv.constant 42 : i64 - %1 = constant 42 - // CHECK: spv.constant {{[0-9]*\.[0-9]*e?-?[0-9]*}} : f32 + // CHECK: spv.constant 42 : i32 + %1 = constant 42 : i32 + // CHECK: spv.constant 5.000000e-01 : f32 %2 = constant 0.5 : f32 // CHECK: spv.constant dense<[2, 3]> : vector<2xi32> %3 = constant dense<[2, 3]> : vector<2xi32> @@ -237,50 +327,234 @@ func @constant() { return } -//===----------------------------------------------------------------------===// -// std logical binary operations -//===----------------------------------------------------------------------===// +// CHECK-LABEL: @constant_16bit +func @constant_16bit() { + // 
CHECK: spv.constant 4 : i16 + %0 = constant 4 : i16 + // CHECK: spv.constant 5.000000e+00 : f16 + %1 = constant 5.0 : f16 + // CHECK: spv.constant dense<[2, 3]> : vector<2xi16> + %2 = constant dense<[2, 3]> : vector<2xi16> + // CHECK: spv.constant dense<4.000000e+00> : tensor<5xf16> : !spv.array<5 x f16 [2]> + %3 = constant dense<4.0> : tensor<5xf16> + return +} -// CHECK-LABEL: @logical_scalar -func @logical_scalar(%arg0 : i1, %arg1 : i1) { - // CHECK: spv.LogicalAnd - %0 = and %arg0, %arg1 : i1 - // CHECK: spv.LogicalOr - %1 = or %arg0, %arg1 : i1 +// CHECK-LABEL: @constant_64bit +func @constant_64bit() { + // CHECK: spv.constant 4 : i64 + %0 = constant 4 : i64 + // CHECK: spv.constant 5.000000e+00 : f64 + %1 = constant 5.0 : f64 + // CHECK: spv.constant dense<[2, 3]> : vector<2xi64> + %2 = constant dense<[2, 3]> : vector<2xi64> + // CHECK: spv.constant dense<4.000000e+00> : tensor<5xf64> : !spv.array<5 x f64 [8]> + %3 = constant dense<4.0> : tensor<5xf64> return } -// CHECK-LABEL: @logical_vector -func @logical_vector(%arg0 : vector<4xi1>, %arg1 : vector<4xi1>) { - // CHECK: spv.LogicalAnd - %0 = and %arg0, %arg1 : vector<4xi1> - // CHECK: spv.LogicalOr - %1 = or %arg0, %arg1 : vector<4xi1> +} // end module + +// ----- + +// Check that constants are converted to 32-bit when no special capability. +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: @constant_16bit +func @constant_16bit() { + // CHECK: spv.constant 4 : i32 + %0 = constant 4 : i16 + // CHECK: spv.constant 5.000000e+00 : f32 + %1 = constant 5.0 : f16 + // CHECK: spv.constant dense<[2, 3]> : vector<2xi32> + %2 = constant dense<[2, 3]> : vector<2xi16> + // CHECK: spv.constant dense<4.000000e+00> : tensor<5xf32> : !spv.array<5 x f32 [4]> + %3 = constant dense<4.0> : tensor<5xf16> + // CHECK: spv.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : tensor<4xf32> : !spv.array<4 x f32 [4]> + %4 = constant dense<[[1.0, 2.0], [3.0, 4.0]]> : tensor<2x2xf16> return } -//===----------------------------------------------------------------------===// -// std.fpext -//===----------------------------------------------------------------------===// +// CHECK-LABEL: @constant_64bit +func @constant_64bit() { + // CHECK: spv.constant 4 : i32 + %0 = constant 4 : i64 + // CHECK: spv.constant 5.000000e+00 : f32 + %1 = constant 5.0 : f64 + // CHECK: spv.constant dense<[2, 3]> : vector<2xi32> + %2 = constant dense<[2, 3]> : vector<2xi64> + // CHECK: spv.constant dense<4.000000e+00> : tensor<5xf32> : !spv.array<5 x f32 [4]> + %3 = constant dense<4.0> : tensor<5xf64> + // CHECK: spv.constant dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00]> : tensor<4xf32> : !spv.array<4 x f32 [4]> + %4 = constant dense<[[1.0, 2.0], [3.0, 4.0]]> : tensor<2x2xf16> + return +} -// CHECK-LABEL: @fpext -func @fpext(%arg0 : f32) { - // CHECK: spv.FConvert - %0 = std.fpext %arg0 : f32 to f64 +// CHECK-LABEL: @corner_cases +func @corner_cases() { + // CHECK: %{{.*}} = spv.constant -1 : i32 + %0 = constant 4294967295 : i64 // 2^32 - 1 + // CHECK: %{{.*}} = spv.constant 2147483647 : i32 + %1 = constant 2147483647 : i64 // 2^31 - 1 + // CHECK: %{{.*}} = spv.constant -2147483648 : i32 + %2 = constant 2147483648 : i64 // 2^31 + // CHECK: %{{.*}} = spv.constant -2147483648 : i32 + %3 = constant -2147483648 : i64 // -2^31 + + // CHECK: %{{.*}} = spv.constant -1 : i32 + %5 = constant -1 
: i64 + // CHECK: %{{.*}} = spv.constant -2 : i32 + %6 = constant -2 : i64 + // CHECK: %{{.*}} = spv.constant -1 : i32 + %7 = constant -1 : index + // CHECK: %{{.*}} = spv.constant -2 : i32 + %8 = constant -2 : index + + + // CHECK: spv.constant false + %9 = constant 0 : i1 + // CHECK: spv.constant true + %10 = constant 1 : i1 + + return +} + +// CHECK-LABEL: @unsupported_cases +func @unsupported_cases() { + // CHECK: %{{.*}} = constant 4294967296 : i64 + %0 = constant 4294967296 : i64 // 2^32 + // CHECK: %{{.*}} = constant -2147483649 : i64 + %1 = constant -2147483649 : i64 // -2^31 - 1 + // CHECK: %{{.*}} = constant 1.0000000000000002 : f64 + %2 = constant 0x3FF0000000000001 : f64 // smallest number > 1 return } +} // end module + +// ----- + //===----------------------------------------------------------------------===// -// std.fptrunc +// std cast ops //===----------------------------------------------------------------------===// -// CHECK-LABEL: @fptrunc -func @fptrunc(%arg0 : f64) { - // CHECK: spv.FConvert - %0 = std.fptrunc %arg0 : f64 to f32 - return +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: @fpext1 +func @fpext1(%arg0: f16) -> f64 { + // CHECK: spv.FConvert %{{.*}} : f16 to f64 + %0 = std.fpext %arg0 : f16 to f64 + return %0 : f64 +} + +// CHECK-LABEL: @fpext2 +func @fpext2(%arg0 : f32) -> f64 { + // CHECK: spv.FConvert %{{.*}} : f32 to f64 + %0 = std.fpext %arg0 : f32 to f64 + return %0 : f64 } +// CHECK-LABEL: @fptrunc1 +func @fptrunc1(%arg0 : f64) -> f16 { + // CHECK: spv.FConvert %{{.*}} : f64 to f16 + %0 = std.fptrunc %arg0 : f64 to f16 + return %0 : f16 +} + +// CHECK-LABEL: @fptrunc2 +func @fptrunc2(%arg0: f32) -> f16 { + // CHECK: spv.FConvert %{{.*}} : f32 to f16 + %0 = std.fptrunc %arg0 : f32 to f16 + return %0 : f16 +} + +// CHECK-LABEL: @sitofp1 +func @sitofp1(%arg0 : i32) -> f32 { + // CHECK: spv.ConvertSToF %{{.*}} : i32 to f32 + %0 = std.sitofp %arg0 : i32 to f32 + return %0 : f32 +} + +// CHECK-LABEL: @sitofp2 +func @sitofp2(%arg0 : i64) -> f64 { + // CHECK: spv.ConvertSToF %{{.*}} : i64 to f64 + %0 = std.sitofp %arg0 : i64 to f64 + return %0 : f64 +} + +} // end module + +// ----- + +// Checks that cast types will be adjusted when no special capabilities for +// non-32-bit scalar types. 
+module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: @fpext1 +// CHECK-SAME: %[[ARG:.*]]: f32 +func @fpext1(%arg0: f16) { + // CHECK-NEXT: "use"(%[[ARG]]) + %0 = std.fpext %arg0 : f16 to f64 + "use"(%0) : (f64) -> () +} + +// CHECK-LABEL: @fpext2 +// CHECK-SAME: %[[ARG:.*]]: f32 +func @fpext2(%arg0 : f32) { + // CHECK-NEXT: "use"(%[[ARG]]) + %0 = std.fpext %arg0 : f32 to f64 + "use"(%0) : (f64) -> () +} + +// CHECK-LABEL: @fptrunc1 +// CHECK-SAME: %[[ARG:.*]]: f32 +func @fptrunc1(%arg0 : f64) { + // CHECK-NEXT: "use"(%[[ARG]]) + %0 = std.fptrunc %arg0 : f64 to f16 + "use"(%0) : (f16) -> () +} + +// CHECK-LABEL: @fptrunc2 +// CHECK-SAME: %[[ARG:.*]]: f32 +func @fptrunc2(%arg0: f32) { + // CHECK-NEXT: "use"(%[[ARG]]) + %0 = std.fptrunc %arg0 : f32 to f16 + "use"(%0) : (f16) -> () +} + +// CHECK-LABEL: @sitofp +func @sitofp(%arg0 : i64) { + // CHECK: spv.ConvertSToF %{{.*}} : i32 to f32 + %0 = std.sitofp %arg0 : i64 to f64 + "use"(%0) : (f64) -> () +} + +} // end module + +// ----- + +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + //===----------------------------------------------------------------------===// // std.select //===----------------------------------------------------------------------===// @@ -294,25 +568,9 @@ func @select(%arg0 : i32, %arg1 : i32) { } //===----------------------------------------------------------------------===// -// std.sitofp +// std load/store ops //===----------------------------------------------------------------------===// -// CHECK-LABEL: @sitofp -func @sitofp(%arg0 : i32) { - // CHECK: spv.ConvertSToF - %0 = std.sitofp %arg0 : i32 to f32 - return -} - -//===----------------------------------------------------------------------===// -// memref type -//===----------------------------------------------------------------------===// - -// CHECK-LABEL: func @memref_type({{%.*}}: memref<3xi1>) -func @memref_type(%arg0: memref<3xi1>) { - return -} - // CHECK-LABEL: @load_store_zero_rank_float // CHECK: [[ARG0:%.*]]: !spv.ptr [0]>, StorageBuffer>, // CHECK: [[ARG1:%.*]]: !spv.ptr [0]>, StorageBuffer>) @@ -350,3 +608,5 @@ func @load_store_zero_rank_int(%arg0: memref, %arg1: memref) { store %0, %arg1[] : memref return } + +} // end module diff --git a/mlir/test/Conversion/StandardToSPIRV/std-types-to-spirv.mlir b/mlir/test/Conversion/StandardToSPIRV/std-types-to-spirv.mlir new file mode 100644 index 0000000000000..81911bd1a6332 --- /dev/null +++ b/mlir/test/Conversion/StandardToSPIRV/std-types-to-spirv.mlir @@ -0,0 +1,597 @@ +// RUN: mlir-opt -split-input-file -convert-std-to-spirv %s -o - | FileCheck %s + +//===----------------------------------------------------------------------===// +// Integer types +//===----------------------------------------------------------------------===// + +// Check that non-32-bit integer types are converted to 32-bit types if the +// corresponding capabilities are not available. 
+module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: spv.func @integer8 +// CHECK-SAME: i32 +// CHECK-SAME: si32 +// CHECK-SAME: ui32 +func @integer8(%arg0: i8, %arg1: si8, %arg2: ui8) { return } + +// CHECK-LABEL: spv.func @integer16 +// CHECK-SAME: i32 +// CHECK-SAME: si32 +// CHECK-SAME: ui32 +func @integer16(%arg0: i16, %arg1: si16, %arg2: ui16) { return } + +// CHECK-LABEL: spv.func @integer64 +// CHECK-SAME: i32 +// CHECK-SAME: si32 +// CHECK-SAME: ui32 +func @integer64(%arg0: i64, %arg1: si64, %arg2: ui64) { return } + +} // end module + +// ----- + +// Check that non-32-bit integer types are kept untouched if the corresponding +// capabilities are available. +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: spv.func @integer8 +// CHECK-SAME: i8 +// CHECK-SAME: si8 +// CHECK-SAME: ui8 +func @integer8(%arg0: i8, %arg1: si8, %arg2: ui8) { return } + +// CHECK-LABEL: spv.func @integer16 +// CHECK-SAME: i16 +// CHECK-SAME: si16 +// CHECK-SAME: ui16 +func @integer16(%arg0: i16, %arg1: si16, %arg2: ui16) { return } + +// CHECK-LABEL: spv.func @integer64 +// CHECK-SAME: i64 +// CHECK-SAME: si64 +// CHECK-SAME: ui64 +func @integer64(%arg0: i64, %arg1: si64, %arg2: ui64) { return } + +} // end module + +// ----- + +// Check that weird bitwidths are not supported. +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-NOT: spv.func @integer4 +func @integer4(%arg0: i4) { return } + +// CHECK-NOT: spv.func @integer128 +func @integer128(%arg0: i128) { return } + +// CHECK-NOT: spv.func @integer42 +func @integer42(%arg0: i42) { return } + +} // end module +// ----- + +//===----------------------------------------------------------------------===// +// Index type +//===----------------------------------------------------------------------===// + +// The index type is always converted into i32. +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: spv.func @index_type +// CHECK-SAME: %{{.*}}: i32 +func @index_type(%arg0: index) { return } + +} // end module + +// ----- + +//===----------------------------------------------------------------------===// +// Float types +//===----------------------------------------------------------------------===// + +// Check that non-32-bit float types are converted to 32-bit types if the +// corresponding capabilities are not available. +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: spv.func @float16 +// CHECK-SAME: f32 +func @float16(%arg0: f16) { return } + +// CHECK-LABEL: spv.func @float64 +// CHECK-SAME: f32 +func @float64(%arg0: f64) { return } + +} // end module + +// ----- + +// Check that non-32-bit float types are kept untouched if the corresponding +// capabilities are available. 
+module attributes {
+ spv.target_env = #spv.target_env<
+ #spv.vce,
+ {max_compute_workgroup_invocations = 128 : i32,
+ max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @float16
+// CHECK-SAME: f16
+func @float16(%arg0: f16) { return }
+
+// CHECK-LABEL: spv.func @float64
+// CHECK-SAME: f64
+func @float64(%arg0: f64) { return }
+
+} // end module
+
+// -----
+
+// Check that bf16 is not supported.
+module attributes {
+ spv.target_env = #spv.target_env<
+ #spv.vce,
+ {max_compute_workgroup_invocations = 128 : i32,
+ max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-NOT: spv.func @bf16_type
+func @bf16_type(%arg0: bf16) { return }
+
+} // end module
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Vector types
+//===----------------------------------------------------------------------===//
+
+// Check that capabilities for scalar types affect vector types too: with no
+// special capabilities available, element types are converted to 32-bit.
+module attributes {
+ spv.target_env = #spv.target_env<
+ #spv.vce,
+ {max_compute_workgroup_invocations = 128 : i32,
+ max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @int_vector
+// CHECK-SAME: vector<2xi32>
+// CHECK-SAME: vector<3xsi32>
+// CHECK-SAME: vector<4xui32>
+func @int_vector(
+ %arg0: vector<2xi8>,
+ %arg1: vector<3xsi16>,
+ %arg2: vector<4xui64>
+) { return }
+
+// CHECK-LABEL: spv.func @float_vector
+// CHECK-SAME: vector<2xf32>
+// CHECK-SAME: vector<3xf32>
+func @float_vector(
+ %arg0: vector<2xf16>,
+ %arg1: vector<3xf64>
+) { return }
+
+} // end module
+
+// -----
+
+// Check that capabilities for scalar types affect vector types too: with the
+// special capabilities available, vector types are kept untouched.
+module attributes {
+ spv.target_env = #spv.target_env<
+ #spv.vce,
+ {max_compute_workgroup_invocations = 128 : i32,
+ max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @int_vector
+// CHECK-SAME: vector<2xi8>
+// CHECK-SAME: vector<3xsi16>
+// CHECK-SAME: vector<4xui64>
+func @int_vector(
+ %arg0: vector<2xi8>,
+ %arg1: vector<3xsi16>,
+ %arg2: vector<4xui64>
+) { return }
+
+// CHECK-LABEL: spv.func @float_vector
+// CHECK-SAME: vector<2xf16>
+// CHECK-SAME: vector<3xf64>
+func @float_vector(
+ %arg0: vector<2xf16>,
+ %arg1: vector<3xf64>
+) { return }
+
+} // end module
+
+// -----
+
+// Check that 1- or > 4-element vectors are not supported.
+module attributes {
+ spv.target_env = #spv.target_env<
+ #spv.vce,
+ {max_compute_workgroup_invocations = 128 : i32,
+ max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-NOT: spv.func @one_element_vector
+func @one_element_vector(%arg0: vector<1xi32>) { return }
+
+// CHECK-NOT: spv.func @large_vector
+func @large_vector(%arg0: vector<1024xi32>) { return }
+
+} // end module
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// MemRef types
+//===----------------------------------------------------------------------===//
+
+// Check memory spaces.
+module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: func @memref_mem_space +// CHECK-SAME: StorageBuffer +// CHECK-SAME: Uniform +// CHECK-SAME: Workgroup +// CHECK-SAME: PushConstant +// CHECK-SAME: Private +// CHECK-SAME: Function +func @memref_mem_space( + %arg0: memref<4xf32, 0>, + %arg1: memref<4xf32, 4>, + %arg2: memref<4xf32, 3>, + %arg3: memref<4xf32, 7>, + %arg4: memref<4xf32, 5>, + %arg5: memref<4xf32, 6> +) { return } + +} // end module + +// ----- + +// Check that boolean memref is not supported at the moment. +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: func @memref_type({{%.*}}: memref<3xi1>) +func @memref_type(%arg0: memref<3xi1>) { + return +} + +} // end module + +// ----- + +// Check that using non-32-bit scalar types in interface storage classes +// requires special capability and extension: convert them to 32-bit if not +// satisfied. +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: spv.func @memref_8bit_StorageBuffer +// CHECK-SAME: !spv.ptr [0]>, StorageBuffer> +func @memref_8bit_StorageBuffer(%arg0: memref<16xi8, 0>) { return } + +// CHECK-LABEL: spv.func @memref_8bit_Uniform +// CHECK-SAME: !spv.ptr [0]>, Uniform> +func @memref_8bit_Uniform(%arg0: memref<16xsi8, 4>) { return } + +// CHECK-LABEL: spv.func @memref_8bit_PushConstant +// CHECK-SAME: !spv.ptr [0]>, PushConstant> +func @memref_8bit_PushConstant(%arg0: memref<16xui8, 7>) { return } + +// CHECK-LABEL: spv.func @memref_16bit_StorageBuffer +// CHECK-SAME: !spv.ptr [0]>, StorageBuffer> +func @memref_16bit_StorageBuffer(%arg0: memref<16xi16, 0>) { return } + +// CHECK-LABEL: spv.func @memref_16bit_Uniform +// CHECK-SAME: !spv.ptr [0]>, Uniform> +func @memref_16bit_Uniform(%arg0: memref<16xsi16, 4>) { return } + +// CHECK-LABEL: spv.func @memref_16bit_PushConstant +// CHECK-SAME: !spv.ptr [0]>, PushConstant> +func @memref_16bit_PushConstant(%arg0: memref<16xui16, 7>) { return } + +// CHECK-LABEL: spv.func @memref_16bit_Input +// CHECK-SAME: !spv.ptr [0]>, Input> +func @memref_16bit_Input(%arg3: memref<16xf16, 9>) { return } + +// CHECK-LABEL: spv.func @memref_16bit_Output +// CHECK-SAME: !spv.ptr [0]>, Output> +func @memref_16bit_Output(%arg4: memref<16xf16, 10>) { return } + +} // end module + +// ----- + +// Check that using non-32-bit scalar types in interface storage classes +// requires special capability and extension: keep as-is when the capability +// and extension is available. 
+module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: spv.func @memref_8bit_PushConstant +// CHECK-SAME: !spv.ptr [0]>, PushConstant> +func @memref_8bit_PushConstant(%arg0: memref<16xi8, 7>) { return } + +// CHECK-LABEL: spv.func @memref_16bit_PushConstant +// CHECK-SAME: !spv.ptr [0]>, PushConstant> +// CHECK-SAME: !spv.ptr [0]>, PushConstant> +func @memref_16bit_PushConstant( + %arg0: memref<16xi16, 7>, + %arg1: memref<16xf16, 7> +) { return } + +} // end module + +// ----- + +// Check that using non-32-bit scalar types in interface storage classes +// requires special capability and extension: keep as-is when the capability +// and extension is available. +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: spv.func @memref_8bit_StorageBuffer +// CHECK-SAME: !spv.ptr [0]>, StorageBuffer> +func @memref_8bit_StorageBuffer(%arg0: memref<16xi8, 0>) { return } + +// CHECK-LABEL: spv.func @memref_16bit_StorageBuffer +// CHECK-SAME: !spv.ptr [0]>, StorageBuffer> +// CHECK-SAME: !spv.ptr [0]>, StorageBuffer> +func @memref_16bit_StorageBuffer( + %arg0: memref<16xi16, 0>, + %arg1: memref<16xf16, 0> +) { return } + +} // end module + +// ----- + +// Check that using non-32-bit scalar types in interface storage classes +// requires special capability and extension: keep as-is when the capability +// and extension is available. +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: spv.func @memref_8bit_Uniform +// CHECK-SAME: !spv.ptr [0]>, Uniform> +func @memref_8bit_Uniform(%arg0: memref<16xi8, 4>) { return } + +// CHECK-LABEL: spv.func @memref_16bit_Uniform +// CHECK-SAME: !spv.ptr [0]>, Uniform> +// CHECK-SAME: !spv.ptr [0]>, Uniform> +func @memref_16bit_Uniform( + %arg0: memref<16xi16, 4>, + %arg1: memref<16xf16, 4> +) { return } + +} // end module + +// ----- + +// Check that using non-32-bit scalar types in interface storage classes +// requires special capability and extension: keep as-is when the capability +// and extension is available. +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: spv.func @memref_16bit_Input +// CHECK-SAME: !spv.ptr [0]>, Input> +func @memref_16bit_Input(%arg3: memref<16xf16, 9>) { return } + +// CHECK-LABEL: spv.func @memref_16bit_Output +// CHECK-SAME: !spv.ptr [0]>, Output> +func @memref_16bit_Output(%arg4: memref<16xi16, 10>) { return } + +} // end module + +// ----- + +// Check that memref offset and strides affect the array size. 
+module attributes {
+ spv.target_env = #spv.target_env<
+ #spv.vce,
+ {max_compute_workgroup_invocations = 128 : i32,
+ max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @memref_offset_strides
+func @memref_offset_strides(
+// CHECK-SAME: !spv.array<64 x f32 [4]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.array<72 x f32 [4]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.array<256 x f32 [4]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.array<64 x f32 [4]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.array<88 x f32 [4]> [0]>, StorageBuffer>
+ %arg0: memref<16x4xf32, offset: 0, strides: [4, 1]>, // tightly packed; row major
+ %arg1: memref<16x4xf32, offset: 8, strides: [4, 1]>, // offset 8
+ %arg2: memref<16x4xf32, offset: 0, strides: [16, 1]>, // pad 12 after each row
+ %arg3: memref<16x4xf32, offset: 0, strides: [1, 16]>, // tightly packed; col major
+ %arg4: memref<16x4xf32, offset: 0, strides: [1, 22]>, // pad 4 after each col
+
+// CHECK-SAME: !spv.array<64 x f16 [2]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.array<72 x f16 [2]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.array<256 x f16 [2]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.array<64 x f16 [2]> [0]>, StorageBuffer>
+// CHECK-SAME: !spv.array<88 x f16 [2]> [0]>, StorageBuffer>
+ %arg5: memref<16x4xf16, offset: 0, strides: [4, 1]>,
+ %arg6: memref<16x4xf16, offset: 8, strides: [4, 1]>,
+ %arg7: memref<16x4xf16, offset: 0, strides: [16, 1]>,
+ %arg8: memref<16x4xf16, offset: 0, strides: [1, 16]>,
+ %arg9: memref<16x4xf16, offset: 0, strides: [1, 22]>
+) { return }
+
+} // end module
+
+// -----
+
+// Check that dynamic shapes are not supported.
+module attributes {
+ spv.target_env = #spv.target_env<
+ #spv.vce,
+ {max_compute_workgroup_invocations = 128 : i32,
+ max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: func @unranked_memref
+// CHECK-SAME: memref<*xi32>
+func @unranked_memref(%arg0: memref<*xi32>) { return }
+
+// CHECK-LABEL: func @dynamic_dim_memref
+// CHECK-SAME: memref<8x?xi32>
+func @dynamic_dim_memref(%arg0: memref<8x?xi32>) { return }
+
+} // end module
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Tensor types
+//===----------------------------------------------------------------------===//
+
+// Check that tensor element types are kept untouched with proper capabilities.
+module attributes {
+ spv.target_env = #spv.target_env<
+ #spv.vce,
+ {max_compute_workgroup_invocations = 128 : i32,
+ max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}>
+} {
+
+// CHECK-LABEL: spv.func @int_tensor_types
+// CHECK-SAME: !spv.array<32 x i64 [8]>
+// CHECK-SAME: !spv.array<32 x i32 [4]>
+// CHECK-SAME: !spv.array<32 x i16 [2]>
+// CHECK-SAME: !spv.array<32 x i8 [1]>
+func @int_tensor_types(
+ %arg0: tensor<8x4xi64>,
+ %arg1: tensor<8x4xi32>,
+ %arg2: tensor<8x4xi16>,
+ %arg3: tensor<8x4xi8>
+) { return }
+
+// CHECK-LABEL: spv.func @float_tensor_types
+// CHECK-SAME: !spv.array<32 x f64 [8]>
+// CHECK-SAME: !spv.array<32 x f32 [4]>
+// CHECK-SAME: !spv.array<32 x f16 [2]>
+func @float_tensor_types(
+ %arg0: tensor<8x4xf64>,
+ %arg1: tensor<8x4xf32>,
+ %arg2: tensor<8x4xf16>
+) { return }
+
+} // end module
+
+// -----
+
+// Check that tensor element types are changed to 32-bit without capabilities.
+module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: spv.func @int_tensor_types +// CHECK-SAME: !spv.array<32 x i32 [4]> +// CHECK-SAME: !spv.array<32 x i32 [4]> +// CHECK-SAME: !spv.array<32 x i32 [4]> +// CHECK-SAME: !spv.array<32 x i32 [4]> +func @int_tensor_types( + %arg0: tensor<8x4xi64>, + %arg1: tensor<8x4xi32>, + %arg2: tensor<8x4xi16>, + %arg3: tensor<8x4xi8> +) { return } + +// CHECK-LABEL: spv.func @float_tensor_types +// CHECK-SAME: !spv.array<32 x f32 [4]> +// CHECK-SAME: !spv.array<32 x f32 [4]> +// CHECK-SAME: !spv.array<32 x f32 [4]> +func @float_tensor_types( + %arg0: tensor<8x4xf64>, + %arg1: tensor<8x4xf32>, + %arg2: tensor<8x4xf16> +) { return } + +} // end module + +// ----- + +// Check that dynamic shapes are not supported. +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + +// CHECK-LABEL: func @unranked_tensor +// CHECK-SAME: tensor<*xi32> +func @unranked_tensor(%arg0: tensor<*xi32>) { return } + +// CHECK-LABEL: func @dynamic_dim_tensor +// CHECK-SAME: tensor<8x?xi32> +func @dynamic_dim_tensor(%arg0: tensor<8x?xi32>) { return } + +} // end module diff --git a/mlir/test/Conversion/StandardToSPIRV/subview-to-spirv.mlir b/mlir/test/Conversion/StandardToSPIRV/subview-to-spirv.mlir index c9d1195bc0562..cc94c089dfb23 100644 --- a/mlir/test/Conversion/StandardToSPIRV/subview-to-spirv.mlir +++ b/mlir/test/Conversion/StandardToSPIRV/subview-to-spirv.mlir @@ -4,6 +4,13 @@ // the desired output. Adding all of patterns within a single pass does // not seem to work. 
+module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + //===----------------------------------------------------------------------===// // std.subview //===----------------------------------------------------------------------===// @@ -51,3 +58,5 @@ func @fold_static_stride_subview_with_store(%arg0 : memref<12x32xf32>, %arg1 : i store %arg5, %0[%arg3, %arg4] : memref<4x4xf32, offset:?, strides: [64, 3]> return } + +} // end module diff --git a/mlir/test/Dialect/AVX512/roundtrip.mlir b/mlir/test/Dialect/AVX512/roundtrip.mlir new file mode 100644 index 0000000000000..bd23103fa432a --- /dev/null +++ b/mlir/test/Dialect/AVX512/roundtrip.mlir @@ -0,0 +1,21 @@ +// RUN: mlir-opt -verify-diagnostics %s | mlir-opt | FileCheck %s + +func @avx512_mask_rndscale(%a: vector<16xf32>, %b: vector<8xf64>, %i32: i32, %i16: i16, %i8: i8) + -> (vector<16xf32>, vector<8xf64>) +{ + // CHECK: avx512.mask.rndscale {{.*}}: vector<16xf32> + %0 = avx512.mask.rndscale %a, %i32, %a, %i16, %i32 : vector<16xf32> + // CHECK: avx512.mask.rndscale {{.*}}: vector<8xf64> + %1 = avx512.mask.rndscale %b, %i32, %b, %i8, %i32 : vector<8xf64> + return %0, %1: vector<16xf32>, vector<8xf64> +} + +func @avx512_scalef(%a: vector<16xf32>, %b: vector<8xf64>, %i32: i32, %i16: i16, %i8: i8) + -> (vector<16xf32>, vector<8xf64>) +{ + // CHECK: avx512.mask.scalef {{.*}}: vector<16xf32> + %0 = avx512.mask.scalef %a, %a, %a, %i16, %i32: vector<16xf32> + // CHECK: avx512.mask.scalef {{.*}}: vector<8xf64> + %1 = avx512.mask.scalef %b, %b, %b, %i8, %i32 : vector<8xf64> + return %0, %1: vector<16xf32>, vector<8xf64> +} diff --git a/mlir/test/Transforms/Vectorize/compose_maps.mlir b/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir similarity index 98% rename from mlir/test/Transforms/Vectorize/compose_maps.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir index 1e6a0436e4b44..ca1641762838c 100644 --- a/mlir/test/Transforms/Vectorize/compose_maps.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/compose_maps.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-vectorizer-test -compose-maps 2>&1 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorizer-test -compose-maps 2>&1 | FileCheck %s // For all these cases, the test traverses the `test_affine_map` ops and // composes them in order one-by-one. 
diff --git a/mlir/test/Transforms/Vectorize/normalize_maps.mlir b/mlir/test/Dialect/Affine/SuperVectorize/normalize_maps.mlir similarity index 96% rename from mlir/test/Transforms/Vectorize/normalize_maps.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/normalize_maps.mlir index 0d77859574272..7c2bacd1a46ed 100644 --- a/mlir/test/Transforms/Vectorize/normalize_maps.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/normalize_maps.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-vectorizer-test -normalize-maps | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorizer-test -normalize-maps | FileCheck %s // CHECK-DAG: #[[ZERO:[a-zA-Z0-9]+]] = affine_map<() -> (0)> // CHECK-DAG: #[[ID1:[a-zA-Z0-9]+]] = affine_map<(d0) -> (d0)> diff --git a/mlir/test/Transforms/Vectorize/vector_utils.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir similarity index 85% rename from mlir/test/Transforms/Vectorize/vector_utils.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir index e6d00b17130e7..4e06028802a6d 100644 --- a/mlir/test/Transforms/Vectorize/vector_utils.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vector_utils.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -affine-vectorizer-test -vector-shape-ratio 4 -vector-shape-ratio 8 2>&1 | FileCheck %s -// RUN: mlir-opt %s -affine-vectorizer-test -vector-shape-ratio 2 -vector-shape-ratio 5 -vector-shape-ratio 2 2>&1 | FileCheck %s -check-prefix=TEST-3x4x5x8 +// RUN: mlir-opt %s -affine-super-vectorizer-test -vector-shape-ratio 4 -vector-shape-ratio 8 2>&1 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorizer-test -vector-shape-ratio 2 -vector-shape-ratio 5 -vector-shape-ratio 2 2>&1 | FileCheck %s -check-prefix=TEST-3x4x5x8 func @vector_add_2d(%arg0: index, %arg1: index) -> f32 { // Nothing should be matched in this first block. diff --git a/mlir/test/Transforms/Vectorize/vectorize_1d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir similarity index 99% rename from mlir/test/Transforms/Vectorize/vectorize_1d.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir index 7fbb6fe0b226f..94943df75bd12 100644 --- a/mlir/test/Transforms/Vectorize/vectorize_1d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-vectorize -virtual-vector-size 128 --test-fastest-varying=0 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=128 test-fastest-varying=0" | FileCheck %s // Permutation maps used in vectorization. 
// CHECK: #[[map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)> diff --git a/mlir/test/Transforms/Vectorize/vectorize_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir similarity index 96% rename from mlir/test/Transforms/Vectorize/vectorize_2d.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir index 8fa3842edea5c..884907024bb11 100644 --- a/mlir/test/Transforms/Vectorize/vectorize_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -affine-vectorize -virtual-vector-size 4 -virtual-vector-size 8 | FileCheck %s -check-prefix=VECT -// RUN: mlir-opt %s -affine-vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=4,8" | FileCheck %s -check-prefix=VECT +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=1,0" | FileCheck %s // Permutation maps used in vectorization. // CHECK-DAG: #[[map_id1:map[0-9]+]] = affine_map<(d0) -> (d0)> diff --git a/mlir/test/Transforms/Vectorize/vectorize_3d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_3d.mlir similarity index 84% rename from mlir/test/Transforms/Vectorize/vectorize_3d.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/vectorize_3d.mlir index b7355c6e3cf69..2980ee30d9086 100644 --- a/mlir/test/Transforms/Vectorize/vectorize_3d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_3d.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-vectorize -virtual-vector-size 32 -virtual-vector-size 64 -virtual-vector-size 256 --test-fastest-varying=2 --test-fastest-varying=1 --test-fastest-varying=0 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,64,256 test-fastest-varying=2,1,0" | FileCheck %s // Permutation maps used in vectorization. // CHECK: #[[map_proj_d0d1d2_d0d1d2:map[0-9]+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> diff --git a/mlir/test/Transforms/Vectorize/vectorize_outer_loop_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_2d.mlir similarity index 89% rename from mlir/test/Transforms/Vectorize/vectorize_outer_loop_2d.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_2d.mlir index 39350c88610bc..c41c14d42390c 100644 --- a/mlir/test/Transforms/Vectorize/vectorize_outer_loop_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_2d.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=2 --test-fastest-varying=0 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=2,0" | FileCheck %s // Permutation maps used in vectorization. 
// CHECK: #[[map_proj_d0d1d2_d0d2:map[0-9]+]] = affine_map<(d0, d1, d2) -> (d0, d2)> diff --git a/mlir/test/Transforms/Vectorize/vectorize_outer_loop_transpose_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_transpose_2d.mlir similarity index 94% rename from mlir/test/Transforms/Vectorize/vectorize_outer_loop_transpose_2d.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_transpose_2d.mlir index bac0c0cdb58c9..3dc3e69e66784 100644 --- a/mlir/test/Transforms/Vectorize/vectorize_outer_loop_transpose_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_outer_loop_transpose_2d.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=0 --test-fastest-varying=2 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=0,2" | FileCheck %s // Permutation maps used in vectorization. // CHECK: #[[map_proj_d0d1d2_d2d0:map[0-9]+]] = affine_map<(d0, d1, d2) -> (d2, d0)> diff --git a/mlir/test/Transforms/Vectorize/vectorize_transpose_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_transpose_2d.mlir similarity index 94% rename from mlir/test/Transforms/Vectorize/vectorize_transpose_2d.mlir rename to mlir/test/Dialect/Affine/SuperVectorize/vectorize_transpose_2d.mlir index d86ad1ccbde3b..893352a40db47 100644 --- a/mlir/test/Transforms/Vectorize/vectorize_transpose_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_transpose_2d.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -affine-vectorize -virtual-vector-size 32 -virtual-vector-size 256 --test-fastest-varying=0 --test-fastest-varying=1 | FileCheck %s +// RUN: mlir-opt %s -affine-super-vectorize="virtual-vector-size=32,256 test-fastest-varying=0,1" | FileCheck %s // Permutation maps used in vectorization. 
// CHECK-DAG: #[[map_proj_d0d1d2_d2d1:map[0-9]+]] = affine_map<(d0, d1, d2) -> (d2, d1)> diff --git a/mlir/test/Transforms/affine-data-copy.mlir b/mlir/test/Dialect/Affine/affine-data-copy.mlir similarity index 100% rename from mlir/test/Transforms/affine-data-copy.mlir rename to mlir/test/Dialect/Affine/affine-data-copy.mlir diff --git a/mlir/test/Transforms/affine-loop-invariant-code-motion.mlir b/mlir/test/Dialect/Affine/affine-loop-invariant-code-motion.mlir similarity index 100% rename from mlir/test/Transforms/affine-loop-invariant-code-motion.mlir rename to mlir/test/Dialect/Affine/affine-loop-invariant-code-motion.mlir diff --git a/mlir/test/Dialect/AffineOps/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir similarity index 100% rename from mlir/test/Dialect/AffineOps/canonicalize.mlir rename to mlir/test/Dialect/Affine/canonicalize.mlir diff --git a/mlir/test/Transforms/dma-generate.mlir b/mlir/test/Dialect/Affine/dma-generate.mlir similarity index 100% rename from mlir/test/Transforms/dma-generate.mlir rename to mlir/test/Dialect/Affine/dma-generate.mlir diff --git a/mlir/test/Dialect/AffineOps/dma.mlir b/mlir/test/Dialect/Affine/dma.mlir similarity index 100% rename from mlir/test/Dialect/AffineOps/dma.mlir rename to mlir/test/Dialect/Affine/dma.mlir diff --git a/mlir/test/Dialect/AffineOps/inlining.mlir b/mlir/test/Dialect/Affine/inlining.mlir similarity index 100% rename from mlir/test/Dialect/AffineOps/inlining.mlir rename to mlir/test/Dialect/Affine/inlining.mlir diff --git a/mlir/test/Dialect/AffineOps/invalid.mlir b/mlir/test/Dialect/Affine/invalid.mlir similarity index 100% rename from mlir/test/Dialect/AffineOps/invalid.mlir rename to mlir/test/Dialect/Affine/invalid.mlir diff --git a/mlir/test/Dialect/AffineOps/load-store-invalid.mlir b/mlir/test/Dialect/Affine/load-store-invalid.mlir similarity index 100% rename from mlir/test/Dialect/AffineOps/load-store-invalid.mlir rename to mlir/test/Dialect/Affine/load-store-invalid.mlir diff --git a/mlir/test/Dialect/AffineOps/load-store.mlir b/mlir/test/Dialect/Affine/load-store.mlir similarity index 100% rename from mlir/test/Dialect/AffineOps/load-store.mlir rename to mlir/test/Dialect/Affine/load-store.mlir diff --git a/mlir/test/Transforms/loop-tiling.mlir b/mlir/test/Dialect/Affine/loop-tiling.mlir similarity index 97% rename from mlir/test/Transforms/loop-tiling.mlir rename to mlir/test/Dialect/Affine/loop-tiling.mlir index c0a583f52cf44..2f8223b37eebc 100644 --- a/mlir/test/Transforms/loop-tiling.mlir +++ b/mlir/test/Dialect/Affine/loop-tiling.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -split-input-file -affine-loop-tile -tile-size=32 | FileCheck %s -// RUN: mlir-opt %s -split-input-file -affine-loop-tile -tile-cache-size=512 | FileCheck %s --check-prefix=MODEL +// RUN: mlir-opt %s -split-input-file -affine-loop-tile -affine-tile-size=32 | FileCheck %s +// RUN: mlir-opt %s -split-input-file -affine-loop-tile -affine-tile-cache-size=512 | FileCheck %s --check-prefix=MODEL // ----- diff --git a/mlir/test/Dialect/AffineOps/memref-stride-calculation.mlir b/mlir/test/Dialect/Affine/memref-stride-calculation.mlir similarity index 100% rename from mlir/test/Dialect/AffineOps/memref-stride-calculation.mlir rename to mlir/test/Dialect/Affine/memref-stride-calculation.mlir diff --git a/mlir/test/Dialect/AffineOps/ops.mlir b/mlir/test/Dialect/Affine/ops.mlir similarity index 100% rename from mlir/test/Dialect/AffineOps/ops.mlir rename to mlir/test/Dialect/Affine/ops.mlir diff --git 
a/mlir/test/Transforms/parallelism-detection.mlir b/mlir/test/Dialect/Affine/parallelism-detection.mlir similarity index 100% rename from mlir/test/Transforms/parallelism-detection.mlir rename to mlir/test/Dialect/Affine/parallelism-detection.mlir diff --git a/mlir/test/Transforms/simplify-affine-structures.mlir b/mlir/test/Dialect/Affine/simplify-affine-structures.mlir similarity index 100% rename from mlir/test/Transforms/simplify-affine-structures.mlir rename to mlir/test/Dialect/Affine/simplify-affine-structures.mlir diff --git a/mlir/test/Transforms/slicing-utils.mlir b/mlir/test/Dialect/Affine/slicing-utils.mlir similarity index 97% rename from mlir/test/Transforms/slicing-utils.mlir rename to mlir/test/Dialect/Affine/slicing-utils.mlir index 145695db5fbed..251e400a9b524 100644 --- a/mlir/test/Transforms/slicing-utils.mlir +++ b/mlir/test/Dialect/Affine/slicing-utils.mlir @@ -1,6 +1,6 @@ -// RUN: mlir-opt %s -affine-vectorizer-test -forward-slicing=true 2>&1 | FileCheck %s --check-prefix=FWD -// RUN: mlir-opt %s -affine-vectorizer-test -backward-slicing=true 2>&1 | FileCheck %s --check-prefix=BWD -// RUN: mlir-opt %s -affine-vectorizer-test -slicing=true 2>&1 | FileCheck %s --check-prefix=FWDBWD +// RUN: mlir-opt %s -affine-super-vectorizer-test -forward-slicing=true 2>&1 | FileCheck %s --check-prefix=FWD +// RUN: mlir-opt %s -affine-super-vectorizer-test -backward-slicing=true 2>&1 | FileCheck %s --check-prefix=BWD +// RUN: mlir-opt %s -affine-super-vectorizer-test -slicing=true 2>&1 | FileCheck %s --check-prefix=FWDBWD /// 1 2 3 4 /// |_______| |______| diff --git a/mlir/test/Transforms/unroll-jam.mlir b/mlir/test/Dialect/Affine/unroll-jam.mlir similarity index 100% rename from mlir/test/Transforms/unroll-jam.mlir rename to mlir/test/Dialect/Affine/unroll-jam.mlir diff --git a/mlir/test/Transforms/unroll.mlir b/mlir/test/Dialect/Affine/unroll.mlir similarity index 100% rename from mlir/test/Transforms/unroll.mlir rename to mlir/test/Dialect/Affine/unroll.mlir diff --git a/mlir/test/Dialect/GPU/multiple-all-reduce.mlir b/mlir/test/Dialect/GPU/multiple-all-reduce.mlir new file mode 100644 index 0000000000000..f1437dbb1adb2 --- /dev/null +++ b/mlir/test/Dialect/GPU/multiple-all-reduce.mlir @@ -0,0 +1,25 @@ +// RUN: mlir-opt --gpu-kernel-outlining --convert-gpu-to-nvvm %s | FileCheck %s + +func @main() { + %data = alloc() : memref<2x6xf32> + %sum = alloc() : memref<2xf32> + %mul = alloc() : memref<2xf32> + %c1 = constant 1 : index + + // ADD + MUL + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1, %block_z = %c1) { + %val = load %data[%bx, %tx] : memref<2x6xf32> + %reduced0 = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32) + store %reduced0, %sum[%bx] : memref<2xf32> + %reduced1 = "gpu.all_reduce"(%val) ({}) { op = "mul" } : (f32) -> (f32) + store %reduced1, %mul[%bx] : memref<2xf32> + gpu.terminator + } + +// CHECK: gpu.module @main_kernel { +// CHECK-NEXT: llvm.mlir.global internal @{{.*}}() {addr_space = 3 : i32} : !llvm<"[32 x float]"> +// CHECK-NEXT: llvm.mlir.global internal @{{.*}}() {addr_space = 3 : i32} : !llvm<"[32 x float]"> + + return +} diff --git a/mlir/test/Dialect/LLVMIR/invalid.mlir b/mlir/test/Dialect/LLVMIR/invalid.mlir index 03de594083dde..710328c1cfc89 100644 --- a/mlir/test/Dialect/LLVMIR/invalid.mlir +++ b/mlir/test/Dialect/LLVMIR/invalid.mlir @@ -515,7 +515,7 @@ func @cmpxchg_failure_acq_rel(%i32_ptr : !llvm<"i32*">, %i32 : !llvm.i32) { llvm.func 
@foo(!llvm.i32) -> !llvm.i32 llvm.func @__gxx_personality_v0(...) -> !llvm.i32 -llvm.func @bad_landingpad(%arg0: !llvm<"i8**">) { +llvm.func @bad_landingpad(%arg0: !llvm<"i8**">) attributes { personality = @__gxx_personality_v0} { %0 = llvm.mlir.constant(3 : i32) : !llvm.i32 %1 = llvm.mlir.constant(2 : i32) : !llvm.i32 %2 = llvm.invoke @foo(%1) to ^bb1 unwind ^bb2 : (!llvm.i32) -> !llvm.i32 @@ -532,7 +532,7 @@ llvm.func @bad_landingpad(%arg0: !llvm<"i8**">) { llvm.func @foo(!llvm.i32) -> !llvm.i32 llvm.func @__gxx_personality_v0(...) -> !llvm.i32 -llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 { +llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 attributes { personality = @__gxx_personality_v0} { %0 = llvm.mlir.constant(1 : i32) : !llvm.i32 %1 = llvm.alloca %0 x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**"> // expected-note@+1 {{global addresses expected as operand to bitcast used in clauses for landingpad}} @@ -551,7 +551,7 @@ llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 { llvm.func @foo(!llvm.i32) -> !llvm.i32 llvm.func @__gxx_personality_v0(...) -> !llvm.i32 -llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 { +llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 attributes { personality = @__gxx_personality_v0} { %0 = llvm.mlir.constant(1 : i32) : !llvm.i32 %1 = llvm.invoke @foo(%0) to ^bb1 unwind ^bb2 : (!llvm.i32) -> !llvm.i32 ^bb1: // pred: ^bb0 @@ -564,6 +564,37 @@ llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 { // ----- +llvm.func @foo(!llvm.i32) -> !llvm.i32 +llvm.func @__gxx_personality_v0(...) -> !llvm.i32 + +llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 attributes { personality = @__gxx_personality_v0 } { + %0 = llvm.mlir.constant(1 : i32) : !llvm.i32 + %1 = llvm.invoke @foo(%0) to ^bb1 unwind ^bb2 : (!llvm.i32) -> !llvm.i32 +^bb1: // pred: ^bb0 + llvm.return %0 : !llvm.i32 +^bb2: // pred: ^bb0 + %2 = llvm.landingpad cleanup : !llvm<"{ i8*, i32 }"> + // expected-error@+1 {{'llvm.resume' op expects landingpad value as operand}} + llvm.resume %0 : !llvm.i32 +} + +// ----- + +llvm.func @foo(!llvm.i32) -> !llvm.i32 + +llvm.func @caller(%arg0: !llvm.i32) -> !llvm.i32 { + %0 = llvm.mlir.constant(1 : i32) : !llvm.i32 + %1 = llvm.invoke @foo(%0) to ^bb1 unwind ^bb2 : (!llvm.i32) -> !llvm.i32 +^bb1: // pred: ^bb0 + llvm.return %0 : !llvm.i32 +^bb2: // pred: ^bb0 + // expected-error@+1 {{llvm.landingpad needs to be in a function with a personality}} + %2 = llvm.landingpad cleanup : !llvm<"{ i8*, i32 }"> + llvm.resume %2 : !llvm<"{ i8*, i32 }"> +} + +// ----- + func @invalid_ordering_in_fence() { // expected-error @+1 {{can be given only acquire, release, acq_rel, and seq_cst orderings}} llvm.fence syncscope("agent") monotonic diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index 32fe4c496523a..8e08d5004d69d 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -238,7 +238,7 @@ llvm.func @bar(!llvm<"i8*">, !llvm<"i8*">, !llvm<"i8*">) llvm.func @__gxx_personality_v0(...) 
-> !llvm.i32 // CHECK-LABEL: @invokeLandingpad -llvm.func @invokeLandingpad() -> !llvm.i32 { +llvm.func @invokeLandingpad() -> !llvm.i32 attributes { personality = @__gxx_personality_v0 } { // CHECK-NEXT: %[[a0:[0-9]+]] = llvm.mlir.constant(0 : i32) : !llvm.i32 // CHECK-NEXT: %{{[0-9]+}} = llvm.mlir.constant(3 : i32) : !llvm.i32 // CHECK-NEXT: %[[a2:[0-9]+]] = llvm.mlir.constant("\01") : !llvm<"[1 x i8]"> @@ -261,11 +261,11 @@ llvm.func @invokeLandingpad() -> !llvm.i32 { %9 = llvm.invoke @foo(%7) to ^bb2 unwind ^bb1 : (!llvm.i32) -> !llvm<"{ i32, double, i32 }"> // CHECK-NEXT: ^bb1: -// CHECK-NEXT: %{{[0-9]+}} = llvm.landingpad cleanup (catch %[[a3]] : !llvm<"i8**">) (catch %[[a6]] : !llvm<"i8*">) (filter %[[a2]] : !llvm<"[1 x i8]">) : !llvm<"{ i8*, i32 }"> -// CHECK-NEXT: llvm.br ^bb3 +// CHECK-NEXT: %[[lp:[0-9]+]] = llvm.landingpad cleanup (catch %[[a3]] : !llvm<"i8**">) (catch %[[a6]] : !llvm<"i8*">) (filter %[[a2]] : !llvm<"[1 x i8]">) : !llvm<"{ i8*, i32 }"> +// CHECK-NEXT: llvm.resume %[[lp]] : !llvm<"{ i8*, i32 }"> ^bb1: %10 = llvm.landingpad cleanup (catch %3 : !llvm<"i8**">) (catch %6 : !llvm<"i8*">) (filter %2 : !llvm<"[1 x i8]">) : !llvm<"{ i8*, i32 }"> - llvm.br ^bb3 + llvm.resume %10 : !llvm<"{ i8*, i32 }"> // CHECK-NEXT: ^bb2: // CHECK-NEXT: llvm.return %[[a7]] : !llvm.i32 diff --git a/mlir/test/Dialect/Linalg/fusion-tensor.mlir b/mlir/test/Dialect/Linalg/fusion-tensor.mlir index 458846eda639d..f8999f7ff5cc0 100644 --- a/mlir/test/Dialect/Linalg/fusion-tensor.mlir +++ b/mlir/test/Dialect/Linalg/fusion-tensor.mlir @@ -105,3 +105,28 @@ func @add_broadcast_mul_fusion(%arg0: tensor, %arg1 : tensor, %arg }: tensor, tensor -> tensor return %2 : tensor } + +// ----- + +// CHECK: #[[MAP0:.*]] = affine_map<() -> ()> +#map0 = affine_map<() -> ()> + +// CHECK-LABEL: @add_mul_scalar_fusion +func @add_mul_scalar_fusion(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor +{ + %0 = linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [#map0, #map0, #map0], iterator_types = []} %arg0, %arg1 { + ^bb0(%arg3: f32, %arg4: f32): // no predecessors + %1 = addf %arg3, %arg4 : f32 + linalg.yield %1 : f32 + }: tensor, tensor -> tensor + // CHECK: linalg.generic {args_in = 2 : i64, args_out = 1 : i64 + // CHECK: addf + // CHECK: mulf + %1 = linalg.generic {args_in = 2 : i64, args_out = 1 : i64, indexing_maps = [#map0, #map0, #map0], iterator_types = []} %0, %arg2 { + ^bb0(%arg3: f32, %arg4: f32): // no predecessors + %1 = mulf %arg3, %arg4 : f32 + linalg.yield %1 : f32 + }: tensor, tensor -> tensor + + return %1 : tensor +} diff --git a/mlir/test/Dialect/Linalg/promote.mlir b/mlir/test/Dialect/Linalg/promote.mlir index e9eea206b26e6..a9b860d8f28b0 100644 --- a/mlir/test/Dialect/Linalg/promote.mlir +++ b/mlir/test/Dialect/Linalg/promote.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt %s -linalg-promote-subviews | FileCheck %s -// RUN: mlir-opt %s -linalg-promote-subviews -test-linalg-promote-dynamic | FileCheck %s --check-prefix=DYNAMIC +// RUN: mlir-opt %s -linalg-promote-subviews="test-promote-dynamic" | FileCheck %s --check-prefix=DYNAMIC #map0 = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> #map1 = affine_map<(d0) -> (d0 + 2)> diff --git a/mlir/test/Dialect/Linalg/tile.mlir b/mlir/test/Dialect/Linalg/tile.mlir index c06447f29c0f9..c1903cbd4d34a 100644 --- a/mlir/test/Dialect/Linalg/tile.mlir +++ b/mlir/test/Dialect/Linalg/tile.mlir @@ -1,7 +1,7 @@ -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=2 | FileCheck %s -check-prefix=TILE-2 -// RUN: mlir-opt %s -linalg-tile 
-linalg-tile-sizes=0,2 | FileCheck %s -check-prefix=TILE-02 -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=0,0,2 | FileCheck %s -check-prefix=TILE-002 -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=2,3,4 | FileCheck %s -check-prefix=TILE-234 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2" | FileCheck %s -check-prefix=TILE-2 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,2" | FileCheck %s -check-prefix=TILE-02 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,0,2" | FileCheck %s -check-prefix=TILE-002 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,4" | FileCheck %s -check-prefix=TILE-234 // TILE-2-DAG: #[[strided1D:.*]] = affine_map<(d0)[s0] -> (d0 + s0)> // TILE-02-DAG: #[[strided1D:.*]] = affine_map<(d0)[s0] -> (d0 + s0)> diff --git a/mlir/test/Dialect/Linalg/tile_conv.mlir b/mlir/test/Dialect/Linalg/tile_conv.mlir index 25cabc02efb05..c62b240511e79 100644 --- a/mlir/test/Dialect/Linalg/tile_conv.mlir +++ b/mlir/test/Dialect/Linalg/tile_conv.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=2,3,0,0,4 | FileCheck %s -check-prefix=TILE-23004 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,0,0,4" | FileCheck %s -check-prefix=TILE-23004 // TILE-23004-DAG: #[[D0x30pS0x10:.*]] = affine_map<(d0) -> (d0 * 30)> // TILE-23004-DAG: #[[S0x10p90:.*]] = affine_map<()[s0] -> (s0 * 10 + 90)> diff --git a/mlir/test/Dialect/Linalg/tile_indexed_generic.mlir b/mlir/test/Dialect/Linalg/tile_indexed_generic.mlir index 24619bf404b88..fc1d27a5a2686 100644 --- a/mlir/test/Dialect/Linalg/tile_indexed_generic.mlir +++ b/mlir/test/Dialect/Linalg/tile_indexed_generic.mlir @@ -1,6 +1,6 @@ -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=10,25 | FileCheck %s -check-prefix=TILE-10n25 -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=25,0 | FileCheck %s -check-prefix=TILE-25n0 -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=0,25 | FileCheck %s -check-prefix=TILE-0n25 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=10,25" | FileCheck %s -check-prefix=TILE-10n25 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=25,0" | FileCheck %s -check-prefix=TILE-25n0 +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=0,25" | FileCheck %s -check-prefix=TILE-0n25 #id_1d = affine_map<(i) -> (i)> #pointwise_1d_trait = { diff --git a/mlir/test/Dialect/Linalg/tile_parallel.mlir b/mlir/test/Dialect/Linalg/tile_parallel.mlir index 7db9da0715aa1..caca3a0e795e0 100644 --- a/mlir/test/Dialect/Linalg/tile_parallel.mlir +++ b/mlir/test/Dialect/Linalg/tile_parallel.mlir @@ -1,7 +1,7 @@ -// RUN: mlir-opt %s -linalg-tile-to-parallel-loops -linalg-tile-sizes=2 | FileCheck %s -check-prefix=TILE-2 --dump-input-on-failure -// RUN: mlir-opt %s -linalg-tile-to-parallel-loops -linalg-tile-sizes=0,2 | FileCheck %s -check-prefix=TILE-02 --dump-input-on-failure -// RUN: mlir-opt %s -linalg-tile-to-parallel-loops -linalg-tile-sizes=0,0,2 | FileCheck %s -check-prefix=TILE-002 --dump-input-on-failure -// RUN: mlir-opt %s -linalg-tile-to-parallel-loops -linalg-tile-sizes=2,3,4 | FileCheck %s -check-prefix=TILE-234 --dump-input-on-failure +// RUN: mlir-opt %s -linalg-tile-to-parallel-loops="linalg-tile-sizes=2" | FileCheck %s -check-prefix=TILE-2 --dump-input-on-failure +// RUN: mlir-opt %s -linalg-tile-to-parallel-loops="linalg-tile-sizes=0,2" | FileCheck %s -check-prefix=TILE-02 --dump-input-on-failure +// RUN: mlir-opt %s -linalg-tile-to-parallel-loops="linalg-tile-sizes=0,0,2" | FileCheck %s -check-prefix=TILE-002 --dump-input-on-failure +// RUN: mlir-opt %s 
-linalg-tile-to-parallel-loops="linalg-tile-sizes=2,3,4" | FileCheck %s -check-prefix=TILE-234 --dump-input-on-failure #id_2d = affine_map<(i, j) -> (i, j)> #pointwise_2d_trait = { diff --git a/mlir/test/Dialect/SPIRV/Transforms/abi-simple.mlir b/mlir/test/Dialect/SPIRV/Transforms/abi-interface.mlir similarity index 57% rename from mlir/test/Dialect/SPIRV/Transforms/abi-simple.mlir rename to mlir/test/Dialect/SPIRV/Transforms/abi-interface.mlir index edc66c41591cf..3972def985bb9 100644 --- a/mlir/test/Dialect/SPIRV/Transforms/abi-simple.mlir +++ b/mlir/test/Dialect/SPIRV/Transforms/abi-interface.mlir @@ -1,18 +1,25 @@ // RUN: mlir-opt -spirv-lower-abi-attrs -verify-diagnostics %s -o - | FileCheck %s +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + // CHECK-LABEL: spv.module spv.module Logical GLSL450 { // CHECK-DAG: spv.globalVariable [[VAR0:@.*]] bind(0, 0) : !spv.ptr, StorageBuffer> // CHECK-DAG: spv.globalVariable [[VAR1:@.*]] bind(0, 1) : !spv.ptr [0]>, StorageBuffer> // CHECK: spv.func [[FN:@.*]]() - spv.func @kernel(%arg0: f32 - {spv.interface_var_abi = {binding = 0 : i32, - descriptor_set = 0 : i32, - storage_class = 12 : i32}}, - %arg1: !spv.ptr>, StorageBuffer> - {spv.interface_var_abi = {binding = 1 : i32, - descriptor_set = 0 : i32, - storage_class = 12 : i32}}) "None" + spv.func @kernel( + %arg0: f32 + {spv.interface_var_abi = {binding = 0 : i32, + descriptor_set = 0 : i32, + storage_class = 12 : i32}}, + %arg1: !spv.ptr>, StorageBuffer> + {spv.interface_var_abi = {binding = 1 : i32, + descriptor_set = 0 : i32}}) "None" attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} { // CHECK: [[ARG1:%.*]] = spv._address_of [[VAR1]] // CHECK: [[ADDRESSARG0:%.*]] = spv._address_of [[VAR0]] @@ -24,4 +31,6 @@ spv.module Logical GLSL450 { } // CHECK: spv.EntryPoint "GLCompute" [[FN]] // CHECK: spv.ExecutionMode [[FN]] "LocalSize", 32, 1, 1 -} +} // end spv.module + +} // end module diff --git a/mlir/test/Dialect/SPIRV/Transforms/abi-load-store.mlir b/mlir/test/Dialect/SPIRV/Transforms/abi-load-store.mlir index d8af9fa826075..42ff3f55e1ead 100644 --- a/mlir/test/Dialect/SPIRV/Transforms/abi-load-store.mlir +++ b/mlir/test/Dialect/SPIRV/Transforms/abi-load-store.mlir @@ -1,5 +1,12 @@ // RUN: mlir-opt -spirv-lower-abi-attrs -verify-diagnostics %s -o - | FileCheck %s +module attributes { + spv.target_env = #spv.target_env< + #spv.vce, + {max_compute_workgroup_invocations = 128 : i32, + max_compute_workgroup_size = dense<[128, 128, 64]> : vector<3xi32>}> +} { + // CHECK-LABEL: spv.module spv.module Logical GLSL450 { // CHECK-DAG: spv.globalVariable [[WORKGROUPSIZE:@.*]] built_in("WorkgroupSize") @@ -21,16 +28,13 @@ spv.module Logical GLSL450 { spv.func @load_store_kernel( %arg0: !spv.ptr>>, StorageBuffer> {spv.interface_var_abi = {binding = 0 : i32, - descriptor_set = 0 : i32, - storage_class = 12 : i32}}, + descriptor_set = 0 : i32}}, %arg1: !spv.ptr>>, StorageBuffer> {spv.interface_var_abi = {binding = 1 : i32, - descriptor_set = 0 : i32, - storage_class = 12 : i32}}, + descriptor_set = 0 : i32}}, %arg2: !spv.ptr>>, StorageBuffer> {spv.interface_var_abi = {binding = 2 : i32, - descriptor_set = 0 : i32, - storage_class = 12 : i32}}, + descriptor_set = 0 : i32}}, %arg3: i32 {spv.interface_var_abi = {binding = 3 : i32, descriptor_set = 0 : i32, @@ -122,4 +126,6 @@ spv.module Logical GLSL450 { } // CHECK: spv.EntryPoint 
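To make the relaxed spv.interface_var_abi rule concrete (storage_class stays required on scalar arguments but is now omitted, and rejected, on buffer-like arguments, as the tests above and below show), here is a hypothetical kernel signature sketch; the types, bindings, and GLSL450 environment are illustrative and not taken from this patch:

```mlir
spv.module Logical GLSL450 {
  // Scalar argument keeps the explicit storage class (12 = StorageBuffer);
  // the pointer argument omits it and the ABI lowering fills it in.
  spv.func @kernel(
      %arg0: f32
        {spv.interface_var_abi = {binding = 0 : i32, descriptor_set = 0 : i32,
                                  storage_class = 12 : i32}},
      %arg1: !spv.ptr<!spv.struct<!spv.array<12 x f32 [4]> [0]>, StorageBuffer>
        {spv.interface_var_abi = {binding = 1 : i32, descriptor_set = 0 : i32}}) "None"
  attributes {spv.entry_point_abi = {local_size = dense<[32, 1, 1]> : vector<3xi32>}} {
    spv.Return
  }
}
```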
"GLCompute" [[FN]], [[WORKGROUPID]], [[LOCALINVOCATIONID]], [[NUMWORKGROUPS]], [[WORKGROUPSIZE]] // CHECK-NEXT: spv.ExecutionMode [[FN]] "LocalSize", 32, 1, 1 -} +} // end spv.module + +} // end module diff --git a/mlir/test/Dialect/SPIRV/target-and-abi.mlir b/mlir/test/Dialect/SPIRV/target-and-abi.mlir index a28ca29e0ab9e..2c380e8ff0396 100644 --- a/mlir/test/Dialect/SPIRV/target-and-abi.mlir +++ b/mlir/test/Dialect/SPIRV/target-and-abi.mlir @@ -14,7 +14,7 @@ func @unknown_attr_on_region(%arg: i32 {spv.something}) { // ----- -// expected-error @+1 {{found unsupported 'spv.something' attribute on region result}} +// expected-error @+1 {{cannot attach SPIR-V attributes to region result}} func @unknown_attr_on_region() -> (i32 {spv.something}) { %0 = constant 10.0 : f32 return %0: f32 @@ -51,14 +51,14 @@ func @spv_entry_point() attributes { // spv.interface_var_abi //===----------------------------------------------------------------------===// -// expected-error @+1 {{'spv.interface_var_abi' attribute must be a dictionary attribute containing three 32-bit integer attributes: 'descriptor_set', 'binding', and 'storage_class'}} +// expected-error @+1 {{'spv.interface_var_abi' attribute must be a dictionary attribute containing two or three 32-bit integer attributes: 'descriptor_set', 'binding', and optional 'storage_class'}} func @interface_var( %arg0 : f32 {spv.interface_var_abi = 64} ) { return } // ----- -// expected-error @+1 {{'spv.interface_var_abi' attribute must be a dictionary attribute containing three 32-bit integer attributes: 'descriptor_set', 'binding', and 'storage_class'}} +// expected-error @+1 {{'spv.interface_var_abi' attribute must be a dictionary attribute containing two or three 32-bit integer attributes: 'descriptor_set', 'binding', and optional 'storage_class'}} func @interface_var( %arg0 : f32 {spv.interface_var_abi = {binding = 0: i32}} ) { return } @@ -74,31 +74,12 @@ func @interface_var( // ----- -// expected-error @+1 {{'spv.interface_var_abi' attribute must be a dictionary attribute containing three 32-bit integer attributes: 'descriptor_set', 'binding', and 'storage_class'}} -func @interface_var() -> (f32 {spv.interface_var_abi = 64}) -{ - %0 = constant 10.0 : f32 - return %0: f32 -} - -// ----- - -// expected-error @+1 {{'spv.interface_var_abi' attribute must be a dictionary attribute containing three 32-bit integer attributes: 'descriptor_set', 'binding', and 'storage_class'}} -func @interface_var() -> (f32 {spv.interface_var_abi = {binding = 0: i32}}) -{ - %0 = constant 10.0 : f32 - return %0: f32 -} - -// ----- - -// CHECK: {spv.interface_var_abi = {binding = 0 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32}} -func @interface_var() -> (f32 {spv.interface_var_abi = { - binding = 0 : i32, descriptor_set = 0 : i32, storage_class = 12 : i32}}) -{ - %0 = constant 10.0 : f32 - return %0: f32 -} +// expected-error @+1 {{'spv.interface_var_abi' attribute cannot specify storage class when attaching to a non-scalar value}} +func @interface_var( + %arg0 : memref<4xf32> {spv.interface_var_abi = {binding = 0 : i32, + descriptor_set = 0 : i32, + storage_class = 12 : i32}} +) { return } // ----- diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index 91f6850779a98..bb5eca0a361b4 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -1046,3 +1046,45 @@ func @reduce_unsupported_rank(%arg0: vector<4x16xf32>) -> f32 { // expected-error@+1 {{'vector.reduction' op unsupported reduction 
rank: 2}} %0 = vector.reduction "add", %arg0 : vector<4x16xf32> into f32 } + +// ----- + +func @transpose_rank_mismatch(%arg0: vector<4x16x11xf32>) { + // expected-error@+1 {{'vector.transpose' op vector result rank mismatch: 1}} + %0 = vector.transpose %arg0, [2, 1, 0] : vector<4x16x11xf32> to vector<100xf32> +} + +// ----- + +func @transpose_length_mismatch(%arg0: vector<4x4xf32>) { + // expected-error@+1 {{'vector.transpose' op transposition length mismatch: 3}} + %0 = vector.transpose %arg0, [2, 0, 1] : vector<4x4xf32> to vector<4x4xf32> +} + +// ----- + +func @transpose_index_oob(%arg0: vector<4x4xf32>) { + // expected-error@+1 {{'vector.transpose' op transposition index out of range: 2}} + %0 = vector.transpose %arg0, [2, 0] : vector<4x4xf32> to vector<4x4xf32> +} + +// ----- + +func @transpose_index_dup(%arg0: vector<4x4xf32>) { + // expected-error@+1 {{'vector.transpose' op duplicate position index: 0}} + %0 = vector.transpose %arg0, [0, 0] : vector<4x4xf32> to vector<4x4xf32> +} + +// ----- + +func @transpose_dim_size_mismatch(%arg0: vector<11x7x3x2xi32>) { + // expected-error@+1 {{'vector.transpose' op dimension size mismatch at: 0}} + %0 = vector.transpose %arg0, [3, 0, 1, 2] : vector<11x7x3x2xi32> to vector<2x3x7x11xi32> +} + +// ----- + +func @type_cast_layout(%arg0: memref<4x3xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s0 + d1 * s1 + s2)>>) { + // expected-error@+1 {{expects operand to be a memref with no layout}} + %0 = vector.type_cast %arg0: memref<4x3xf32, affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s0 + d1 * s1 + s2)>> to memref> +} diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir index f286b932a4721..a3b3fcc9c23c5 100644 --- a/mlir/test/Dialect/Vector/ops.mlir +++ b/mlir/test/Dialect/Vector/ops.mlir @@ -315,3 +315,15 @@ func @reduce_int(%arg0: vector<16xi32>) -> i32 { // CHECK: return %[[X]] : i32 return %0 : i32 } + +// CHECK-LABEL: transpose_fp +func @transpose_fp(%arg0: vector<3x7xf32>) -> vector<7x3xf32> { + %0 = vector.transpose %arg0, [1, 0] : vector<3x7xf32> to vector<7x3xf32> + return %0 : vector<7x3xf32> +} + +// CHECK-LABEL: transpose_int +func @transpose_int(%arg0: vector<11x7x3x2xi32>) -> vector<2x11x7x3xi32> { + %0 = vector.transpose %arg0, [3, 0, 1, 2] : vector<11x7x3x2xi32> to vector<2x11x7x3xi32> + return %0 : vector<2x11x7x3xi32> +} diff --git a/mlir/test/Dialect/Vector/vector-contract-transforms.mlir b/mlir/test/Dialect/Vector/vector-contract-transforms.mlir index bed90d6341d94..051c42d32ed55 100644 --- a/mlir/test/Dialect/Vector/vector-contract-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-contract-transforms.mlir @@ -296,6 +296,28 @@ func @outerproduct_acc(%arg0: vector<2xf32>, return %0: vector<2x3xf32> } +// CHECK-LABEL: func @transpose23 +// CHECK-SAME: %[[A:.*]]: vector<2x3xf32> +// CHECK: %[[Z:.*]] = constant dense<0.000000e+00> : vector<3x2xf32> +// CHECK: %[[T0:.*]] = vector.extract %[[A]][0, 0] : vector<2x3xf32> +// CHECK: %[[T1:.*]] = vector.insert %[[T0]], %[[Z]] [0, 0] : f32 into vector<3x2xf32> +// CHECK: %[[T2:.*]] = vector.extract %[[A]][1, 0] : vector<2x3xf32> +// CHECK: %[[T3:.*]] = vector.insert %[[T2]], %[[T1]] [0, 1] : f32 into vector<3x2xf32> +// CHECK: %[[T4:.*]] = vector.extract %[[A]][0, 1] : vector<2x3xf32> +// CHECK: %[[T5:.*]] = vector.insert %[[T4]], %[[T3]] [1, 0] : f32 into vector<3x2xf32> +// CHECK: %[[T6:.*]] = vector.extract %[[A]][1, 1] : vector<2x3xf32> +// CHECK: %[[T7:.*]] = vector.insert %[[T6]], %[[T5]] [1, 1] : f32 into vector<3x2xf32> +// CHECK: %[[T8:.*]] = 
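A standalone sketch of the vector.transpose op these new tests cover; the permutation array maps each result dimension to the input dimension it reads from (the function name is illustrative):

```mlir
func @transpose_example(%arg0: vector<2x3xf32>) -> vector<3x2xf32> {
  // Result dim 0 takes input dim 1 (size 3); result dim 1 takes input dim 0 (size 2).
  %0 = vector.transpose %arg0, [1, 0] : vector<2x3xf32> to vector<3x2xf32>
  return %0 : vector<3x2xf32>
}
```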
vector.extract %[[A]][0, 2] : vector<2x3xf32> +// CHECK: %[[T9:.*]] = vector.insert %[[T8]], %[[T7]] [2, 0] : f32 into vector<3x2xf32> +// CHECK: %[[T10:.*]] = vector.extract %[[A]][1, 2] : vector<2x3xf32> +// CHECK: %[[T11:.*]] = vector.insert %[[T10]], %[[T9]] [2, 1] : f32 into vector<3x2xf32> +// CHECK: return %[[T11]] : vector<3x2xf32> + +func @transpose23(%arg0: vector<2x3xf32>) -> vector<3x2xf32> { + %0 = vector.transpose %arg0, [1, 0] : vector<2x3xf32> to vector<3x2xf32> + return %0 : vector<3x2xf32> +} + // Shape up and downcasts for 2-D vectors, for supporting conversion to // llvm.matrix operations // CHECK-LABEL: func @shape_casts diff --git a/mlir/test/EDSC/CMakeLists.txt b/mlir/test/EDSC/CMakeLists.txt index c3c0cf35497b7..6c2f5f9fd0beb 100644 --- a/mlir/test/EDSC/CMakeLists.txt +++ b/mlir/test/EDSC/CMakeLists.txt @@ -6,7 +6,7 @@ llvm_update_compile_flags(mlir-edsc-builder-api-test) target_link_libraries(mlir-edsc-builder-api-test PRIVATE - MLIRAffineOps + MLIRAffine MLIREDSC MLIRIR MLIRLinalgEDSC @@ -22,7 +22,7 @@ target_link_libraries(mlir-edsc-builder-api-test target_include_directories(mlir-edsc-builder-api-test PRIVATE ..) whole_archive_link(mlir-edsc-builder-api-test - MLIRAffineOps + MLIRAffine MLIRLinalgOps MLIRLoopOps MLIRStandardOps diff --git a/mlir/test/EDSC/builder-api-test.cpp b/mlir/test/EDSC/builder-api-test.cpp index b60d5894df030..a9bb0b62dbdf9 100644 --- a/mlir/test/EDSC/builder-api-test.cpp +++ b/mlir/test/EDSC/builder-api-test.cpp @@ -8,7 +8,7 @@ // RUN: mlir-edsc-builder-api-test | FileCheck %s -dump-input-on-failure -#include "mlir/Dialect/AffineOps/EDSC/Intrinsics.h" +#include "mlir/Dialect/Affine/EDSC/Intrinsics.h" #include "mlir/Dialect/Linalg/EDSC/Intrinsics.h" #include "mlir/Dialect/LoopOps/EDSC/Builders.h" #include "mlir/Dialect/StandardOps/EDSC/Intrinsics.h" @@ -38,7 +38,7 @@ using namespace mlir::edsc::intrinsics; static MLIRContext &globalContext() { static bool init_once = []() { - registerDialect(); + registerDialect(); registerDialect(); registerDialect(); registerDialect(); diff --git a/mlir/test/IR/parser.mlir b/mlir/test/IR/parser.mlir index a6dc9b6617b3a..45ee1e1d89cd3 100644 --- a/mlir/test/IR/parser.mlir +++ b/mlir/test/IR/parser.mlir @@ -1185,3 +1185,49 @@ func @custom_asm_names() -> (i32, i32, i32, i32, i32, i32, i32) { // CHECK: return %[[FIRST]], %[[MIDDLE]]#0, %[[MIDDLE]]#1, %[[LAST]], %[[FIRST_2]], %[[LAST_2]] return %0, %1#0, %1#1, %2, %3, %4, %5 : i32, i32, i32, i32, i32, i32, i32 } + + +// CHECK-LABEL: func @pretty_names + +// This tests the behavior +func @pretty_names() { + // Simple case, should parse and print as %x being an implied 'name' + // attribute. + %x = test.string_attr_pretty_name + // CHECK: %x = test.string_attr_pretty_name + // CHECK-NOT: attributes + + // This specifies an explicit name, which should override the result. + %YY = test.string_attr_pretty_name attributes { names = ["y"] } + // CHECK: %y = test.string_attr_pretty_name + // CHECK-NOT: attributes + + // Conflicts with the 'y' name, so need an explicit attribute. + %0 = "test.string_attr_pretty_name"() { names = ["y"]} : () -> i32 + // CHECK: %y_0 = test.string_attr_pretty_name attributes {names = ["y"]} + + // Name contains a space. + %1 = "test.string_attr_pretty_name"() { names = ["space name"]} : () -> i32 + // CHECK: %space_name = test.string_attr_pretty_name attributes {names = ["space name"]} + + "unknown.use"(%x, %YY, %0, %1) : (i32, i32, i32, i32) -> () + + // Multi-result support. 
+ + %a, %b, %c = test.string_attr_pretty_name + // CHECK: %a, %b, %c = test.string_attr_pretty_name + // CHECK-NOT: attributes + + %q:3, %r = test.string_attr_pretty_name + // CHECK: %q, %q_1, %q_2, %r = test.string_attr_pretty_name attributes {names = ["q", "q", "q", "r"]} + + // CHECK: return + return +} + +// CHECK-LABEL: func @zero_whitespace() { +// CHECK-NEXT: return +func @zero_whitespace() { + // This is a \0 byte. + return +} diff --git a/mlir/test/Target/avx512.mlir b/mlir/test/Target/avx512.mlir new file mode 100644 index 0000000000000..5e75a98dc4ef8 --- /dev/null +++ b/mlir/test/Target/avx512.mlir @@ -0,0 +1,31 @@ +// RUN: mlir-opt -verify-diagnostics %s | mlir-opt | mlir-translate --avx512-mlir-to-llvmir | FileCheck %s + +// CHECK-LABEL: define <16 x float> @LLVM_x86_avx512_mask_ps_512 +llvm.func @LLVM_x86_avx512_mask_ps_512(%a: !llvm<"<16 x float>">, + %b: !llvm.i32, + %c: !llvm.i16) + -> (!llvm<"<16 x float>">) +{ + // CHECK: call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> + %0 = "llvm_avx512.mask.rndscale.ps.512"(%a, %b, %a, %c, %b) : + (!llvm<"<16 x float>">, !llvm.i32, !llvm<"<16 x float>">, !llvm.i16, !llvm.i32) -> !llvm<"<16 x float>"> + // CHECK: call <16 x float> @llvm.x86.avx512.mask.scalef.ps.512(<16 x float> + %1 = "llvm_avx512.mask.scalef.ps.512"(%a, %a, %a, %c, %b) : + (!llvm<"<16 x float>">, !llvm<"<16 x float>">, !llvm<"<16 x float>">, !llvm.i16, !llvm.i32) -> !llvm<"<16 x float>"> + llvm.return %1: !llvm<"<16 x float>"> +} + +// CHECK-LABEL: define <8 x double> @LLVM_x86_avx512_mask_pd_512 +llvm.func @LLVM_x86_avx512_mask_pd_512(%a: !llvm<"<8 x double>">, + %b: !llvm.i32, + %c: !llvm.i8) + -> (!llvm<"<8 x double>">) +{ + // CHECK: call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> + %0 = "llvm_avx512.mask.rndscale.pd.512"(%a, %b, %a, %c, %b) : + (!llvm<"<8 x double>">, !llvm.i32, !llvm<"<8 x double>">, !llvm.i8, !llvm.i32) -> !llvm<"<8 x double>"> + // CHECK: call <8 x double> @llvm.x86.avx512.mask.scalef.pd.512(<8 x double> + %1 = "llvm_avx512.mask.scalef.pd.512"(%a, %a, %a, %c, %b) : + (!llvm<"<8 x double>">, !llvm<"<8 x double>">, !llvm<"<8 x double>">, !llvm.i8, !llvm.i32) -> !llvm<"<8 x double>"> + llvm.return %1: !llvm<"<8 x double>"> +} diff --git a/mlir/test/Target/import.ll b/mlir/test/Target/import.ll index 0394309093f29..23fc219168888 100644 --- a/mlir/test/Target/import.ll +++ b/mlir/test/Target/import.ll @@ -282,8 +282,7 @@ define i32 @invokeLandingpad() personality i8* bitcast (i32 (...)* @__gxx_person ; FIXME: Change filter to a constant array once they are handled. ; Currently, even though it parses this, LLVM module is broken filter [1 x i8] [i8 1] - ; CHECK: llvm.br ^bb3 - br label %5 + resume { i8*, i32 } %3 ; CHECK: ^bb2: ; CHECK: llvm.return %{{[0-9]+}} : !llvm.i32 diff --git a/mlir/test/Target/llvmir.mlir b/mlir/test/Target/llvmir.mlir index 43cc7d804daec..59c43b82cca5f 100644 --- a/mlir/test/Target/llvmir.mlir +++ b/mlir/test/Target/llvmir.mlir @@ -1137,7 +1137,7 @@ llvm.func @bar(!llvm<"i8*">) -> !llvm<"i8*"> llvm.func @__gxx_personality_v0(...) 
-> !llvm.i32 // CHECK-LABEL: @invokeLandingpad -llvm.func @invokeLandingpad() -> !llvm.i32 { +llvm.func @invokeLandingpad() -> !llvm.i32 attributes { personality = @__gxx_personality_v0 } { // CHECK: %[[a1:[0-9]+]] = alloca i8 %0 = llvm.mlir.constant(0 : i32) : !llvm.i32 %1 = llvm.mlir.constant("\01") : !llvm<"[1 x i8]"> diff --git a/mlir/test/Transforms/test-legalizer.mlir b/mlir/test/Transforms/test-legalizer.mlir index 38f87dd2302cc..dd8330626551f 100644 --- a/mlir/test/Transforms/test-legalizer.mlir +++ b/mlir/test/Transforms/test-legalizer.mlir @@ -23,6 +23,13 @@ func @remap_input_1_to_1(%arg0: i64) { "test.invalid"(%arg0) : (i64) -> () } +// CHECK-LABEL: func @remap_call_1_to_1(%arg0: f64) +func @remap_call_1_to_1(%arg0: i64) { + // CHECK-NEXT: call @remap_input_1_to_1(%arg0) : (f64) -> () + call @remap_input_1_to_1(%arg0) : (i64) -> () + return +} + // CHECK-LABEL: func @remap_input_1_to_N({{.*}}f16, {{.*}}f16) func @remap_input_1_to_N(%arg0: f32) -> f32 { // CHECK-NEXT: "test.return"{{.*}} : (f16, f16) -> () diff --git a/mlir/test/lib/Dialect/Affine/CMakeLists.txt b/mlir/test/lib/Dialect/Affine/CMakeLists.txt new file mode 100644 index 0000000000000..0d6cd2aa9edcc --- /dev/null +++ b/mlir/test/lib/Dialect/Affine/CMakeLists.txt @@ -0,0 +1,16 @@ +add_llvm_library(MLIRAffineTransformsTestPasses + TestAffineDataCopy.cpp + TestParallelismDetection.cpp + TestVectorizationUtils.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Affine + ${MLIR_MAIN_INCLUDE_DIR}/mlir/IR + ) + +target_link_libraries(MLIRAffineTransformsTestPasses PRIVATE + MLIRIR + MLIRPass + MLIRAffineTransforms + MLIRSupport + ) diff --git a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp similarity index 97% rename from mlir/test/lib/Transforms/TestAffineDataCopy.cpp rename to mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp index 966df287359ad..35c374c2ee910 100644 --- a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp +++ b/mlir/test/lib/Dialect/Affine/TestAffineDataCopy.cpp @@ -11,9 +11,8 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Analysis/Passes.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" diff --git a/mlir/test/lib/Transforms/TestParallelismDetection.cpp b/mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp similarity index 94% rename from mlir/test/lib/Transforms/TestParallelismDetection.cpp rename to mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp index 7c16a259723fd..1140dab92dbb7 100644 --- a/mlir/test/lib/Transforms/TestParallelismDetection.cpp +++ b/mlir/test/lib/Dialect/Affine/TestParallelismDetection.cpp @@ -10,9 +10,8 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Analysis/Passes.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" @@ -27,7 +26,6 @@ struct TestParallelismDetection } // end anonymous namespace - // Walks the function and emits a note for all 'affine.for' ops detected as // parallel. 
void TestParallelismDetection::runOnFunction() { diff --git a/mlir/test/lib/Transforms/TestVectorizationUtils.cpp b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp similarity index 98% rename from mlir/test/lib/Transforms/TestVectorizationUtils.cpp rename to mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp index 4ae4509bc56d2..01382530fa397 100644 --- a/mlir/test/lib/Transforms/TestVectorizationUtils.cpp +++ b/mlir/test/lib/Dialect/Affine/TestVectorizationUtils.cpp @@ -13,7 +13,7 @@ #include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/NestedMatcher.h" #include "mlir/Analysis/SliceAnalysis.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Vector/VectorUtils.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Diagnostics.h" @@ -27,7 +27,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#define DEBUG_TYPE "affine-vectorizer-test" +#define DEBUG_TYPE "affine-super-vectorizer-test" using namespace mlir; @@ -284,6 +284,7 @@ void VectorizerTestPass::runOnFunction() { namespace mlir { void registerVectorizerTestPass() { PassRegistration pass( - "affine-vectorizer-test", "Tests vectorizer standalone functionality."); + "affine-super-vectorizer-test", + "Tests vectorizer standalone functionality."); } } // namespace mlir diff --git a/mlir/test/lib/Dialect/CMakeLists.txt b/mlir/test/lib/Dialect/CMakeLists.txt index cc1766c6127a5..160fe9c203da9 100644 --- a/mlir/test/lib/Dialect/CMakeLists.txt +++ b/mlir/test/lib/Dialect/CMakeLists.txt @@ -1 +1,2 @@ +add_subdirectory(Affine) add_subdirectory(SPIRV) diff --git a/mlir/test/lib/Dialect/SPIRV/TestAvailability.cpp b/mlir/test/lib/Dialect/SPIRV/TestAvailability.cpp index a91800d68fc04..ad77e7d05f42f 100644 --- a/mlir/test/lib/Dialect/SPIRV/TestAvailability.cpp +++ b/mlir/test/lib/Dialect/SPIRV/TestAvailability.cpp @@ -130,7 +130,12 @@ void ConvertToTargetEnv::runOnFunction() { auto targetEnv = fn.getOperation() ->getAttr(spirv::getTargetEnvAttrName()) .cast(); - auto target = spirv::SPIRVConversionTarget::get(targetEnv, context); + if (!targetEnv) { + fn.emitError("missing 'spv.target_env' attribute"); + return signalPassFailure(); + } + + auto target = spirv::SPIRVConversionTarget::get(targetEnv); OwningRewritePatternList patterns; patterns.insert names; + auto *context = result.getContext(); + + for (size_t i = 0, e = parser.getNumResults(); i != e; ++i) { + auto resultName = parser.getResultName(i); + StringRef nameStr; + if (!resultName.first.empty() && !isdigit(resultName.first[0])) + nameStr = resultName.first; + + names.push_back(nameStr); + } + + auto namesAttr = parser.getBuilder().getStrArrayAttr(names); + result.attributes.push_back({Identifier::get("names", context), namesAttr}); + return success(); +} + +static void print(OpAsmPrinter &p, StringAttrPrettyNameOp op) { + p << "test.string_attr_pretty_name"; + + // Note that we only need to print the "name" attribute if the asmprinter + // result name disagrees with it. This can happen in strange cases, e.g. + // when there are conflicts. 
+ bool namesDisagree = op.names().size() != op.getNumResults(); + + SmallString<32> resultNameStr; + for (size_t i = 0, e = op.getNumResults(); i != e && !namesDisagree; ++i) { + resultNameStr.clear(); + llvm::raw_svector_ostream tmpStream(resultNameStr); + p.printOperand(op.getResult(i), tmpStream); + + auto expectedName = op.names()[i].dyn_cast(); + if (!expectedName || + tmpStream.str().drop_front() != expectedName.getValue()) { + namesDisagree = true; + } + } + + if (namesDisagree) + p.printOptionalAttrDictWithKeyword(op.getAttrs()); + else + p.printOptionalAttrDictWithKeyword(op.getAttrs(), {"names"}); +} + +// We set the SSA name in the asm syntax to the contents of the name +// attribute. +void StringAttrPrettyNameOp::getAsmResultNames( + function_ref setNameFn) { + + auto value = names(); + for (size_t i = 0, e = value.size(); i != e; ++i) + if (auto str = value[i].dyn_cast()) + if (!str.getValue().empty()) + setNameFn(getResult(i), str.getValue()); +} + //===----------------------------------------------------------------------===// // Dialect Registration //===----------------------------------------------------------------------===// diff --git a/mlir/test/lib/TestDialect/TestOps.td b/mlir/test/lib/TestDialect/TestOps.td index cf0ec63fe6c59..80783001a4c97 100644 --- a/mlir/test/lib/TestDialect/TestOps.td +++ b/mlir/test/lib/TestDialect/TestOps.td @@ -496,6 +496,18 @@ def AttrSizedResultOp : TEST_Op<"attr_sized_results", ); } +// This is used to test encoding of a string attribute into an SSA name of a +// pretty printed value name. +def StringAttrPrettyNameOp + : TEST_Op<"string_attr_pretty_name", + [DeclareOpInterfaceMethods]> { + let arguments = (ins StrArrayAttr:$names); + let results = (outs Variadic:$r); + + let printer = [{ return ::print(p, *this); }]; + let parser = [{ return ::parse$cppClass(parser, result); }]; +} + //===----------------------------------------------------------------------===// // Test Patterns //===----------------------------------------------------------------------===// diff --git a/mlir/test/lib/TestDialect/TestPatterns.cpp b/mlir/test/lib/TestDialect/TestPatterns.cpp index c7235b8cb3a54..0b73f09c1943a 100644 --- a/mlir/test/lib/TestDialect/TestPatterns.cpp +++ b/mlir/test/lib/TestDialect/TestPatterns.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "TestDialect.h" +#include "mlir/Conversion/StandardToStandard/StandardToStandard.h" #include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" @@ -381,6 +382,8 @@ struct TestLegalizePatternDriver patterns.insert(&getContext(), converter); mlir::populateFuncOpTypeConversionPattern(patterns, &getContext(), converter); + mlir::populateCallOpTypeConversionPattern(patterns, &getContext(), + converter); // Define the conversion target used for the test. 
ConversionTarget target(getContext()); diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt index bc737a0a119fb..b4726439c83f7 100644 --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -1,8 +1,8 @@ add_llvm_library(MLIRTestTransforms - TestAffineDataCopy.cpp TestAllReduceLowering.cpp TestCallGraph.cpp TestConstantFold.cpp + TestConvertGPUKernelToCubin.cpp TestLoopFusion.cpp TestGpuMemoryPromotion.cpp TestGpuParallelLoopMapping.cpp @@ -15,10 +15,8 @@ add_llvm_library(MLIRTestTransforms TestMemRefBoundCheck.cpp TestMemRefDependenceCheck.cpp TestMemRefStrideCalculation.cpp - TestParallelismDetection.cpp TestVectorToLoopsConversion.cpp TestVectorTransforms.cpp - TestVectorizationUtils.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Transforms @@ -36,15 +34,17 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR}/../DeclarativeTransforms) target_link_libraries(MLIRTestTransforms PUBLIC - MLIRAffineOps + MLIRAffine MLIRAnalysis MLIREDSC MLIRGPU + MLIRGPUtoCUDATransforms MLIRLinalgOps MLIRLinalgTransforms MLIRLoopOps MLIRGPU MLIRPass + MLIRStandardToStandard MLIRTestDialect MLIRTransformUtils MLIRVectorToLoops diff --git a/mlir/test/lib/Transforms/TestConstantFold.cpp b/mlir/test/lib/Transforms/TestConstantFold.cpp index f45b0ae18d504..cc6ece7f7c466 100644 --- a/mlir/test/lib/Transforms/TestConstantFold.cpp +++ b/mlir/test/lib/Transforms/TestConstantFold.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Function.h" diff --git a/mlir/test/lib/Transforms/TestConvertGPUKernelToCubin.cpp b/mlir/test/lib/Transforms/TestConvertGPUKernelToCubin.cpp new file mode 100644 index 0000000000000..e0c4c1907c4f5 --- /dev/null +++ b/mlir/test/lib/Transforms/TestConvertGPUKernelToCubin.cpp @@ -0,0 +1,31 @@ +//===- TestConvertGPUKernelToCubin.cpp - Test gpu kernel cubin lowering ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassManager.h" +using namespace mlir; + +#if MLIR_CUDA_CONVERSIONS_ENABLED +static OwnedCubin compilePtxToCubinForTesting(const std::string &, Location, + StringRef) { + const char data[] = "CUBIN"; + return std::make_unique>(data, data + sizeof(data) - 1); +} + +namespace mlir { +void registerTestConvertGPUKernelToCubinPass() { + PassPipelineRegistration<>("test-kernel-to-cubin", + "Convert all kernel functions to CUDA cubin blobs", + [](OpPassManager &pm) { + pm.addPass(createConvertGPUKernelToCubinPass( + compilePtxToCubinForTesting)); + }); +} +} // namespace mlir +#endif diff --git a/mlir/test/lib/Transforms/TestLoopFusion.cpp b/mlir/test/lib/Transforms/TestLoopFusion.cpp index d650288836d19..4d63e412aab69 100644 --- a/mlir/test/lib/Transforms/TestLoopFusion.cpp +++ b/mlir/test/lib/Transforms/TestLoopFusion.cpp @@ -12,9 +12,8 @@ #include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/AffineStructures.h" -#include "mlir/Analysis/Passes.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" diff --git a/mlir/test/lib/Transforms/TestMemRefBoundCheck.cpp b/mlir/test/lib/Transforms/TestMemRefBoundCheck.cpp index e107bf81cfe20..ef566de1391e4 100644 --- a/mlir/test/lib/Transforms/TestMemRefBoundCheck.cpp +++ b/mlir/test/lib/Transforms/TestMemRefBoundCheck.cpp @@ -14,9 +14,8 @@ #include "mlir/ADT/TypeSwitch.h" #include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/AffineStructures.h" -#include "mlir/Analysis/Passes.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" @@ -35,10 +34,6 @@ struct TestMemRefBoundCheck : public FunctionPass { } // end anonymous namespace -std::unique_ptr> mlir::createTestMemRefBoundCheckPass() { - return std::make_unique(); -} - void TestMemRefBoundCheck::runOnFunction() { getFunction().walk([](Operation *opInst) { TypeSwitch(opInst).Case( diff --git a/mlir/test/lib/Transforms/TestMemRefDependenceCheck.cpp b/mlir/test/lib/Transforms/TestMemRefDependenceCheck.cpp index e2d0c873f9597..2803c1d9dccc0 100644 --- a/mlir/test/lib/Transforms/TestMemRefDependenceCheck.cpp +++ b/mlir/test/lib/Transforms/TestMemRefDependenceCheck.cpp @@ -12,9 +12,8 @@ #include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/AffineStructures.h" -#include "mlir/Analysis/Passes.h" #include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" @@ -36,11 +35,6 @@ struct TestMemRefDependenceCheck } // end anonymous namespace -std::unique_ptr> -mlir::createTestMemRefDependenceCheckPass() { - return std::make_unique(); -} - // Returns a result string which represents the direction vector (if there was // a dependence), returns the string "false" otherwise. 
static std::string diff --git a/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir b/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir index 306d86f03739b..c7676f7031f32 100644 --- a/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir +++ b/mlir/test/mlir-cpu-runner/linalg_integration_test.mlir @@ -2,8 +2,8 @@ // RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm | mlir-cpu-runner -e dot -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s // RUN: mlir-opt %s -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s // RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=2,3,4 -linalg-promote-subviews -convert-linalg-to-loops -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s -// RUN: mlir-opt %s -linalg-tile -linalg-tile-sizes=2,3,4 -linalg-promote-subviews -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,4" -linalg-promote-subviews -convert-linalg-to-loops -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s +// RUN: mlir-opt %s -linalg-tile="linalg-tile-sizes=2,3,4" -linalg-promote-subviews -convert-linalg-to-llvm | mlir-cpu-runner -e matmul -entry-point-result=f32 -shared-libs=%linalg_test_lib_dir/libcblas%shlibext,%linalg_test_lib_dir/libcblas_interface%shlibext | FileCheck %s #strided1D = affine_map<(d0) -> (d0)> #strided2D = affine_map<(d0, d1)[s0] -> (d0 * s0 + d1)> diff --git a/mlir/test/mlir-cuda-runner/all-reduce-and.mlir b/mlir/test/mlir-cuda-runner/all-reduce-and.mlir index edf3e029b9e85..8eb2e0d72ec5b 100644 --- a/mlir/test/mlir-cuda-runner/all-reduce-and.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-and.mlir @@ -2,9 +2,7 @@ func @main() { %data = alloc() : memref<2x6xi32> - %sum_and = alloc() : memref<2xi32> - %sum_or = alloc() : memref<2xi32> - %sum_min = alloc() : memref<2xi32> + %sum = alloc() : memref<2xi32> %cst0 = constant 0 : i32 %cst1 = constant 1 : i32 %cst2 = constant 2 : i32 @@ -25,7 +23,12 @@ func @main() { %c4 = constant 4 : index %c5 = constant 5 : index %c6 = constant 6 : index - + + %cast_data = memref_cast %data : memref<2x6xi32> to memref + call @mcuMemHostRegisterMemRef2dInt32(%cast_data) : (memref) -> () + %cast_sum = memref_cast %sum : memref<2xi32> to memref + call @mcuMemHostRegisterMemRef1dInt32(%cast_sum) : (memref) -> () + store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> store %cst2, %data[%c0, %c2] : memref<2x6xi32> @@ -44,17 +47,19 @@ func @main() { gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1) threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) { %val = load %data[%bx, %tx] : memref<2x6xi32> - 
%reduced_and = "gpu.all_reduce"(%val) ({}) { op = "and" } : (i32) -> (i32) - store %reduced_and, %sum_and[%bx] : memref<2xi32> + %reduced = "gpu.all_reduce"(%val) ({}) { op = "and" } : (i32) -> (i32) + store %reduced, %sum[%bx] : memref<2xi32> gpu.terminator } - %ptr_and = memref_cast %sum_and : memref<2xi32> to memref<*xi32> - call @print_memref_i32(%ptr_and) : (memref<*xi32>) -> () + %ptr = memref_cast %sum : memref<2xi32> to memref<*xi32> + call @print_memref_i32(%ptr) : (memref<*xi32>) -> () // CHECK: [0, 2] return } +func @mcuMemHostRegisterMemRef1dInt32(%ptr : memref) +func @mcuMemHostRegisterMemRef2dInt32(%ptr : memref) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-max.mlir b/mlir/test/mlir-cuda-runner/all-reduce-max.mlir index 6ed27ccb9d4b3..1213625fa9a0f 100644 --- a/mlir/test/mlir-cuda-runner/all-reduce-max.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-max.mlir @@ -23,7 +23,12 @@ func @main() { %c4 = constant 4 : index %c5 = constant 5 : index %c6 = constant 6 : index - + + %cast_data = memref_cast %data : memref<2x6xi32> to memref + call @mcuMemHostRegisterMemRef2dInt32(%cast_data) : (memref) -> () + %cast_sum = memref_cast %sum : memref<2xi32> to memref + call @mcuMemHostRegisterMemRef1dInt32(%cast_sum) : (memref) -> () + store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> store %cst2, %data[%c0, %c2] : memref<2x6xi32> @@ -54,5 +59,7 @@ func @main() { return } +func @mcuMemHostRegisterMemRef1dInt32(%ptr : memref) +func @mcuMemHostRegisterMemRef2dInt32(%ptr : memref) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-min.mlir b/mlir/test/mlir-cuda-runner/all-reduce-min.mlir index 2165fe58ce499..f467c80027c2c 100644 --- a/mlir/test/mlir-cuda-runner/all-reduce-min.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-min.mlir @@ -23,7 +23,12 @@ func @main() { %c4 = constant 4 : index %c5 = constant 5 : index %c6 = constant 6 : index - + + %cast_data = memref_cast %data : memref<2x6xi32> to memref + call @mcuMemHostRegisterMemRef2dInt32(%cast_data) : (memref) -> () + %cast_sum = memref_cast %sum : memref<2xi32> to memref + call @mcuMemHostRegisterMemRef1dInt32(%cast_sum) : (memref) -> () + store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> store %cst2, %data[%c0, %c2] : memref<2x6xi32> @@ -54,5 +59,7 @@ func @main() { return } +func @mcuMemHostRegisterMemRef1dInt32(%ptr : memref) +func @mcuMemHostRegisterMemRef2dInt32(%ptr : memref) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/all-reduce-or.mlir b/mlir/test/mlir-cuda-runner/all-reduce-or.mlir index 2091c22356f5f..3135970620dc9 100644 --- a/mlir/test/mlir-cuda-runner/all-reduce-or.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-or.mlir @@ -23,7 +23,12 @@ func @main() { %c4 = constant 4 : index %c5 = constant 5 : index %c6 = constant 6 : index - + + %cast_data = memref_cast %data : memref<2x6xi32> to memref + call @mcuMemHostRegisterMemRef2dInt32(%cast_data) : (memref) -> () + %cast_sum = memref_cast %sum : memref<2xi32> to memref + call @mcuMemHostRegisterMemRef1dInt32(%cast_sum) : (memref) -> () + store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> store %cst2, %data[%c0, %c2] : memref<2x6xi32> @@ -54,5 +59,7 @@ func @main() { return } +func @mcuMemHostRegisterMemRef1dInt32(%ptr : memref) +func @mcuMemHostRegisterMemRef2dInt32(%ptr : memref) func @print_memref_i32(memref<*xi32>) diff --git 
a/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir b/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir index 153164128b16d..913088f1164ee 100644 --- a/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir +++ b/mlir/test/mlir-cuda-runner/all-reduce-xor.mlir @@ -23,7 +23,12 @@ func @main() { %c4 = constant 4 : index %c5 = constant 5 : index %c6 = constant 6 : index - + + %cast_data = memref_cast %data : memref<2x6xi32> to memref + call @mcuMemHostRegisterMemRef2dInt32(%cast_data) : (memref) -> () + %cast_sum = memref_cast %sum : memref<2xi32> to memref + call @mcuMemHostRegisterMemRef1dInt32(%cast_sum) : (memref) -> () + store %cst0, %data[%c0, %c0] : memref<2x6xi32> store %cst1, %data[%c0, %c1] : memref<2x6xi32> store %cst2, %data[%c0, %c2] : memref<2x6xi32> @@ -54,5 +59,7 @@ func @main() { return } +func @mcuMemHostRegisterMemRef1dInt32(%ptr : memref) +func @mcuMemHostRegisterMemRef2dInt32(%ptr : memref) func @print_memref_i32(memref<*xi32>) diff --git a/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir b/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir new file mode 100644 index 0000000000000..c989db9b1cb90 --- /dev/null +++ b/mlir/test/mlir-cuda-runner/multiple-all-reduce.mlir @@ -0,0 +1,73 @@ +// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s + +func @main() { + %data = alloc() : memref<2x6xf32> + %sum = alloc() : memref<2xf32> + %mul = alloc() : memref<2xf32> + %cst0 = constant 0.0 : f32 + %cst1 = constant 1.0 : f32 + %cst2 = constant 2.0 : f32 + %cst4 = constant 4.0 : f32 + %cst8 = constant 8.0 : f32 + %cst16 = constant 16.0 : f32 + + %cst3 = constant 3.0 : f32 + %cst6 = constant 6.0 : f32 + %cst7 = constant 7.0 : f32 + %cst10 = constant 10.0 : f32 + %cst11 = constant 11.0 : f32 + + %c0 = constant 0 : index + %c1 = constant 1 : index + %c2 = constant 2 : index + %c3 = constant 3 : index + %c4 = constant 4 : index + %c5 = constant 5 : index + %c6 = constant 6 : index + + %cast_data = memref_cast %data : memref<2x6xf32> to memref + call @mcuMemHostRegisterMemRef2dFloat(%cast_data) : (memref) -> () + %cast_sum = memref_cast %sum : memref<2xf32> to memref + call @mcuMemHostRegisterMemRef1dFloat(%cast_sum) : (memref) -> () + %cast_mul = memref_cast %mul : memref<2xf32> to memref + call @mcuMemHostRegisterMemRef1dFloat(%cast_mul) : (memref) -> () + + store %cst0, %data[%c0, %c0] : memref<2x6xf32> + store %cst1, %data[%c0, %c1] : memref<2x6xf32> + store %cst2, %data[%c0, %c2] : memref<2x6xf32> + store %cst4, %data[%c0, %c3] : memref<2x6xf32> + store %cst8, %data[%c0, %c4] : memref<2x6xf32> + store %cst16, %data[%c0, %c5] : memref<2x6xf32> + + store %cst2, %data[%c1, %c0] : memref<2x6xf32> + store %cst3, %data[%c1, %c1] : memref<2x6xf32> + store %cst6, %data[%c1, %c2] : memref<2x6xf32> + store %cst7, %data[%c1, %c3] : memref<2x6xf32> + store %cst10, %data[%c1, %c4] : memref<2x6xf32> + store %cst11, %data[%c1, %c5] : memref<2x6xf32> + + // ADD + MUL + gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1) + threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) { + %val = load %data[%bx, %tx] : memref<2x6xf32> + %reduced0 = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32) + store %reduced0, %sum[%bx] : memref<2xf32> + %reduced1 = "gpu.all_reduce"(%val) ({}) { op = "mul" } : (f32) -> (f32) + store %reduced1, %mul[%bx] : memref<2xf32> + gpu.terminator + } + + %ptr_sum = memref_cast %sum : memref<2xf32> 
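A condensed sketch of the gpu.all_reduce pattern the CUDA runner tests in this region exercise, with hypothetical function and buffer names; the mcuMemHostRegister* host-registration calls are omitted here because they depend on the runner's wrapper library:

```mlir
func @sum_rows(%data: memref<2x6xf32>, %sum: memref<2xf32>) {
  %c1 = constant 1 : index
  %c2 = constant 2 : index
  %c6 = constant 6 : index
  // One block per row, one thread per element; every thread in a block sees
  // the same row-wide "add" reduction result and stores it.
  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c2, %grid_y = %c1, %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %c6, %block_y = %c1, %block_z = %c1) {
    %val = load %data[%bx, %tx] : memref<2x6xf32>
    %reduced = "gpu.all_reduce"(%val) ({}) { op = "add" } : (f32) -> (f32)
    store %reduced, %sum[%bx] : memref<2xf32>
    gpu.terminator
  }
  return
}
```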
to memref<*xf32> + call @print_memref_f32(%ptr_sum) : (memref<*xf32>) -> () + // CHECK: [31, 39] + + %ptr_mul = memref_cast %mul : memref<2xf32> to memref<*xf32> + call @print_memref_f32(%ptr_mul) : (memref<*xf32>) -> () + // CHECK: [0, 27720] + + return +} + +func @mcuMemHostRegisterMemRef1dFloat(%ptr : memref) +func @mcuMemHostRegisterMemRef2dFloat(%ptr : memref) +func @print_memref_f32(memref<*xf32>) diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp index 350d9869373a7..9c191c5a1a4be 100644 --- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp +++ b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp @@ -15,6 +15,7 @@ #include #include +#include "llvm/ADT/ArrayRef.h" #include "llvm/Support/raw_ostream.h" #include "cuda.h" @@ -89,24 +90,39 @@ template struct MemRefType { // Allows to register a MemRef with the CUDA runtime. Initializes array with // value. Helpful until we have transfer functions implemented. -template -void mcuMemHostRegisterMemRef(const MemRefType *arg, T value) { - auto count = std::accumulate(arg->sizes, arg->sizes + N, 1, - std::multiplies()); - std::fill_n(arg->data, count, value); - mcuMemHostRegister(arg->data, count * sizeof(T)); +template +void mcuMemHostRegisterMemRef(T *pointer, llvm::ArrayRef sizes, + llvm::ArrayRef strides, T value) { + assert(sizes.size() == strides.size()); + llvm::SmallVector denseStrides(strides.size()); + + std::partial_sum(sizes.rbegin(), sizes.rend(), denseStrides.rbegin(), + std::multiplies()); + auto count = denseStrides.front(); + + // Only densely packed tensors are currently supported. + std::rotate(denseStrides.begin(), denseStrides.begin() + 1, + denseStrides.end()); + denseStrides.back() = 1; + assert(strides == llvm::makeArrayRef(denseStrides)); + + std::fill_n(pointer, count, value); + mcuMemHostRegister(pointer, count * sizeof(T)); } extern "C" void mcuMemHostRegisterMemRef1dFloat(float *allocated, float *aligned, int64_t offset, int64_t size, int64_t stride) { - MemRefType descriptor; - descriptor.basePtr = allocated; - descriptor.data = aligned; - descriptor.offset = offset; - descriptor.sizes[0] = size; - descriptor.strides[0] = stride; - mcuMemHostRegisterMemRef(&descriptor, 1.23f); + mcuMemHostRegisterMemRef(aligned + offset, {size}, {stride}, 1.23f); +} + +extern "C" void mcuMemHostRegisterMemRef2dFloat(float *allocated, + float *aligned, int64_t offset, + int64_t size0, int64_t size1, + int64_t stride0, + int64_t stride1) { + mcuMemHostRegisterMemRef(aligned + offset, {size0, size1}, {stride0, stride1}, + 1.23f); } extern "C" void mcuMemHostRegisterMemRef3dFloat(float *allocated, @@ -115,15 +131,31 @@ extern "C" void mcuMemHostRegisterMemRef3dFloat(float *allocated, int64_t size2, int64_t stride0, int64_t stride1, int64_t stride2) { - MemRefType descriptor; - descriptor.basePtr = allocated; - descriptor.data = aligned; - descriptor.offset = offset; - descriptor.sizes[0] = size0; - descriptor.strides[0] = stride0; - descriptor.sizes[1] = size1; - descriptor.strides[1] = stride1; - descriptor.sizes[2] = size2; - descriptor.strides[2] = stride2; - mcuMemHostRegisterMemRef(&descriptor, 1.23f); + mcuMemHostRegisterMemRef(aligned + offset, {size0, size1, size2}, + {stride0, stride1, stride2}, 1.23f); +} + +extern "C" void mcuMemHostRegisterMemRef1dInt32(int32_t *allocated, + int32_t *aligned, + int64_t offset, int64_t size, + int64_t stride) { + mcuMemHostRegisterMemRef(aligned + offset, {size}, {stride}, 123); +} + +extern "C" void 
mcuMemHostRegisterMemRef2dInt32(int32_t *allocated, + int32_t *aligned, + int64_t offset, int64_t size0, + int64_t size1, int64_t stride0, + int64_t stride1) { + mcuMemHostRegisterMemRef(aligned + offset, {size0, size1}, {stride0, stride1}, + 123); +} + +extern "C" void +mcuMemHostRegisterMemRef3dInt32(int32_t *allocated, int32_t *aligned, + int64_t offset, int64_t size0, int64_t size1, + int64_t size2, int64_t stride0, int64_t stride1, + int64_t stride2) { + mcuMemHostRegisterMemRef(aligned + offset, {size0, size1, size2}, + {stride0, stride1, stride2}, 123); } diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt index 9ade33aceabec..2ae36d466dd45 100644 --- a/mlir/tools/mlir-opt/CMakeLists.txt +++ b/mlir/tools/mlir-opt/CMakeLists.txt @@ -9,6 +9,7 @@ set(LIBS ${conversion_libs} MLIRLoopOpsTransforms MLIRLoopAnalysis + MLIRAffineTransformsTestPasses MLIRAnalysis MLIRDialect MLIREDSC diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index 443d951573e1e..ff0f49f987b60 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -10,7 +10,6 @@ // //===----------------------------------------------------------------------===// -#include "mlir/Analysis/Passes.h" #include "mlir/InitAllDialects.h" #include "mlir/InitAllPasses.h" #include "mlir/IR/Dialect.h" @@ -42,6 +41,7 @@ void registerTestAffineDataCopyPass(); void registerTestAllReduceLoweringPass(); void registerTestCallGraphPass(); void registerTestConstantFold(); +void registerTestConvertGPUKernelToCubinPass(); void registerTestFunc(); void registerTestGpuMemoryPromotionPass(); void registerTestLinalgTransforms(); @@ -97,6 +97,9 @@ void registerTestPasses() { registerTestAllReduceLoweringPass(); registerTestCallGraphPass(); registerTestConstantFold(); +#if MLIR_CUDA_CONVERSIONS_ENABLED + registerTestConvertGPUKernelToCubinPass(); +#endif registerTestFunc(); registerTestGpuMemoryPromotionPass(); registerTestLinalgTransforms(); @@ -112,14 +115,6 @@ void registerTestPasses() { registerTestVectorConversions(); registerTestVectorToLoopsPass(); registerVectorizerTestPass(); - - // The following passes are using global initializers, just link them in. - if (std::getenv("bar") != (char *)-1) - return; - - // TODO: move these to the test folder. - createTestMemRefBoundCheckPass(); - createTestMemRefDependenceCheckPass(); } static cl::opt diff --git a/mlir/tools/mlir-tblgen/OpDocGen.cpp b/mlir/tools/mlir-tblgen/OpDocGen.cpp index b8a00af10af20..a432b4a2f21c4 100644 --- a/mlir/tools/mlir-tblgen/OpDocGen.cpp +++ b/mlir/tools/mlir-tblgen/OpDocGen.cpp @@ -78,82 +78,140 @@ static void emitIfNotEmpty(StringRef str, raw_ostream &os) { } } -static void emitOpDocForDialect(const Dialect &dialect, - const std::vector &ops, - const std::vector &types, - raw_ostream &os) { - os << "# Dialect '" << dialect.getName() << "' definition\n\n"; - emitIfNotEmpty(dialect.getSummary(), os); - emitIfNotEmpty(dialect.getDescription(), os); - - // TODO(b/143543720) Generate TOC where extension is not supported. - os << "[TOC]\n\n"; +/// Emit the given named constraint. 
+template +static void emitNamedConstraint(const T &it, raw_ostream &os) { + if (!it.name.empty()) + os << "`" << it.name << "`"; + else + os << "«unnamed»"; + os << " | " << it.constraint.getDescription() << "\n"; +} - // TODO(antiagainst): Add link between use and def for types - if (!types.empty()) - os << "## Type definition\n\n"; - for (auto type : types) { - os << "### " << type.getDescription() << "\n"; - emitDescription(type.getTypeDescription(), os); - os << "\n"; - } +//===----------------------------------------------------------------------===// +// Operation Documentation +//===----------------------------------------------------------------------===// - if (!ops.empty()) - os << "## Operation definition\n\n"; - for (auto op : ops) { - os << "### " << op.getOperationName() << " (" << op.getQualCppClassName() - << ")"; - - // Emit summary & description of operator. - if (op.hasSummary()) - os << "\n" << op.getSummary() << "\n"; - os << "\n#### Description:\n\n"; - if (op.hasDescription()) - mlir::tblgen::emitDescription(op.getDescription(), os); - - // Emit operands & type of operand. All operands are numbered, some may be - // named too. - os << "\n#### Operands:\n\n"; - for (const auto &operand : op.getOperands()) { - os << "1. "; - if (!operand.name.empty()) - os << "`" << operand.name << "`: "; - else - os << "«unnamed»: "; - os << operand.constraint.getDescription() << "\n"; - } +/// Emit the assembly format of an operation. +static void emitAssemblyFormat(StringRef opName, StringRef format, + raw_ostream &os) { + os << "\nSyntax:\n\n```\noperation ::= `" << opName << "` "; + + // Print the assembly format aligned. + unsigned indent = strlen("operation ::= "); + std::pair split = format.split('\n'); + os << split.first.trim() << "\n"; + do { + split = split.second.split('\n'); + StringRef formatChunk = split.first.trim(); + if (!formatChunk.empty()) + os.indent(indent) << formatChunk << "\n"; + } while (!split.second.empty()); + os << "```\n\n"; +} - // Emit attributes. +static void emitOpDoc(Operator op, raw_ostream &os) { + os << llvm::formatv("### `{0}` ({1})\n", op.getOperationName(), + op.getQualCppClassName()); + + // Emit the summary, syntax, and description if present. + if (op.hasSummary()) + os << "\n" << op.getSummary() << "\n"; + if (op.hasAssemblyFormat()) + emitAssemblyFormat(op.getOperationName(), op.getAssemblyFormat().trim(), + os); + if (op.hasDescription()) + mlir::tblgen::emitDescription(op.getDescription(), os); + + // Emit attributes. + if (op.getNumAttributes() != 0) { // TODO: Attributes are only documented by TableGen name, with no further // info. This should be improved. os << "\n#### Attributes:\n\n"; - if (op.getNumAttributes() > 0) { - os << "| Attribute | MLIR Type | Description |\n" - << "| :-------: | :-------: | ----------- |\n"; - } - for (auto namedAttr : op.getAttributes()) { - os << "| `" << namedAttr.name << "` | `" - << namedAttr.attr.getStorageType() << "` | " - << namedAttr.attr.getDescription() << " attribute |\n"; + os << "| Attribute | MLIR Type | Description |\n" + << "| :-------: | :-------: | ----------- |\n"; + for (const auto &it : op.getAttributes()) { + StringRef storageType = it.attr.getStorageType(); + os << "`" << it.name << "` | " << storageType << " | " + << it.attr.getDescription() << "\n"; } + } - // Emit results. + // Emit each of the operands. 
+  if (op.getNumOperands() != 0) {
+    os << "\n#### Operands:\n\n";
+    os << "| Operand | Description |\n"
+       << "| :-----: | ----------- |\n";
+    for (const auto &it : op.getOperands())
+      emitNamedConstraint(it, os);
+  }
+
+  // Emit results.
+  if (op.getNumResults() != 0) {
     os << "\n#### Results:\n\n";
-    for (unsigned i = 0, e = op.getNumResults(); i < e; ++i) {
-      os << "1. ";
-      auto name = op.getResultName(i);
-      if (name.empty())
-        os << "«unnamed»: ";
-      else
-        os << "`" << name << "`: ";
-      os << op.getResultTypeConstraint(i).getDescription() << "\n";
-    }
+    os << "| Result | Description |\n"
+       << "| :----: | ----------- |\n";
+    for (const auto &it : op.getResults())
+      emitNamedConstraint(it, os);
+  }
-    os << "\n";
+  // Emit successors.
+  if (op.getNumSuccessors() != 0) {
+    os << "\n#### Successors:\n\n";
+    os << "| Successor | Description |\n"
+       << "| :-------: | ----------- |\n";
+    for (const auto &it : op.getSuccessors())
+      emitNamedConstraint(it, os);
   }
+
+  os << "\n";
 }
 
 static void emitOpDoc(const RecordKeeper &recordKeeper, raw_ostream &os) {
+  auto opDefs = recordKeeper.getAllDerivedDefinitions("Op");
+
+  os << "\n";
+  for (const llvm::Record *opDef : opDefs)
+    emitOpDoc(Operator(opDef), os);
+}
+
+//===----------------------------------------------------------------------===//
+// Type Documentation
+//===----------------------------------------------------------------------===//
+
+static void emitTypeDoc(const Type &type, raw_ostream &os) {
+  os << "### " << type.getDescription() << "\n";
+  emitDescription(type.getTypeDescription(), os);
+  os << "\n";
+}
+
+//===----------------------------------------------------------------------===//
+// Dialect Documentation
+//===----------------------------------------------------------------------===//
+
+static void emitDialectDoc(const Dialect &dialect, ArrayRef<Operator> ops,
+                           ArrayRef<Type> types, raw_ostream &os) {
+  os << "# '" << dialect.getName() << "' Dialect\n\n";
+  emitIfNotEmpty(dialect.getSummary(), os);
+  emitIfNotEmpty(dialect.getDescription(), os);
+
+  os << "[TOC]\n\n";
+
+  // TODO(antiagainst): Add link between use and def for types
+  if (!types.empty()) {
+    os << "## Type definition\n\n";
+    for (const Type &type : types)
+      emitTypeDoc(type, os);
+  }
+
+  if (!ops.empty()) {
+    os << "## Operation definition\n\n";
+    for (const Operator &op : ops)
+      emitOpDoc(op, os);
+  }
+}
+
+static void emitDialectDoc(const RecordKeeper &recordKeeper, raw_ostream &os) {
   const auto &opDefs = recordKeeper.getAllDerivedDefinitions("Op");
   const auto &typeDefs = recordKeeper.getAllDerivedDefinitions("DialectType");
@@ -171,13 +229,24 @@ static void emitOpDoc(const RecordKeeper &recordKeeper, raw_ostream &os) {
   os << "\n";
   for (auto dialectWithOps : dialectOps)
-    emitOpDocForDialect(dialectWithOps.first, dialectWithOps.second,
-                        dialectTypes[dialectWithOps.first], os);
+    emitDialectDoc(dialectWithOps.first, dialectWithOps.second,
+                   dialectTypes[dialectWithOps.first], os);
 }
+//===----------------------------------------------------------------------===//
+// Gen Registration
+//===----------------------------------------------------------------------===//
+
+static mlir::GenRegistration
+    genOpRegister("gen-op-doc", "Generate dialect documentation",
+                  [](const RecordKeeper &records, raw_ostream &os) {
+                    emitOpDoc(records, os);
+                    return false;
+                  });
+
 static mlir::GenRegistration
-    genRegister("gen-op-doc", "Generate operation documentation",
+    genRegister("gen-dialect-doc", "Generate dialect documentation",
                 [](const RecordKeeper &records, raw_ostream &os) {
-                  emitOpDoc(records, os);
+                  emitDialectDoc(records, os);
                   return false;
                 });
diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
index 3249d06bb764a..525981e03c86b 100644
--- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp
@@ -1830,19 +1830,13 @@ void mlir::tblgen::generateOpFormat(const Operator &constOp, OpClass &opClass) {
   // TODO(riverriddle) Operator doesn't expose all necessary functionality via
   // the const interface.
   Operator &op = const_cast<Operator &>(constOp);
-
-  // Check if the operation specified the format field.
-  StringRef formatStr;
-  TypeSwitch<llvm::Init *>(op.getDef().getValueInit("assemblyFormat"))
-      .Case<llvm::StringInit, llvm::CodeInit>(
-          [&](auto *init) { formatStr = init->getValue(); });
-  if (formatStr.empty())
+  if (!op.hasAssemblyFormat())
     return;
 
   // Parse the format description.
   llvm::SourceMgr mgr;
-  mgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(formatStr),
-                         llvm::SMLoc());
+  mgr.AddNewSourceBuffer(
+      llvm::MemoryBuffer::getMemBuffer(op.getAssemblyFormat()), llvm::SMLoc());
   OperationFormat format(op);
   if (failed(FormatParser(mgr, format, op).parse())) {
     // Exit the process if format errors are treated as fatal.
diff --git a/mlir/tools/mlir-translate/CMakeLists.txt b/mlir/tools/mlir-translate/CMakeLists.txt
index d665789e5bd0b..bf7a92509912a 100644
--- a/mlir/tools/mlir-translate/CMakeLists.txt
+++ b/mlir/tools/mlir-translate/CMakeLists.txt
@@ -5,6 +5,7 @@ set(LIBS
   MLIRPass
   MLIRSPIRV
   MLIRSPIRVSerialization
+  MLIRTargetAVX512
   MLIRTargetLLVMIR
   MLIRTargetNVVMIR
   MLIRTargetROCDLIR
@@ -13,6 +14,7 @@ set(LIBS
 )
 set(FULL_LIBS
   MLIRSPIRVSerialization
+  MLIRTargetAVX512
   MLIRTargetLLVMIR
   MLIRTargetNVVMIR
   MLIRTargetROCDLIR
diff --git a/mlir/tools/mlir-vulkan-runner/VulkanRuntime.h b/mlir/tools/mlir-vulkan-runner/VulkanRuntime.h
index 9c63714306b97..cf266f9349bd2 100644
--- a/mlir/tools/mlir-vulkan-runner/VulkanRuntime.h
+++ b/mlir/tools/mlir-vulkan-runner/VulkanRuntime.h
@@ -13,7 +13,6 @@
 #ifndef VULKAN_RUNTIME_H
 #define VULKAN_RUNTIME_H
 
-#include "mlir/Analysis/Passes.h"
 #include "mlir/Dialect/SPIRV/SPIRVOps.h"
 #include "mlir/Dialect/SPIRV/Serialization.h"
 #include "mlir/IR/Module.h"
diff --git a/openmp/README.rst b/openmp/README.rst
index 7f747caf9abc7..55342e4dbac51 100644
--- a/openmp/README.rst
+++ b/openmp/README.rst
@@ -4,7 +4,7 @@ How to Build the LLVM* OpenMP* Libraries
 This repository requires `CMake `_ v2.8.0 or later. LLVM and Clang need a
 more recent version which also applies for in-tree builds. For more
 information than available in this document please see
-`LLVM's CMake documentation `_ and the
+`LLVM's CMake documentation `_ and the
 `official documentation `_.
 
 .. contents::
diff --git a/openmp/libomptarget/deviceRTLs/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/CMakeLists.txt
index 8bbf987aaf205..3df94eac0727e 100644
--- a/openmp/libomptarget/deviceRTLs/CMakeLists.txt
+++ b/openmp/libomptarget/deviceRTLs/CMakeLists.txt
@@ -10,5 +10,4 @@
 #
 ##===----------------------------------------------------------------------===##
 
-add_subdirectory(amdgcn)
 add_subdirectory(nvptx)
diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
index 7f6ee2e39c7d0..427c90a7e0913 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu
@@ -230,7 +230,7 @@ static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
                    : /*Master thread only*/ 1;
   uint32_t TeamId = GetBlockIdInKernel();
   uint32_t NumTeams = GetNumberOfBlocksInKernel();
-  SHARED volatile bool IsLastTeam;
+  static SHARED volatile bool IsLastTeam;
 
   // Team masters of all teams write to the scratchpad.
   if (ThreadId == 0) {
@@ -423,8 +423,8 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
                    : /*Master thread only*/ 1;
   uint32_t TeamId = GetBlockIdInKernel();
   uint32_t NumTeams = GetNumberOfBlocksInKernel();
-  SHARED unsigned Bound;
-  SHARED unsigned ChunkTeamCount;
+  static SHARED unsigned Bound;
+  static SHARED unsigned ChunkTeamCount;
 
   // Block progress for teams greater than the current upper
   // limit. We always only allow a number of teams less or equal
diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
index 06c3dd94878f0..54bcdf26e9e6b 100644
--- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt
+++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt
@@ -1,16 +1,16 @@
 ##===----------------------------------------------------------------------===##
-# 
+#
 # Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-# 
+#
 ##===----------------------------------------------------------------------===##
 #
 # Build a plugin for a CUDA machine if available.
 #
 ##===----------------------------------------------------------------------===##
 
-if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
-  libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64 or ppc64le hosts.")
+if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
+  libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.")
   return()
 elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND)
   libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.")
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 15ffc1454fe94..6dae9dc73e7a9 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -3923,9 +3923,12 @@ kmp_task_t *__kmp_task_dup_alloc(kmp_info_t *thread, kmp_task_t *task_src) {
   }
   taskdata->td_alloc_thread = thread;
   taskdata->td_parent = parent_task;
-  taskdata->td_taskgroup =
-      parent_task
-          ->td_taskgroup; // task inherits the taskgroup from the parent task
+  // task inherits the taskgroup from the parent task
+  taskdata->td_taskgroup = parent_task->td_taskgroup;
+  // tied task needs to initialize the td_last_tied at creation,
+  // untied one does this when it is scheduled for execution
+  if (taskdata->td_flags.tiedness == TASK_TIED)
+    taskdata->td_last_tied = taskdata;
 
   // Only need to keep track of child task counts if team parallel and tasking
   // not serialized
diff --git a/openmp/runtime/test/tasking/omp_task_red_taskloop.c b/openmp/runtime/test/tasking/omp_task_red_taskloop.c
new file mode 100644
index 0000000000000..89c66256da736
--- /dev/null
+++ b/openmp/runtime/test/tasking/omp_task_red_taskloop.c
@@ -0,0 +1,57 @@
+// RUN: %libomp-compile-and-run
+
+#include <stdio.h>
+#include <omp.h>
+
+int r;
+
+int work(int k, int l)
+{
+  return k + l + 1;
+}
+void bar(int i) {
+  #pragma omp taskgroup task_reduction(+:r)
+  { int th_gen = omp_get_thread_num();
+    #pragma omp task in_reduction(+:r) firstprivate(i, th_gen)
+    {
+      r += work(i, 0);
+printf("executing task (%d, 0), th %d (gen by th %d)\n", i, omp_get_thread_num(), th_gen);
+    }
+    #pragma omp task in_reduction(+:r) firstprivate(i, th_gen)
+    {
+      r += work(i, 1);
+printf("executing task (%d, 1), th %d (gen by th %d)\n", i, omp_get_thread_num(), th_gen);
+    }
+  }
+}
+int foo() {
+  int i;
+  int th_gen = omp_get_thread_num();
+  #pragma omp taskgroup task_reduction(+:r)
+  {
+    bar(0);
+  }
+printf("th %d passed bar0\n", th_gen);
+  #pragma omp taskloop reduction(+:r) firstprivate(th_gen)
+  for (i = 1; i < 4; ++i) {
+    bar(i);
+printf("th %d (gen by th %d) passed bar%d in taskloop\n", omp_get_thread_num(), th_gen, i);
+//    #pragma omp task in_reduction(+:r)
+    r += i;
+  }
+  return 0;
+}
+// res = 2*((1+2)+(2+3)+(3+4)+(4+5)+1+2+3) = 60
+#define res 60
+int main()
+{
+  r = 0;
+  #pragma omp parallel num_threads(2)
+  foo();
+  if (r == res) {
+    return 0;
+  } else {
+    printf("error r = %d (!= %d)\n", r, res);
+    return 1;
+  }
+}
diff --git a/polly/docs/TipsAndTricks.rst b/polly/docs/TipsAndTricks.rst
index 146da0406604e..38df79a996662 100644
--- a/polly/docs/TipsAndTricks.rst
+++ b/polly/docs/TipsAndTricks.rst
@@ -21,7 +21,7 @@ Using bugpoint to track down errors in large files
     - ``$ bugpoint crash.ll -polly-codegen -opt-args -polly-canonicalize -polly-process-unprofitable``
 
-   For more documentation on bugpoint, `Visit the LLVM manual `_
+   For more documentation on bugpoint, `Visit the LLVM manual `_
 
 Understanding which pass makes a particular change
diff --git a/sycl/include/CL/sycl/detail/kernel_desc.hpp b/sycl/include/CL/sycl/detail/kernel_desc.hpp
index 63da32fe9f9a7..4bd64ed4e699e 100644
--- a/sycl/include/CL/sycl/detail/kernel_desc.hpp
+++ b/sycl/include/CL/sycl/detail/kernel_desc.hpp
@@ -74,7 +74,7 @@ using make_index_sequence = __make_integer_seq;
 template struct KernelInfoImpl {
 private:
-  static constexpr auto n = __unique_stable_name(T);
+  static constexpr auto n = __builtin_unique_stable_name(T);
   template static KernelInfoData impl(index_sequence) {
     return {};
diff --git a/sycl/include/CL/sycl/experimental/spec_constant.hpp b/sycl/include/CL/sycl/experimental/spec_constant.hpp
index 91b72d1d9a6e9..eff35a8a928ab 100644
--- a/sycl/include/CL/sycl/experimental/spec_constant.hpp
+++ b/sycl/include/CL/sycl/experimental/spec_constant.hpp
@@ -42,7 +42,7 @@ template class spec_constant {
 public:
   T get() const { // explicit access.
 #ifdef __SYCL_DEVICE_ONLY__
-    const char *TName = __unique_stable_name(ID);
+    const char *TName = __builtin_unique_stable_name(ID);
     return __sycl_getSpecConstantValue(TName);
 #else
     return Val;